llama-model.cpp 883 KB

430914310143111431214313143141431514316143171431814319143201432114322143231432414325143261432714328143291433014331143321433314334143351433614337143381433914340143411434214343143441434514346143471434814349143501435114352143531435414355143561435714358143591436014361143621436314364143651436614367143681436914370143711437214373143741437514376143771437814379143801438114382143831438414385143861438714388143891439014391143921439314394143951439614397143981439914400144011440214403144041440514406144071440814409144101441114412144131441414415144161441714418144191442014421144221442314424144251442614427144281442914430144311443214433144341443514436144371443814439144401444114442144431444414445144461444714448144491445014451144521445314454144551445614457144581445914460144611446214463144641446514466144671446814469144701447114472144731447414475144761447714478144791448014481144821448314484144851448614487144881448914490144911449214493144941449514496144971449814499145001450114502145031450414505145061450714508145091451014511145121451314514145151451614517145181451914520145211452214523145241452514526145271452814529145301453114532145331453414535145361453714538145391454014541145421454314544145451454614547145481454914550145511455214553145541455514556145571455814559145601456114562145631456414565145661456714568145691457014571145721457314574145751457614577145781457914580145811458214583145841458514586145871458814589145901459114592145931459414595145961459714598145991460014601146021460314604146051460614607146081460914610146111461214613146141461514616146171461814619146201462114622146231462414625146261462714628146291463014631146321463314634146351463614637146381463914640146411464214643146441464514646146471464814649146501465114652146531465414655146561465714658146591466014661146621466314664146651466614667146681466914670146711467214673146741467514676146771467814679146801468114682146831468414685146861468714688146891469014691146921469314694146951469614697146981469914700147011470214703147041470514706147071470814709147101471114712147131471414715147161471714718147191472014721147221472314724147251472614727147281472914730147311473214733147341473514736147371473814739147401474114742147431474414745147461474714748147491475014751147521475314754147551475614757147581475914760147611476214763147641476514766147671476814769147701477114772147731477414775147761477714778147791478014781147821478314784147851478614787147881478914790147911479214793147941479514796147971479814799148001480114802148031480414805148061480714808148091481014811148121481314814148151481614817148181481914820148211482214823148241482514826148271482814829148301483114832148331483414835148361483714838148391484014841148421484314844148451484614847148481484914850148511485214853148541485514856148571485814859148601486114862148631486414865148661486714868148691487014871148721487314874148751487614877148781487914880148811488214883148841488514886148871488814889148901489114892148931489414895148961489714898148991490014901149021490314904149051490614907149081490914910149111491214913149141491514916149171491814919149201492114922149231492414925149261492714928149291493014931149321493314934149351493614937149381493914940149411494214943149441494514946149471494814949149501495114952149531495414955149561495714958149591496014961149621496314964149651496614967149681496914970149711497214973149741497514976149771497814979149801498114982149831498414985149861498714988149891499014991149921499314994149951499614997149981499915000150011500215003150041500515006150071500815009150101501115012150131501415015150161501715018150191
502015021150221502315024150251502615027150281502915030150311503215033150341503515036150371503815039150401504115042150431504415045150461504715048150491505015051150521505315054150551505615057150581505915060150611506215063150641506515066150671506815069150701507115072150731507415075150761507715078150791508015081150821508315084150851508615087150881508915090150911509215093150941509515096150971509815099151001510115102151031510415105151061510715108151091511015111151121511315114151151511615117151181511915120151211512215123151241512515126151271512815129151301513115132151331513415135151361513715138151391514015141151421514315144151451514615147151481514915150151511515215153151541515515156151571515815159151601516115162151631516415165151661516715168151691517015171151721517315174151751517615177151781517915180151811518215183151841518515186151871518815189151901519115192151931519415195151961519715198151991520015201152021520315204152051520615207152081520915210152111521215213152141521515216152171521815219152201522115222152231522415225152261522715228152291523015231152321523315234152351523615237152381523915240152411524215243152441524515246152471524815249152501525115252152531525415255152561525715258152591526015261152621526315264152651526615267152681526915270152711527215273152741527515276152771527815279152801528115282152831528415285152861528715288152891529015291152921529315294152951529615297152981529915300153011530215303153041530515306153071530815309153101531115312153131531415315153161531715318153191532015321153221532315324153251532615327153281532915330153311533215333153341533515336153371533815339153401534115342153431534415345153461534715348153491535015351153521535315354153551535615357153581535915360153611536215363153641536515366153671536815369153701537115372153731537415375153761537715378153791538015381153821538315384153851538615387153881538915390153911539215393153941539515396153971539815399154001540115402154031540415405154061540715408154091541015411154121541315414154151541615417154181541915420154211542215423154241542515426154271542815429154301543115432154331543415435154361543715438154391544015441154421544315444154451544615447154481544915450154511545215453154541545515456154571545815459154601546115462154631546415465154661546715468154691547015471154721547315474154751547615477154781547915480154811548215483154841548515486154871548815489154901549115492154931549415495154961549715498154991550015501155021550315504155051550615507155081550915510155111551215513155141551515516155171551815519155201552115522155231552415525155261552715528155291553015531155321553315534155351553615537155381553915540155411554215543155441554515546155471554815549155501555115552155531555415555155561555715558155591556015561155621556315564155651556615567155681556915570155711557215573155741557515576155771557815579155801558115582155831558415585155861558715588155891559015591155921559315594155951559615597155981559915600156011560215603156041560515606156071560815609156101561115612156131561415615156161561715618156191562015621156221562315624156251562615627156281562915630156311563215633156341563515636156371563815639156401564115642156431564415645156461564715648156491565015651156521565315654156551565615657156581565915660156611566215663156641566515666156671566815669156701567115672156731567415675156761567715678156791568015681156821568315684156851568615687156881568915690156911569215693156941569515696156971569815699157001570115702157031570415705157061570715708157091571015711157121571315714157151571615717157181571915720157211572215723157241572515726157271572815729157301
573115732157331573415735157361573715738157391574015741157421574315744157451574615747157481574915750157511575215753157541575515756157571575815759157601576115762157631576415765157661576715768157691577015771157721577315774157751577615777157781577915780157811578215783157841578515786157871578815789157901579115792157931579415795157961579715798157991580015801158021580315804158051580615807158081580915810158111581215813158141581515816158171581815819158201582115822158231582415825158261582715828158291583015831158321583315834158351583615837158381583915840158411584215843158441584515846158471584815849158501585115852158531585415855158561585715858158591586015861158621586315864158651586615867158681586915870158711587215873158741587515876158771587815879158801588115882158831588415885158861588715888158891589015891158921589315894158951589615897158981589915900159011590215903159041590515906159071590815909159101591115912159131591415915159161591715918159191592015921159221592315924159251592615927159281592915930159311593215933159341593515936159371593815939159401594115942159431594415945159461594715948159491595015951159521595315954159551595615957159581595915960159611596215963159641596515966159671596815969159701597115972159731597415975159761597715978159791598015981159821598315984159851598615987159881598915990159911599215993159941599515996159971599815999160001600116002160031600416005160061600716008160091601016011160121601316014160151601616017160181601916020160211602216023160241602516026160271602816029160301603116032160331603416035160361603716038160391604016041160421604316044160451604616047160481604916050160511605216053160541605516056160571605816059160601606116062160631606416065160661606716068160691607016071160721607316074160751607616077160781607916080160811608216083160841608516086160871608816089160901609116092160931609416095160961609716098160991610016101161021610316104161051610616107161081610916110161111611216113161141611516116161171611816119161201612116122161231612416125161261612716128161291613016131161321613316134161351613616137161381613916140161411614216143161441614516146161471614816149161501615116152161531615416155161561615716158161591616016161161621616316164161651616616167161681616916170161711617216173161741617516176161771617816179161801618116182161831618416185161861618716188161891619016191161921619316194161951619616197161981619916200162011620216203162041620516206162071620816209162101621116212162131621416215162161621716218162191622016221162221622316224162251622616227162281622916230162311623216233162341623516236162371623816239162401624116242162431624416245162461624716248162491625016251162521625316254162551625616257162581625916260162611626216263162641626516266162671626816269162701627116272162731627416275162761627716278162791628016281162821628316284162851628616287162881628916290162911629216293162941629516296162971629816299163001630116302163031630416305163061630716308163091631016311163121631316314163151631616317163181631916320163211632216323163241632516326163271632816329163301633116332163331633416335163361633716338163391634016341163421634316344163451634616347163481634916350163511635216353163541635516356163571635816359163601636116362163631636416365163661636716368163691637016371163721637316374163751637616377163781637916380163811638216383163841638516386163871638816389163901639116392163931639416395163961639716398163991640016401164021640316404164051640616407164081640916410164111641216413164141641516416164171641816419164201642116422164231642416425164261642716428164291643016431164321643316434164351643616437164381643916440164411
644216443164441644516446164471644816449164501645116452164531645416455164561645716458164591646016461164621646316464164651646616467164681646916470164711647216473164741647516476164771647816479164801648116482164831648416485164861648716488164891649016491164921649316494164951649616497164981649916500165011650216503165041650516506165071650816509165101651116512165131651416515165161651716518165191652016521165221652316524165251652616527165281652916530165311653216533165341653516536165371653816539165401654116542165431654416545165461654716548165491655016551165521655316554165551655616557165581655916560165611656216563165641656516566165671656816569165701657116572165731657416575165761657716578165791658016581165821658316584165851658616587165881658916590165911659216593165941659516596165971659816599166001660116602166031660416605166061660716608166091661016611166121661316614166151661616617166181661916620166211662216623166241662516626166271662816629166301663116632166331663416635166361663716638166391664016641166421664316644166451664616647166481664916650166511665216653166541665516656166571665816659166601666116662166631666416665166661666716668166691667016671166721667316674166751667616677166781667916680166811668216683166841668516686166871668816689166901669116692166931669416695166961669716698166991670016701167021670316704167051670616707167081670916710167111671216713167141671516716167171671816719167201672116722167231672416725167261672716728167291673016731167321673316734167351673616737167381673916740167411674216743167441674516746167471674816749167501675116752167531675416755167561675716758167591676016761167621676316764167651676616767167681676916770167711677216773167741677516776167771677816779167801678116782167831678416785167861678716788167891679016791167921679316794167951679616797167981679916800168011680216803168041680516806168071680816809168101681116812168131681416815168161681716818168191682016821168221682316824168251682616827168281682916830168311683216833168341683516836168371683816839168401684116842168431684416845168461684716848168491685016851168521685316854168551685616857168581685916860168611686216863168641686516866168671686816869168701687116872168731687416875168761687716878168791688016881168821688316884168851688616887168881688916890168911689216893168941689516896168971689816899169001690116902169031690416905169061690716908169091691016911169121691316914169151691616917169181691916920169211692216923169241692516926169271692816929169301693116932169331693416935169361693716938169391694016941169421694316944169451694616947169481694916950169511695216953169541695516956169571695816959169601696116962169631696416965169661696716968169691697016971169721697316974169751697616977169781697916980169811698216983169841698516986169871698816989169901699116992169931699416995169961699716998169991700017001170021700317004170051700617007170081700917010170111701217013170141701517016170171701817019170201702117022170231702417025170261702717028170291703017031170321703317034170351703617037170381703917040170411704217043170441704517046170471704817049170501705117052170531705417055170561705717058170591706017061170621706317064170651706617067170681706917070170711707217073170741707517076170771707817079170801708117082170831708417085170861708717088170891709017091170921709317094170951709617097170981709917100171011710217103171041710517106171071710817109171101711117112171131711417115171161711717118171191712017121171221712317124171251712617127171281712917130171311713217133171341713517136171371713817139171401714117142171431714417145171461714717148171491715017151171521
715317154171551715617157171581715917160171611716217163171641716517166171671716817169171701717117172171731717417175171761717717178171791718017181171821718317184171851718617187171881718917190171911719217193171941719517196171971719817199172001720117202172031720417205172061720717208172091721017211172121721317214172151721617217172181721917220172211722217223172241722517226172271722817229172301723117232172331723417235172361723717238172391724017241172421724317244172451724617247172481724917250172511725217253172541725517256172571725817259172601726117262172631726417265172661726717268172691727017271172721727317274172751727617277172781727917280172811728217283172841728517286172871728817289172901729117292172931729417295172961729717298172991730017301173021730317304173051730617307173081730917310173111731217313173141731517316173171731817319173201732117322173231732417325173261732717328173291733017331173321733317334173351733617337173381733917340173411734217343173441734517346173471734817349173501735117352173531735417355173561735717358173591736017361173621736317364173651736617367173681736917370173711737217373173741737517376173771737817379173801738117382173831738417385173861738717388173891739017391173921739317394173951739617397173981739917400174011740217403174041740517406174071740817409174101741117412174131741417415174161741717418174191742017421174221742317424174251742617427174281742917430174311743217433174341743517436174371743817439174401744117442174431744417445174461744717448174491745017451174521745317454174551745617457174581745917460174611746217463174641746517466174671746817469174701747117472174731747417475174761747717478174791748017481174821748317484174851748617487174881748917490174911749217493174941749517496174971749817499175001750117502175031750417505175061750717508175091751017511175121751317514175151751617517175181751917520175211752217523175241752517526175271752817529175301753117532175331753417535175361753717538175391754017541175421754317544175451754617547175481754917550175511755217553175541755517556175571755817559175601756117562175631756417565175661756717568175691757017571175721757317574175751757617577175781757917580175811758217583175841758517586175871758817589175901759117592175931759417595175961759717598175991760017601176021760317604176051760617607176081760917610176111761217613176141761517616176171761817619176201762117622176231762417625176261762717628176291763017631176321763317634176351763617637176381763917640176411764217643176441764517646176471764817649176501765117652176531765417655176561765717658176591766017661176621766317664176651766617667176681766917670176711767217673176741767517676176771767817679176801768117682176831768417685176861768717688176891769017691176921769317694176951769617697176981769917700177011770217703177041770517706177071770817709177101771117712177131771417715177161771717718177191772017721177221772317724177251772617727177281772917730177311773217733177341773517736177371773817739177401774117742177431774417745177461774717748177491775017751177521775317754177551775617757177581775917760177611776217763177641776517766177671776817769177701777117772177731777417775177761777717778177791778017781177821778317784177851778617787177881778917790177911779217793177941779517796177971779817799178001780117802178031780417805178061780717808178091781017811178121781317814178151781617817178181781917820178211782217823178241782517826178271782817829178301783117832178331783417835178361783717838178391784017841178421784317844178451784617847178481784917850178511785217853178541785517856178571785817859178601786117862178631
786417865178661786717868178691787017871178721787317874178751787617877178781787917880178811788217883178841788517886178871788817889178901789117892178931789417895178961789717898178991790017901179021790317904179051790617907179081790917910179111791217913179141791517916179171791817919179201792117922179231792417925179261792717928179291793017931179321793317934179351793617937179381793917940179411794217943179441794517946179471794817949179501795117952179531795417955179561795717958179591796017961179621796317964179651796617967179681796917970179711797217973179741797517976179771797817979179801798117982179831798417985179861798717988179891799017991179921799317994179951799617997179981799918000180011800218003180041800518006180071800818009180101801118012180131801418015180161801718018180191802018021180221802318024180251802618027180281802918030180311803218033180341803518036180371803818039180401804118042180431804418045180461804718048180491805018051180521805318054180551805618057180581805918060180611806218063180641806518066180671806818069180701807118072180731807418075180761807718078180791808018081180821808318084180851808618087180881808918090180911809218093180941809518096180971809818099181001810118102181031810418105181061810718108181091811018111181121811318114181151811618117181181811918120181211812218123181241812518126181271812818129181301813118132181331813418135181361813718138181391814018141181421814318144181451814618147181481814918150181511815218153181541815518156181571815818159181601816118162181631816418165181661816718168181691817018171181721817318174181751817618177181781817918180181811818218183181841818518186181871818818189181901819118192181931819418195181961819718198181991820018201182021820318204182051820618207182081820918210182111821218213182141821518216182171821818219182201822118222182231822418225182261822718228182291823018231182321823318234182351823618237182381823918240182411824218243182441824518246182471824818249182501825118252182531825418255182561825718258182591826018261182621826318264182651826618267182681826918270182711827218273182741827518276182771827818279182801828118282182831828418285182861828718288182891829018291182921829318294182951829618297182981829918300183011830218303183041830518306183071830818309183101831118312183131831418315183161831718318183191832018321183221832318324183251832618327183281832918330183311833218333183341833518336183371833818339183401834118342183431834418345183461834718348183491835018351183521835318354183551835618357183581835918360183611836218363183641836518366183671836818369183701837118372183731837418375183761837718378183791838018381183821838318384183851838618387183881838918390183911839218393183941839518396183971839818399184001840118402184031840418405184061840718408184091841018411184121841318414184151841618417184181841918420184211842218423184241842518426184271842818429184301843118432184331843418435184361843718438184391844018441184421844318444184451844618447184481844918450184511845218453184541845518456184571845818459184601846118462184631846418465184661846718468184691847018471184721847318474184751847618477184781847918480184811848218483184841848518486184871848818489184901849118492184931849418495184961849718498184991850018501185021850318504185051850618507185081850918510185111851218513185141851518516185171851818519185201852118522185231852418525185261852718528185291853018531185321853318534185351853618537185381853918540185411854218543185441854518546185471854818549185501855118552185531855418555185561855718558185591856018561185621856318564185651856618567185681856918570185711857218573185741
857518576185771857818579185801858118582185831858418585185861858718588185891859018591185921859318594185951859618597185981859918600186011860218603186041860518606186071860818609186101861118612186131861418615186161861718618186191862018621186221862318624186251862618627186281862918630186311863218633186341863518636186371863818639186401864118642186431864418645186461864718648186491865018651186521865318654186551865618657186581865918660186611866218663186641866518666186671866818669186701867118672186731867418675186761867718678186791868018681186821868318684186851868618687186881868918690186911869218693186941869518696186971869818699187001870118702187031870418705187061870718708187091871018711187121871318714187151871618717187181871918720187211872218723187241872518726187271872818729187301873118732187331873418735187361873718738187391874018741187421874318744187451874618747187481874918750187511875218753187541875518756187571875818759187601876118762187631876418765187661876718768187691877018771187721877318774187751877618777187781877918780187811878218783187841878518786187871878818789187901879118792187931879418795187961879718798187991880018801188021880318804188051880618807188081880918810188111881218813188141881518816188171881818819188201882118822188231882418825188261882718828188291883018831188321883318834188351883618837188381883918840188411884218843188441884518846188471884818849188501885118852188531885418855188561885718858188591886018861188621886318864188651886618867188681886918870188711887218873188741887518876188771887818879188801888118882188831888418885188861888718888188891889018891188921889318894188951889618897188981889918900189011890218903189041890518906189071890818909189101891118912189131891418915189161891718918189191892018921189221892318924189251892618927189281892918930189311893218933189341893518936189371893818939189401894118942189431894418945189461894718948189491895018951189521895318954189551895618957189581895918960189611896218963189641896518966189671896818969189701897118972189731897418975189761897718978189791898018981189821898318984189851898618987189881898918990189911899218993189941899518996189971899818999190001900119002190031900419005190061900719008190091901019011190121901319014190151901619017190181901919020190211902219023190241902519026190271902819029190301903119032190331903419035190361903719038190391904019041190421904319044190451904619047190481904919050190511905219053190541905519056190571905819059190601906119062190631906419065190661906719068190691907019071190721907319074190751907619077190781907919080190811908219083190841908519086190871908819089190901909119092190931909419095190961909719098190991910019101191021910319104191051910619107191081910919110191111911219113191141911519116191171911819119191201912119122191231912419125191261912719128191291913019131191321913319134191351913619137191381913919140191411914219143191441914519146191471914819149191501915119152191531915419155191561915719158191591916019161191621916319164191651916619167191681916919170191711917219173191741917519176191771917819179191801918119182191831918419185191861918719188191891919019191191921919319194191951919619197191981919919200192011920219203192041920519206192071920819209192101921119212192131921419215192161921719218192191922019221192221922319224192251922619227192281922919230192311923219233192341923519236192371923819239192401924119242192431924419245192461924719248192491925019251192521925319254192551925619257192581925919260192611926219263192641926519266192671926819269192701927119272192731927419275192761927719278192791928019281192821928319284192851
928619287192881928919290192911929219293192941929519296192971929819299193001930119302193031930419305193061930719308193091931019311193121931319314193151931619317193181931919320193211932219323193241932519326193271932819329193301933119332193331933419335193361933719338193391934019341193421934319344193451934619347193481934919350193511935219353193541935519356193571935819359193601936119362193631936419365193661936719368193691937019371193721937319374193751937619377193781937919380193811938219383193841938519386193871938819389193901939119392193931939419395193961939719398193991940019401194021940319404194051940619407194081940919410194111941219413194141941519416194171941819419194201942119422194231942419425194261942719428194291943019431194321943319434194351943619437194381943919440194411944219443194441944519446194471944819449194501945119452194531945419455194561945719458194591946019461194621946319464194651946619467194681946919470194711947219473194741947519476194771947819479194801948119482194831948419485194861948719488194891949019491194921949319494194951949619497194981949919500195011950219503195041950519506195071950819509195101951119512195131951419515195161951719518195191952019521195221952319524195251952619527195281952919530195311953219533195341953519536195371953819539195401954119542195431954419545195461954719548195491955019551195521955319554195551955619557195581955919560195611956219563195641956519566195671956819569195701957119572195731957419575195761957719578195791958019581195821958319584195851958619587195881958919590195911959219593195941959519596195971959819599196001960119602196031960419605196061960719608196091961019611196121961319614196151961619617196181961919620196211962219623196241962519626196271962819629196301963119632196331963419635196361963719638196391964019641196421964319644196451964619647196481964919650196511965219653196541965519656196571965819659196601966119662196631966419665196661966719668196691967019671196721967319674196751967619677196781967919680196811968219683196841968519686196871968819689196901969119692196931969419695196961969719698196991970019701197021970319704197051970619707197081970919710197111971219713197141971519716197171971819719197201972119722197231972419725197261972719728197291973019731197321973319734197351973619737197381973919740197411974219743197441974519746197471974819749197501975119752197531975419755197561975719758197591976019761197621976319764197651976619767197681976919770197711977219773197741977519776197771977819779197801978119782197831978419785197861978719788197891979019791197921979319794197951979619797197981979919800198011980219803198041980519806198071980819809198101981119812198131981419815198161981719818198191982019821198221982319824198251982619827198281982919830198311983219833198341983519836198371983819839198401984119842198431984419845198461984719848198491985019851198521985319854198551985619857198581985919860198611986219863198641986519866198671986819869198701987119872198731987419875198761987719878198791988019881198821988319884198851988619887198881988919890198911989219893198941989519896198971989819899199001990119902199031990419905199061990719908199091991019911199121991319914199151991619917199181991919920199211992219923199241992519926199271992819929199301993119932199331993419935199361993719938199391994019941199421994319944199451994619947199481994919950199511995219953199541995519956199571995819959199601996119962199631996419965199661996719968199691997019971199721997319974199751997619977199781997919980199811998219983199841998519986199871998819989199901999119992199931999419995199961
99971999819999200002000120002200032000420005200062000720008200092001020011200122001320014200152001620017200182001920020200212002220023200242002520026200272002820029200302003120032200332003420035200362003720038200392004020041200422004320044200452004620047200482004920050200512005220053200542005520056200572005820059200602006120062200632006420065200662006720068200692007020071200722007320074200752007620077200782007920080200812008220083200842008520086200872008820089200902009120092200932009420095200962009720098200992010020101201022010320104201052010620107201082010920110201112011220113201142011520116201172011820119201202012120122201232012420125201262012720128201292013020131201322013320134201352013620137201382013920140201412014220143201442014520146201472014820149201502015120152201532015420155201562015720158201592016020161201622016320164201652016620167201682016920170201712017220173201742017520176201772017820179201802018120182201832018420185201862018720188201892019020191201922019320194201952019620197201982019920200202012020220203202042020520206202072020820209202102021120212202132021420215202162021720218202192022020221202222022320224202252022620227202282022920230202312023220233202342023520236202372023820239202402024120242202432024420245202462024720248202492025020251202522025320254
  1. #include "llama-model.h"
  2. #include "llama-impl.h"
  3. #include "llama-mmap.h"
  4. #include "llama-batch.h"
  5. #include "llama-cparams.h"
  6. #include "llama-model-loader.h"
  7. #include "llama-kv-cache.h"
  8. #include "llama-kv-cache-iswa.h"
  9. #include "llama-memory-hybrid.h"
  10. #include "llama-memory-recurrent.h"
  11. #include "ggml-cpp.h"
  12. #include "ggml-delta.h"
  13. #include <algorithm>
  14. #include <cassert>
  15. #include <cmath>
  16. #include <cfloat>
  17. #include <cstring>
  18. #include <cmath>
  19. #include <functional>
  20. #include <map>
  21. #include <regex>
  22. #include <sstream>
  23. #include <stdexcept>
const char * llm_type_name(llm_type type) {
    switch (type) {
        case LLM_TYPE_14M: return "14M";
        case LLM_TYPE_17M: return "17M";
        case LLM_TYPE_22M: return "22M";
        case LLM_TYPE_33M: return "33M";
        case LLM_TYPE_60M: return "60M";
        case LLM_TYPE_70M: return "70M";
        case LLM_TYPE_80M: return "80M";
        case LLM_TYPE_109M: return "109M";
        case LLM_TYPE_137M: return "137M";
        case LLM_TYPE_140M: return "140M";
        case LLM_TYPE_160M: return "160M";
        case LLM_TYPE_190M: return "190M";
        case LLM_TYPE_220M: return "220M";
        case LLM_TYPE_250M: return "250M";
        case LLM_TYPE_256M: return "256M";
        case LLM_TYPE_270M: return "270M";
        case LLM_TYPE_335M: return "335M";
        case LLM_TYPE_350M: return "350M";
        case LLM_TYPE_360M: return "360M";
        case LLM_TYPE_410M: return "410M";
        case LLM_TYPE_450M: return "450M";
        case LLM_TYPE_475M: return "475M";
        case LLM_TYPE_558M: return "558M";
        case LLM_TYPE_700M: return "700M";
        case LLM_TYPE_770M: return "770M";
        case LLM_TYPE_780M: return "780M";
        case LLM_TYPE_950M: return "950M";
        case LLM_TYPE_0_3B: return "0.3B";
        case LLM_TYPE_0_5B: return "0.5B";
        case LLM_TYPE_0_6B: return "0.6B";
        case LLM_TYPE_1B: return "1B";
        case LLM_TYPE_1_2B: return "1.2B";
        case LLM_TYPE_1_3B: return "1.3B";
        case LLM_TYPE_1_4B: return "1.4B";
        case LLM_TYPE_1_5B: return "1.5B";
        case LLM_TYPE_1_6B: return "1.6B";
        case LLM_TYPE_1_7B: return "1.7B";
        case LLM_TYPE_1_8B: return "1.8B";
        case LLM_TYPE_2B: return "2B";
        case LLM_TYPE_2_8B: return "2.8B";
        case LLM_TYPE_2_9B: return "2.9B";
        case LLM_TYPE_3B: return "3B";
        case LLM_TYPE_4B: return "4B";
        case LLM_TYPE_6B: return "6B";
        case LLM_TYPE_6_9B: return "6.9B";
        case LLM_TYPE_7B: return "7B";
        case LLM_TYPE_8B: return "8B";
        case LLM_TYPE_9B: return "9B";
        case LLM_TYPE_11B: return "11B";
        case LLM_TYPE_12B: return "12B";
        case LLM_TYPE_13B: return "13B";
        case LLM_TYPE_14B: return "14B";
        case LLM_TYPE_15B: return "15B";
        case LLM_TYPE_16B: return "16B";
        case LLM_TYPE_20B: return "20B";
        case LLM_TYPE_27B: return "27B";
        case LLM_TYPE_30B: return "30B";
        case LLM_TYPE_32B: return "32B";
        case LLM_TYPE_34B: return "34B";
        case LLM_TYPE_35B: return "35B";
        case LLM_TYPE_36B: return "36B";
        case LLM_TYPE_40B: return "40B";
        case LLM_TYPE_65B: return "65B";
        case LLM_TYPE_70B: return "70B";
        case LLM_TYPE_120B: return "120B";
        case LLM_TYPE_142B: return "142B";
        case LLM_TYPE_236B: return "236B";
        case LLM_TYPE_290B: return "290B";
        case LLM_TYPE_314B: return "314B";
        case LLM_TYPE_405B: return "405B";
        case LLM_TYPE_671B: return "671B";
        case LLM_TYPE_SMALL: return "0.1B";
        case LLM_TYPE_MEDIUM: return "0.4B";
        case LLM_TYPE_LARGE: return "0.8B";
        case LLM_TYPE_XL: return "1.5B";
        case LLM_TYPE_A1_7B: return "A1.7B";
        case LLM_TYPE_A2_7B: return "A2.7B";
        case LLM_TYPE_8x7B: return "8x7B";
        case LLM_TYPE_8x22B: return "8x22B";
        case LLM_TYPE_16x12B: return "16x12B";
        case LLM_TYPE_16x3_8B: return "16x3.8B";
        case LLM_TYPE_10B_128x3_66B: return "10B+128x3.66B";
        case LLM_TYPE_57B_A14B: return "57B.A14B";
        case LLM_TYPE_17B_16E: return "17Bx16E (Scout)";
        case LLM_TYPE_17B_128E: return "17Bx128E (Maverick)";
        case LLM_TYPE_A13B: return "A13B";
        case LLM_TYPE_21B_A3B: return "21B.A3B";
        case LLM_TYPE_30B_A3B: return "30B.A3B";
        case LLM_TYPE_80B_A3B: return "80B.A3B";
        case LLM_TYPE_106B_A12B: return "106B.A12B";
        case LLM_TYPE_235B_A22B: return "235B.A22B";
        case LLM_TYPE_300B_A47B: return "300B.A47B";
        case LLM_TYPE_355B_A32B: return "355B.A32B";
        case LLM_TYPE_E2B: return "E2B";
        case LLM_TYPE_E4B: return "E4B";
        default: return "?B";
    }
}

static const char * llama_expert_gating_func_name(llama_expert_gating_func_type type) {
    switch (type) {
        case LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX: return "softmax";
        case LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID: return "sigmoid";
        default: return "unknown";
    }
}

static const std::map<llama_rope_scaling_type, const char *> LLAMA_ROPE_SCALING_TYPES = {
    { LLAMA_ROPE_SCALING_TYPE_NONE,     "none"     },
    { LLAMA_ROPE_SCALING_TYPE_LINEAR,   "linear"   },
    { LLAMA_ROPE_SCALING_TYPE_YARN,     "yarn"     },
    { LLAMA_ROPE_SCALING_TYPE_LONGROPE, "longrope" },
};

std::string llama_rope_scaling_type_name(llama_rope_scaling_type rope_scaling_type) {
    return LLAMA_ROPE_SCALING_TYPES.at(rope_scaling_type);
}

static llama_rope_scaling_type llama_rope_scaling_type_from_string(const std::string & name) {
    for (const auto & kv : LLAMA_ROPE_SCALING_TYPES) {
        if (kv.second == name) {
            return (llama_rope_scaling_type) kv.first;
        }
    }

    return LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
}
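
// Illustrative sketch (usage, not part of the logic above): the two helpers are
// meant to round-trip between the enum and the string form stored in GGUF
// metadata; unrecognized strings fall back to the UNSPECIFIED sentinel.
//
//     llama_rope_scaling_type st = llama_rope_scaling_type_from_string("yarn");
//     // st == LLAMA_ROPE_SCALING_TYPE_YARN
//     std::string name = llama_rope_scaling_type_name(st); // "yarn"
//     // llama_rope_scaling_type_from_string("bogus") == LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED
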
// checks if the weight tensor can be used with the specified buffer type and device
static bool weight_buft_supported(const llama_hparams & hparams, ggml_tensor * w, ggml_op op, ggml_backend_buffer_type_t buft, ggml_backend_dev_t dev) {
    GGML_ASSERT(w != nullptr);

    if (op == GGML_OP_NONE) {
        return true;
    }

    ggml_init_params params = {
        /*.mem_size   =*/ ggml_tensor_overhead()*8,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ true,
    };

    ggml_context_ptr ctx_ptr { ggml_init(params) };
    if (!ctx_ptr) {
        throw std::runtime_error(format("failed to create ggml context"));
    }
    ggml_context * ctx = ctx_ptr.get();

    ggml_tensor * op_tensor = nullptr;

    switch (op) {
        case GGML_OP_GET_ROWS:
            {
                ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 512);
                op_tensor = ggml_get_rows(ctx, w, b);
            } break;
        case GGML_OP_MUL_MAT:
            {
                ggml_tensor * b = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], 512, w->ne[2], w->ne[3]);
                op_tensor = ggml_mul_mat(ctx, w, b);
            } break;
        case GGML_OP_MUL_MAT_ID:
            {
                int n_expert_used = hparams.n_expert_used;
                ggml_tensor * b = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, w->ne[0], n_expert_used, 512);
                ggml_tensor * ids = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, n_expert_used, 512);
                op_tensor = ggml_mul_mat_id(ctx, w, b, ids);
            } break;
        case GGML_OP_ADD:
            {
                ggml_tensor * a = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], w->ne[1], w->ne[2], w->ne[3]);
                op_tensor = ggml_add(ctx, a, w);
            } break;
        case GGML_OP_ADD_ID:
            {
                int n_expert_used = hparams.n_expert_used;
                ggml_tensor * a = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, w->ne[0], n_expert_used, 512);
                ggml_tensor * c = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, n_expert_used, 512);
                op_tensor = ggml_add_id(ctx, a, w, c);
            } break;
        case GGML_OP_MUL:
            {
                ggml_tensor * a = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], w->ne[1], w->ne[2], w->ne[3]);
                op_tensor = ggml_mul(ctx, a, w);
            } break;
        case GGML_OP_DIV:
            {
                ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, w->ne[0]);
                op_tensor = ggml_div(ctx, a, w);
            } break;
        case GGML_OP_ROPE:
            {
                int n_embd_head = hparams.n_embd_head_v;
                int n_head = hparams.n_head();
                ggml_tensor * a = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_embd_head, n_head, 512);
                ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 512);
                op_tensor = ggml_rope_ext(
                    ctx, a, b, w,
                    0, 0, 0, 0, 0,
                    0, 0, 0, 0
                );
            } break;
        case GGML_OP_SSM_CONV:
            {
                const int64_t n_seq_tokens = 512;
                const int64_t n_seqs = 3;
                ggml_tensor * conv_x = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, w->ne[0] - 1 + n_seq_tokens, w->ne[1], n_seqs);
                op_tensor = ggml_ssm_conv(ctx, conv_x, w);
            } break;
        case GGML_OP_SSM_SCAN:
            {
                // w is ssm_a, which is used to distinguish Mamba-1 and Mamba-2
                const int64_t d_state = w->ne[0] == 1 ? hparams.ssm_d_state : w->ne[0];
                const int64_t n_head = w->ne[1];
                const int64_t head_dim = hparams.ssm_d_inner / n_head;
                const int64_t n_group = hparams.ssm_n_group ? hparams.ssm_n_group : 1;
                const int64_t n_seq_tokens = 512;
                const int64_t n_seqs = 3;
                ggml_tensor * s = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, d_state, head_dim, n_head, n_seqs);
                ggml_tensor * x = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, head_dim, n_head, n_seq_tokens, n_seqs);
                ggml_tensor * dt = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_head, n_seq_tokens, n_seqs);
                ggml_tensor * B = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, d_state, n_group, n_seq_tokens, n_seqs);
                ggml_tensor * C = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, d_state, n_group, n_seq_tokens, n_seqs);
                ggml_tensor * ids = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_seqs);
                op_tensor = ggml_ssm_scan(ctx, s, x, dt, w, B, C, ids);
            } break;
        case GGML_OP_RWKV_WKV6:
            {
                // FIXME
                const int64_t S = 123;
                const int64_t H = 123;
                const int64_t n_tokens = 123;
                const int64_t n_seqs = 123;
                ggml_tensor * k = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens);
                ggml_tensor * v = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens);
                ggml_tensor * r = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens);
                ggml_tensor * tf = w;
                ggml_tensor * td = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens);
                ggml_tensor * state = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, S, n_seqs, S, H);
                op_tensor = ggml_rwkv_wkv6(ctx, k, v, r, tf, td, state);
            } break;
        case GGML_OP_IM2COL:
            {
                const int n_embd = hparams.n_embd;
                ggml_tensor * b = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, n_embd, w->ne[1], 1, 1);
                op_tensor = ggml_im2col(ctx, w, b, 1, 0, 0, 0, 1, 0, false, GGML_TYPE_F16);
            } break;
        case GGML_OP_SCALE:
            {
                op_tensor = ggml_scale(ctx, w, 1.0f);
            } break;
        default:
            GGML_ABORT("%s: missing test for op %s for tensor %s", __func__, ggml_op_name(op), w->name);
    }

    // create a temporary dummy buffer for the weight so that supports_op can check the buffer type
    GGML_ASSERT(w->buffer == nullptr);
    w->buffer = ggml_backend_buft_alloc_buffer(buft, 0);
    bool op_supported = ggml_backend_dev_supports_op(dev, op_tensor);
    ggml_backend_buffer_free(w->buffer);
    w->buffer = nullptr;

    return op_supported;
}
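
// Illustrative sketch of how the probe above is intended to be used: build a
// dummy op that consumes the weight, temporarily attach a zero-sized buffer of
// the candidate type, and ask the device whether it supports that op. The
// variables in this example (hparams, layer.wq) are placeholders, not code
// from this file:
//
//     ggml_backend_dev_t dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
//     ggml_backend_buffer_type_t buft = ggml_backend_dev_buffer_type(dev);
//     const bool ok = weight_buft_supported(hparams, layer.wq, GGML_OP_MUL_MAT, buft, dev);
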
// lists of buffer types used for each layer
using buft_list_t = std::vector<std::pair<ggml_backend_dev_t, ggml_backend_buffer_type_t>>;

// find the first buffer type in the list that can use the tensor
static ggml_backend_buffer_type_t select_weight_buft(const llama_hparams & hparams, ggml_tensor * tensor, ggml_op op, const buft_list_t & buft_list) {
    GGML_ASSERT(!buft_list.empty());
    for (const auto & cur : buft_list) {
        ggml_backend_dev_t cur_dev = cur.first;
        ggml_backend_buffer_type_t cur_buft = cur.second;
        if (weight_buft_supported(hparams, tensor, op, cur_buft, cur_dev)) {
            return cur_buft;
        }
    }

    return nullptr;
}
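
// Illustrative sketch: callers walk a priority-ordered buft_list and take the
// first entry whose device can run the op on this weight; a nullptr result
// means no candidate worked and loading should fail. Roughly (buft_list
// construction is shown below; devices, hparams and tok_embd are placeholders):
//
//     buft_list_t list = make_cpu_buft_list(devices, /*use_extra_bufts=*/true);
//     ggml_backend_buffer_type_t buft = select_weight_buft(hparams, tok_embd, GGML_OP_GET_ROWS, list);
//     if (buft == nullptr) { /* no usable buffer type for this tensor */ }
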
// CPU: ACCEL -> GPU host -> CPU extra -> CPU
static buft_list_t make_cpu_buft_list(const std::vector<ggml_backend_dev_t> & devices, bool use_extra_bufts) {
    buft_list_t buft_list;

    // add ACCEL buffer types
    for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
        ggml_backend_dev_t dev = ggml_backend_dev_get(i);
        if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_ACCEL) {
            auto * buft = ggml_backend_dev_buffer_type(dev);
            // skip
            if (buft != ggml_backend_cpu_buffer_type()) {
                buft_list.emplace_back(dev, buft);
            }
        }
    }

    // add a host buffer type
    // storing the tensors in a host buffer is useful when the processing of large batches
    // is offloaded to a GPU device, since it reduces the time spent on data transfers
    // generally, this will be done using the first device in the list
    // a better approach would be to handle this on a weight-by-weight basis using the offload_op
    // function of the device to determine if it would benefit from being stored in a host buffer
    for (auto * dev : devices) {
        ggml_backend_buffer_type_t buft = ggml_backend_dev_host_buffer_type(dev);
        if (buft) {
            buft_list.emplace_back(dev, buft);
            break;
        }
    }

    // add extra buffer types
    if (use_extra_bufts) {
        auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
        if (cpu_dev == nullptr) {
            throw std::runtime_error(format("%s: no CPU backend found", __func__));
        }

        auto * cpu_reg = ggml_backend_dev_backend_reg(cpu_dev);
        auto ggml_backend_dev_get_extra_bufts_fn = (ggml_backend_dev_get_extra_bufts_t)
            ggml_backend_reg_get_proc_address(cpu_reg, "ggml_backend_dev_get_extra_bufts");
        if (ggml_backend_dev_get_extra_bufts_fn) {
            ggml_backend_buffer_type_t * extra_bufts = ggml_backend_dev_get_extra_bufts_fn(cpu_dev);
            while (extra_bufts && *extra_bufts) {
                buft_list.emplace_back(cpu_dev, *extra_bufts);
                ++extra_bufts;
            }
        }
    }

    // add the CPU buffer type
    for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
        ggml_backend_dev_t dev = ggml_backend_dev_get(i);
        if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_CPU) {
            buft_list.emplace_back(dev, ggml_backend_dev_buffer_type(dev));
        }
    }

    return buft_list;
}
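
// Illustrative example of the priority order produced above, assuming a build
// with one accelerator backend, one GPU that exposes a host buffer type, and
// CPU extra buffer types (e.g. repacked weight layouts):
//
//     [ (ACCEL dev, ACCEL buft),
//       (GPU dev,   GPU host buft),
//       (CPU dev,   CPU extra buft 0), ...,
//       (CPU dev,   CPU buft) ]
//
// select_weight_buft() tries these front to back, so the plain CPU buffer type
// acts as the final fallback.
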
// GPU: split if LLAMA_SPLIT_MODE_ROW -> GPU
static buft_list_t make_gpu_buft_list(ggml_backend_dev_t dev, llama_split_mode split_mode, const float * tensor_split) {
    buft_list_t buft_list;

    // add the device split buffer type if requested and available
    if (split_mode == LLAMA_SPLIT_MODE_ROW) {
        ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev);
        auto ggml_backend_split_buffer_type_fn = (ggml_backend_split_buffer_type_t)
            ggml_backend_reg_get_proc_address(reg, "ggml_backend_split_buffer_type");
        if (ggml_backend_split_buffer_type_fn) {
            size_t dev_index = [&]() {
                auto * reg = ggml_backend_dev_backend_reg(dev);
                for (size_t i = 0; i < ggml_backend_reg_dev_count(reg); ++i) {
                    if (ggml_backend_reg_dev_get(reg, i) == dev) {
                        return i;
                    }
                }
                throw std::runtime_error(format("device %s not found in its backend reg", ggml_backend_dev_name(dev)));
            }();
            auto * buft = ggml_backend_split_buffer_type_fn(dev_index, tensor_split);
            if (buft != nullptr) {
                buft_list.emplace_back(dev, buft);
            }
        }
    }

    // add the device default buffer type
    buft_list.emplace_back(dev, ggml_backend_dev_buffer_type(dev));

    return buft_list;
}
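
// Illustrative sketch: for row-split multi-GPU setups the split buffer type is
// tried first, otherwise only the device default buffer type is used. Assuming
// dev is a GPU device and tensor_split holds the usual per-device proportions:
//
//     buft_list_t gpu_list = make_gpu_buft_list(dev, LLAMA_SPLIT_MODE_ROW, tensor_split);
//     // gpu_list = [ (dev, split buft), (dev, default buft) ] if the backend exposes
//     //            "ggml_backend_split_buffer_type", otherwise just the default buft
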
struct llama_model::impl {
    impl() {}
    ~impl() {}

    uint64_t n_elements = 0;
    size_t n_bytes = 0;

    std::string desc_str;

    // model memory mapped files
    llama_mmaps mappings;

    // objects representing data potentially being locked in memory
    llama_mlocks mlock_bufs;
    llama_mlocks mlock_mmaps;

    // contexts where the model tensors metadata is stored
    std::vector<ggml_context_ptr> ctxs;

    // the model memory buffers for the tensor data
    std::vector<ggml_backend_buffer_ptr> bufs;

    buft_list_t cpu_buft_list;
    std::map<ggml_backend_dev_t, buft_list_t> gpu_buft_list;

    struct layer_dev {
        ggml_backend_dev_t dev;
        buft_list_t * buft_list;
    };

    layer_dev dev_input = {};
    layer_dev dev_output = {};
    std::vector<layer_dev> dev_layer;

    bool has_tensor_overrides;
};

llama_model::llama_model(const llama_model_params & params) : params(params), pimpl(std::make_unique<impl>()) {
    pimpl->has_tensor_overrides = params.tensor_buft_overrides && params.tensor_buft_overrides[0].pattern;
}
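
// Illustrative sketch of a tensor buffer-type override list as checked above: a
// pattern-terminated array where each entry maps a tensor-name regex to a
// buffer type. The exact struct layout lives in the public header; the pattern
// and buffer type below are assumption-level examples, not code from this file:
//
//     llama_model_tensor_buft_override overrides[] = {
//         { "ffn_(up|down|gate)_exps", ggml_backend_cpu_buffer_type() },
//         { nullptr, nullptr }, // terminator: pattern == nullptr ends the list
//     };
//     llama_model_params mparams = llama_model_default_params();
//     mparams.tensor_buft_overrides = overrides;
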
llama_model::~llama_model() {}

void llama_model::load_stats(llama_model_loader & ml) {
    pimpl->n_elements = ml.n_elements;
    pimpl->n_bytes = ml.n_bytes;
}

void llama_model::load_arch(llama_model_loader & ml) {
    arch = ml.get_arch();
    if (arch == LLM_ARCH_UNKNOWN) {
        throw std::runtime_error("unknown model architecture: '" + ml.get_arch_name() + "'");
    }
}
void llama_model::load_hparams(llama_model_loader & ml) {
    const gguf_context * ctx = ml.meta.get();

    // get metadata as string
    for (int i = 0; i < gguf_get_n_kv(ctx); i++) {
        gguf_type type = gguf_get_kv_type(ctx, i);
        if (type == GGUF_TYPE_ARRAY) {
            continue;
        }
        const char * name = gguf_get_key(ctx, i);
        const std::string value = gguf_kv_to_str(ctx, i);
        gguf_kv.emplace(name, value);
    }

    // get general kv
    ml.get_key(LLM_KV_GENERAL_NAME, name, false);

    // everything past this point is not vocab-related
    if (hparams.vocab_only) {
        return;
    }

    ml.get_key(LLM_KV_CONTEXT_LENGTH, hparams.n_ctx_train);
    ml.get_key(LLM_KV_EMBEDDING_LENGTH, hparams.n_embd);
    ml.get_key(LLM_KV_BLOCK_COUNT, hparams.n_layer);
    ml.get_key(LLM_KV_EXPERT_COUNT, hparams.n_expert, false);
    ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used, false);

    if (arch == LLM_ARCH_WAVTOKENIZER_DEC) {
        ml.get_key(LLM_KV_FEATURES_LENGTH, hparams.n_embd_features);
        ml.get_key(LLM_KV_POSNET_EMBEDDING_LENGTH, hparams.posnet.n_embd);
        ml.get_key(LLM_KV_POSNET_BLOCK_COUNT, hparams.posnet.n_layer);
        ml.get_key(LLM_KV_CONVNEXT_EMBEDDING_LENGTH, hparams.convnext.n_embd);
        ml.get_key(LLM_KV_CONVNEXT_BLOCK_COUNT, hparams.convnext.n_layer);
    }

    GGML_ASSERT(hparams.n_expert <= LLAMA_MAX_EXPERTS);
    GGML_ASSERT(hparams.n_expert_used <= hparams.n_expert);
    if (hparams.n_expert > 0) {
        GGML_ASSERT(hparams.n_expert_used > 0);
    } else {
        GGML_ASSERT(hparams.n_expert_used == 0);
    }

    std::fill(hparams.n_head_arr.begin(), hparams.n_head_arr.end(), 0);
    std::fill(hparams.n_head_kv_arr.begin(), hparams.n_head_kv_arr.end(), 0);
    std::fill(hparams.n_ff_arr.begin(), hparams.n_ff_arr.end(), 0);
    std::fill(
        hparams.recurrent_layer_arr.begin(),
        hparams.recurrent_layer_arr.end(),
        llm_arch_is_recurrent(ml.get_arch()));

    std::fill(hparams.rope_sections.begin(), hparams.rope_sections.end(), 0);
    std::fill(hparams.swa_layers.begin(), hparams.swa_layers.end(), 0);

    ml.get_key_or_arr(LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff_arr, hparams.n_layer, false);
    ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head_arr, hparams.n_layer, false);

    // n_head_kv is optional, default to n_head
    hparams.n_head_kv_arr = hparams.n_head_arr;
    ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT_KV, hparams.n_head_kv_arr, hparams.n_layer, false);

    bool rope_finetuned = false;
    ml.get_key(LLM_KV_ROPE_SCALING_FINETUNED, rope_finetuned, false);
    hparams.rope_finetuned = rope_finetuned;

    hparams.n_ctx_orig_yarn = hparams.n_ctx_train;
    ml.get_key(LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, hparams.n_ctx_orig_yarn, false);

    // rope_freq_base (optional)
    hparams.rope_freq_base_train = 10000.0f;
    ml.get_key(LLM_KV_ROPE_FREQ_BASE, hparams.rope_freq_base_train, false);

    std::string rope_scaling("linear");
    ml.get_key(LLM_KV_ROPE_SCALING_TYPE, rope_scaling, false);
    hparams.rope_scaling_type_train = llama_rope_scaling_type_from_string(rope_scaling);
    GGML_ASSERT(hparams.rope_scaling_type_train != LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED);

    // rope_freq_scale (inverse of the kv) is optional
    float ropescale = 0.0f;
    if (!ml.get_key(LLM_KV_ROPE_SCALING_FACTOR, ropescale, false)) {
        // try the old key name
        ml.get_key(LLM_KV_ROPE_SCALE_LINEAR, ropescale, false);
    }
    hparams.rope_freq_scale_train = ropescale == 0.0f ? 1.0f : 1.0f/ropescale;
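    // worked example (illustrative): a GGUF that stores a rope scaling factor of 4.0
    // (context stretched 4x at finetune time) yields rope_freq_scale_train = 1.0f/4.0f = 0.25f;
    // a missing or zero factor keeps the default scale of 1.0f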
  482. // by default assume that the sliding-window layers use the same scaling type as the non-sliding-window layers
  483. hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
  484. hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
  485. ml.get_key(LLM_KV_ROPE_SCALING_ATTN_FACTOR, hparams.rope_attn_factor, false);
  486. // non-transformer models do not have attention heads
  487. if (hparams.n_head() > 0) {
  488. // gpt-neox n_rot = rotary_pct * (n_embd / n_head)
  489. // gpt-j n_rot = rotary_dim
  490. hparams.n_embd_head_k = hparams.n_embd / hparams.n_head();
  491. ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH, hparams.n_embd_head_k, false);
  492. hparams.n_embd_head_v = hparams.n_embd / hparams.n_head();
  493. ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH, hparams.n_embd_head_v, false);
  494. // sanity check for n_rot (optional)
  495. hparams.n_rot = hparams.n_embd_head_k;
  496. ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot, false);
  497. if (arch == LLM_ARCH_LLAMA || arch == LLM_ARCH_DECI || arch == LLM_ARCH_FALCON) {
  498. if (hparams.n_rot != hparams.n_embd_head_k) {
  499. throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd_head_k));
  500. }
  501. }
  502. } else {
  503. hparams.n_rot = 0;
  504. hparams.n_embd_head_k = 0;
  505. hparams.n_embd_head_v = 0;
  506. }
  507. // for differentiating model types
  508. uint32_t n_vocab = 0;
  509. ml.get_key(LLM_KV_VOCAB_SIZE, n_vocab, false) || ml.get_arr_n(LLM_KV_TOKENIZER_LIST, n_vocab, false);
  510. // for classifier models
  511. ml.get_arr(LLM_KV_CLASSIFIER_OUTPUT_LABELS, classifier_labels, false);
  512. if (!classifier_labels.empty()) {
  513. hparams.n_cls_out = classifier_labels.size();
  514. }
  515. // arch-specific KVs
  516. switch (arch) {
  517. case LLM_ARCH_LLAMA:
  518. {
  519. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  520. if (hparams.n_expert == 8) {
  521. switch (hparams.n_layer) {
  522. case 32: type = LLM_TYPE_8x7B; break;
  523. case 56: type = LLM_TYPE_8x22B; break;
  524. default: type = LLM_TYPE_UNKNOWN;
  525. }
  526. } else {
  527. switch (hparams.n_layer) {
  528. case 16: type = LLM_TYPE_1B; break; // Llama 3.2 1B
  529. case 22: type = LLM_TYPE_1B; break;
  530. case 26: type = LLM_TYPE_3B; break;
  531. case 28: type = LLM_TYPE_3B; break; // Llama 3.2 3B
  532. case 30: type = LLM_TYPE_256M; break; // smoldocling 256M
533. // granite uses a vocab of size 49152
  534. case 32: type = n_vocab == 49152 ? LLM_TYPE_3B : (n_vocab < 40000 ? LLM_TYPE_7B : LLM_TYPE_8B); break;
  535. case 36: type = LLM_TYPE_8B; break; // granite
  536. case 40: type = LLM_TYPE_13B; break;
  537. case 48: type = LLM_TYPE_34B; break;
  538. case 60: type = LLM_TYPE_30B; break;
  539. case 80: type = hparams.n_head() == hparams.n_head_kv() ? LLM_TYPE_65B : LLM_TYPE_70B; break;
  540. default: type = LLM_TYPE_UNKNOWN;
  541. }
  542. }
  543. } break;
  544. case LLM_ARCH_LLAMA4:
  545. {
  546. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  547. ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
  548. ml.get_key(LLM_KV_INTERLEAVE_MOE_LAYER_STEP, hparams.n_moe_layer_step);
  549. const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
  550. if (found_swa && hparams.n_swa == 0) {
  551. hparams.swa_type = LLAMA_SWA_TYPE_NONE;
  552. hparams.n_no_rope_layer_step = hparams.n_layer; // always use rope
  553. } else {
  554. hparams.swa_type = LLAMA_SWA_TYPE_CHUNKED;
  555. hparams.n_swa = 8192;
  556. hparams.set_swa_pattern(4); // pattern: 3 chunked - 1 full
  557. }
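// roughly: with the chunked SWA defaults above, three out of every four layers attend within an
// 8192-token chunk and every fourth layer attends globally (the exact layer indices depend on
// how set_swa_pattern() lays out the pattern)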
  558. switch (hparams.n_expert) {
  559. case 0: {
  560. // MobileLLM (no MoE)
  561. switch (hparams.n_embd) {
  562. case 2048: type = LLM_TYPE_140M; break;
  563. case 4096: type = LLM_TYPE_360M; break;
  564. case 6144: type = LLM_TYPE_950M; break;
  565. default: type = LLM_TYPE_UNKNOWN;
  566. }
  567. } break;
  568. case 16: type = LLM_TYPE_17B_16E; break;
  569. case 128: type = LLM_TYPE_17B_128E; break;
  570. default: type = LLM_TYPE_UNKNOWN;
  571. }
  572. hparams.use_kq_norm = type != LLM_TYPE_17B_128E;
  573. } break;
  574. case LLM_ARCH_ARCEE:
  575. {
  576. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  577. // Arcee uses the same structure as Llama
  578. switch (hparams.n_layer) {
  579. case 36: type = LLM_TYPE_4B; break;
  580. default: type = LLM_TYPE_UNKNOWN;
  581. }
  582. } break;
  583. case LLM_ARCH_DECI:
  584. {
  585. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  586. switch (hparams.n_layer) {
  587. case 32: type = LLM_TYPE_7B; break;
  588. case 80: type = LLM_TYPE_70B; break;
  589. case 162: type = LLM_TYPE_405B; break;
  590. default: type = LLM_TYPE_UNKNOWN;
  591. }
  592. } break;
  593. case LLM_ARCH_MINICPM:
  594. {
  595. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  596. ml.get_key(LLM_KV_EMBEDDING_SCALE, hparams.f_embedding_scale);
  597. ml.get_key(LLM_KV_RESIDUAL_SCALE, hparams.f_residual_scale);
  598. ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
  599. // MiniCPM uses rope by default, unlike Granite which uses it as a switch
  600. hparams.rope_finetuned = true;
  601. switch (hparams.n_layer) {
  602. case 52: type = LLM_TYPE_1B; break;
  603. case 40: type = LLM_TYPE_2B; break;
  604. default: type = LLM_TYPE_UNKNOWN;
  605. }
  606. } break;
  607. case LLM_ARCH_MINICPM3:
  608. {
  609. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  610. ml.get_key(LLM_KV_ATTENTION_Q_LORA_RANK, hparams.n_lora_q);
  611. ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK, hparams.n_lora_kv);
  612. switch (hparams.n_layer) {
  613. case 62: type = LLM_TYPE_4B; break;
  614. default: type = LLM_TYPE_UNKNOWN;
  615. }
  616. } break;
  617. case LLM_ARCH_GROK:
  618. {
  619. // defaults for old GGUFs
  620. hparams.yarn_beta_fast = 8.0f;
  621. hparams.f_logit_scale = 0.5773502691896257f;
  622. hparams.f_embedding_scale = 78.38367176906169f;
  623. hparams.f_attn_out_scale = 0.08838834764831845f;
  624. hparams.f_attn_logit_softcapping = 30.0f;
  625. hparams.f_router_logit_softcapping = 30.0f;
  626. // no final_logit_softcapping in grok-1
  627. hparams.f_final_logit_softcapping = 0.0f;
  628. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  629. ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
  630. ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale, false);
  631. ml.get_key(LLM_KV_EMBEDDING_SCALE, hparams.f_embedding_scale, false);
  632. ml.get_key(LLM_KV_ATTENTION_OUTPUT_SCALE, hparams.f_attn_out_scale, false);
  633. ml.get_key(LLM_KV_ATTN_LOGIT_SOFTCAPPING, hparams.f_attn_logit_softcapping, false);
  634. ml.get_key(LLM_KV_ROUTER_LOGIT_SOFTCAPPING, hparams.f_router_logit_softcapping, false);
  635. ml.get_key(LLM_KV_FINAL_LOGIT_SOFTCAPPING, hparams.f_final_logit_softcapping, false);
  636. ml.get_key(LLM_KV_ATTENTION_TEMPERATURE_LENGTH, hparams.attn_temp_length, false);
  637. ml.get_key(LLM_KV_ROPE_SCALING_YARN_EXT_FACTOR, hparams.yarn_ext_factor, false);
  638. ml.get_key(LLM_KV_ROPE_SCALING_YARN_ATTN_FACTOR, hparams.yarn_attn_factor, false);
  639. ml.get_key(LLM_KV_ROPE_SCALING_YARN_BETA_FAST, hparams.yarn_beta_fast, false);
  640. ml.get_key(LLM_KV_ROPE_SCALING_YARN_BETA_SLOW, hparams.yarn_beta_slow, false);
  641. switch (hparams.n_layer) {
  642. case 64: type = LLM_TYPE_314B; break;
  643. default: type = LLM_TYPE_UNKNOWN;
  644. }
  645. } break;
  646. case LLM_ARCH_FALCON:
  647. {
  648. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  649. switch (hparams.n_layer) {
  650. case 32: type = LLM_TYPE_7B; break;
  651. case 60: type = LLM_TYPE_40B; break;
  652. default: type = LLM_TYPE_UNKNOWN;
  653. }
  654. } break;
  655. case LLM_ARCH_BAICHUAN:
  656. {
  657. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  658. switch (hparams.n_layer) {
  659. case 32: type = LLM_TYPE_7B; break;
  660. case 40: type = LLM_TYPE_13B; break;
  661. default: type = LLM_TYPE_UNKNOWN;
  662. }
  663. if (type == LLM_TYPE_13B) {
664. // TODO: this should become a GGUF KV parameter
  665. hparams.f_max_alibi_bias = 8.0f;
  666. }
  667. } break;
  668. case LLM_ARCH_STARCODER:
  669. {
  670. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  671. switch (hparams.n_layer) {
  672. case 24: type = LLM_TYPE_1B; break;
  673. case 36: type = LLM_TYPE_3B; break;
  674. case 42: type = LLM_TYPE_7B; break;
  675. case 40: type = LLM_TYPE_15B; break;
  676. default: type = LLM_TYPE_UNKNOWN;
  677. }
  678. } break;
  679. case LLM_ARCH_REFACT:
  680. {
  681. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  682. switch (hparams.n_layer) {
  683. case 32: type = LLM_TYPE_1B; break;
  684. default: type = LLM_TYPE_UNKNOWN;
  685. }
686. // TODO: this should become a GGUF KV parameter
  687. hparams.f_max_alibi_bias = 8.0f;
  688. } break;
  689. case LLM_ARCH_BERT:
  690. {
  691. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  692. ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
  693. ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
  694. switch (hparams.n_layer) {
  695. case 3:
  696. type = LLM_TYPE_17M; break; // bge-micro
  697. case 6:
  698. type = LLM_TYPE_22M; break; // MiniLM-L6
  699. case 12:
  700. switch (hparams.n_embd) {
  701. case 384: type = LLM_TYPE_33M; break; // MiniLM-L12, bge-small
  702. case 768: type = LLM_TYPE_109M; break; // bge-base
  703. default: type = LLM_TYPE_UNKNOWN;
  704. } break;
  705. case 24:
  706. type = LLM_TYPE_335M; break; // bge-large
  707. default: type = LLM_TYPE_UNKNOWN;
  708. }
  709. } break;
  710. case LLM_ARCH_JINA_BERT_V2:
  711. {
  712. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  713. ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
  714. ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
  715. hparams.f_max_alibi_bias = 8.0f;
  716. switch (hparams.n_layer) {
  717. case 4: type = LLM_TYPE_33M; break; // jina-embeddings-small
  718. case 12: type = LLM_TYPE_137M; break; // jina-embeddings-base
  719. default: type = LLM_TYPE_UNKNOWN;
  720. }
  721. } break;
  722. case LLM_ARCH_JINA_BERT_V3:
  723. {
  724. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  725. ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
  726. ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
  727. switch (hparams.n_layer) {
  728. case 24:
  729. type = LLM_TYPE_558M; break;
  730. default: type = LLM_TYPE_UNKNOWN;
  731. }
  732. } break;
  733. case LLM_ARCH_NOMIC_BERT:
  734. case LLM_ARCH_NOMIC_BERT_MOE:
  735. {
  736. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  737. ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
  738. ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
739. ml.get_key(LLM_KV_MOE_EVERY_N_LAYERS, hparams.moe_every_n_layers, false);
  740. if (hparams.n_layer == 12 && hparams.n_embd == 768) {
  741. if (arch == LLM_ARCH_NOMIC_BERT) {
  742. type = LLM_TYPE_137M;
  743. } else if (arch == LLM_ARCH_NOMIC_BERT_MOE && hparams.moe_every_n_layers == 2) {
  744. type = LLM_TYPE_475M;
  745. }
  746. }
  747. } break;
  748. case LLM_ARCH_NEO_BERT:
  749. {
  750. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  751. ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
  752. ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
  753. if (hparams.n_layer == 28) {
  754. type = LLM_TYPE_250M;
  755. }
  756. } break;
  757. case LLM_ARCH_BLOOM:
  758. {
  759. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  760. switch (hparams.n_layer) {
  761. case 24: type = LLM_TYPE_1B; break;
  762. case 30:
  763. switch (hparams.n_embd) {
  764. case 2560: type = LLM_TYPE_3B; break;
  765. case 4096: type = LLM_TYPE_7B; break;
  766. default: type = LLM_TYPE_UNKNOWN;
  767. } break;
  768. default: type = LLM_TYPE_UNKNOWN;
  769. }
770. // TODO: this should become a GGUF KV parameter
  771. hparams.f_max_alibi_bias = 8.0f;
  772. } break;
  773. case LLM_ARCH_MPT:
  774. {
  775. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  776. ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV, hparams.f_clamp_kqv, false);
  777. ml.get_key(LLM_KV_ATTENTION_MAX_ALIBI_BIAS, hparams.f_max_alibi_bias);
  778. switch (hparams.n_layer) {
  779. case 32: type = LLM_TYPE_7B; break;
  780. case 48: type = LLM_TYPE_30B; break;
  781. default: type = LLM_TYPE_UNKNOWN;
  782. }
  783. } break;
  784. case LLM_ARCH_STABLELM:
  785. {
  786. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  787. switch (hparams.n_layer) {
  788. case 24: type = LLM_TYPE_1B; break;
  789. case 32: type = LLM_TYPE_3B; break;
  790. case 40: type = LLM_TYPE_12B; break;
  791. default: type = LLM_TYPE_UNKNOWN;
  792. }
  793. } break;
  794. case LLM_ARCH_QWEN:
  795. {
  796. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  797. switch (hparams.n_layer) {
  798. case 32: type = LLM_TYPE_7B; break;
  799. case 40: type = LLM_TYPE_13B; break;
  800. default: type = LLM_TYPE_UNKNOWN;
  801. }
  802. } break;
  803. case LLM_ARCH_QWEN2VL:
  804. {
  805. ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, true);
  806. }
  807. // fall through
  808. case LLM_ARCH_QWEN2:
  809. {
  810. ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
  811. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  812. switch (hparams.n_layer) {
  813. case 24: type = hparams.n_embd == 1024 ? LLM_TYPE_0_5B : LLM_TYPE_1B; break;
  814. case 28: type = hparams.n_embd == 1536 ? LLM_TYPE_1_5B : LLM_TYPE_7B; break;
  815. case 32: type = LLM_TYPE_7B; break;
  816. case 36: type = LLM_TYPE_3B; break;
  817. case 40: type = hparams.n_head() == 20 ? LLM_TYPE_4B : LLM_TYPE_13B; break;
  818. case 48: type = LLM_TYPE_14B; break;
  819. case 64: type = LLM_TYPE_32B; break;
  820. case 80: type = LLM_TYPE_70B; break;
  821. default: type = LLM_TYPE_UNKNOWN;
  822. }
  823. } break;
  824. case LLM_ARCH_DREAM:
  825. {
  826. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  827. // Dream models are primarily 7B with 28 layers
  828. switch (hparams.n_layer) {
  829. case 28:
  830. type = LLM_TYPE_7B;
  831. break;
  832. default:
  833. type = LLM_TYPE_UNKNOWN;
  834. }
  835. // Set non-causal attention for diffusion models
  836. hparams.causal_attn = false;
  837. }
  838. break;
  839. case LLM_ARCH_LLADA:
  840. {
  841. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  842. // LLaDA-8B has 32 layers, similar to LLaMA but for diffusion
  843. switch (hparams.n_layer) {
  844. case 32:
  845. type = LLM_TYPE_8B;
  846. break;
  847. default:
  848. type = LLM_TYPE_UNKNOWN;
  849. }
  850. // Set non-causal attention for diffusion models
  851. hparams.causal_attn = false;
  852. }
  853. break;
  854. case LLM_ARCH_LLADA_MOE:
  855. {
  856. ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
  857. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  858. // diffusion language model uses non-causal attention
  859. hparams.causal_attn = false;
  860. switch (hparams.n_layer) {
  861. case 16: type = LLM_TYPE_A1_7B; break;
  862. default: type = LLM_TYPE_UNKNOWN;
  863. }
  864. } break;
  865. case LLM_ARCH_QWEN2MOE:
  866. {
  867. ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
  868. ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false);
  869. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  870. switch (hparams.n_layer) {
  871. case 24: type = LLM_TYPE_A2_7B; break;
  872. case 28: type = LLM_TYPE_57B_A14B; break;
  873. default: type = LLM_TYPE_UNKNOWN;
  874. }
  875. } break;
  876. case LLM_ARCH_QWEN3:
  877. {
  878. ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
  879. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  880. switch (hparams.n_layer) {
  881. case 28: type = hparams.n_embd == 1024 ? LLM_TYPE_0_6B : LLM_TYPE_1_7B; break;
  882. case 36: type = hparams.n_embd == 2560 ? LLM_TYPE_4B : LLM_TYPE_8B; break;
  883. case 40: type = LLM_TYPE_14B; break;
  884. case 64: type = LLM_TYPE_32B; break;
  885. default: type = LLM_TYPE_UNKNOWN;
  886. }
  887. } break;
  888. case LLM_ARCH_QWEN3MOE:
  889. {
  890. ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
  891. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  892. switch (hparams.n_layer) {
  893. case 48: type = LLM_TYPE_30B_A3B; break;
  894. case 94: type = LLM_TYPE_235B_A22B; break;
  895. default: type = LLM_TYPE_UNKNOWN;
  896. }
  897. } break;
  898. case LLM_ARCH_PHI2:
  899. {
  900. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  901. switch (hparams.n_layer) {
  902. case 24: type = LLM_TYPE_1B; break;
  903. case 32: type = LLM_TYPE_3B; break;
  904. default: type = LLM_TYPE_UNKNOWN;
  905. }
  906. } break;
  907. case LLM_ARCH_PHI3:
  908. {
  909. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  910. switch (hparams.n_layer) {
  911. case 24: type = LLM_TYPE_1B; break;
  912. case 32: type = LLM_TYPE_3B; break;
  913. case 40: type = LLM_TYPE_14B; break;
  914. default: type = LLM_TYPE_UNKNOWN;
  915. }
  916. const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
  917. if (found_swa && hparams.n_swa > 0) {
  918. LLAMA_LOG_WARN("%s: Phi SWA is currently disabled - results might be suboptimal for some models (see %s)\n",
  919. __func__, "https://github.com/ggml-org/llama.cpp/pull/13676");
  920. // TODO: fix conversion scripts to correctly populate `n_swa` and `n_swa_pattern`
  921. hparams.swa_type = LLAMA_SWA_TYPE_NONE;
  922. hparams.n_swa = 0;
  923. hparams.set_swa_pattern(1);
  924. }
  925. } break;
  926. case LLM_ARCH_PHIMOE:
  927. {
  928. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  929. switch (hparams.n_layer) {
  930. case 32: type = LLM_TYPE_16x3_8B; break;
  931. default: type = LLM_TYPE_UNKNOWN;
  932. }
  933. } break;
  934. case LLM_ARCH_PLAMO:
  935. {
  936. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  937. switch (hparams.n_layer) {
  938. case 40: type = LLM_TYPE_13B; break;
  939. default: type = LLM_TYPE_UNKNOWN;
  940. }
  941. } break;
  942. case LLM_ARCH_PLAMO2:
  943. {
  944. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  945. // Load Mamba SSM parameters
  946. ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv);
  947. ml.get_key(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner);
  948. ml.get_key(LLM_KV_SSM_STATE_SIZE, hparams.ssm_d_state);
  949. ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
  950. ml.get_key(LLM_KV_SSM_GROUP_COUNT, hparams.ssm_n_group);
  951. for (uint32_t i = 0; i < hparams.n_layer; ++i) {
  952. hparams.recurrent_layer_arr[i] = hparams.n_head_kv(i) == 0;
  953. }
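// as with the other hybrid architectures in this file, a layer with zero KV heads is treated as
// a recurrent (Mamba) layer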
  954. switch (hparams.n_layer) {
  955. case 16: type = LLM_TYPE_1B; break;
  956. case 32:
  957. if (hparams.n_embd == 2048) {
  958. type = LLM_TYPE_2B;
  959. } else if (hparams.n_embd == 4096) {
  960. type = LLM_TYPE_8B;
  961. }
  962. break;
  963. default: type = LLM_TYPE_UNKNOWN;
  964. }
  965. } break;
  966. case LLM_ARCH_GPT2:
  967. {
  968. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  969. switch (hparams.n_layer) {
  970. case 12: type = LLM_TYPE_SMALL; break;
  971. case 24: type = LLM_TYPE_MEDIUM; break;
  972. case 36: type = LLM_TYPE_LARGE; break;
  973. case 48: type = LLM_TYPE_XL; break;
  974. default: type = LLM_TYPE_UNKNOWN;
  975. }
  976. } break;
  977. case LLM_ARCH_CODESHELL:
  978. {
  979. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  980. switch (hparams.n_layer) {
  981. case 42: type = LLM_TYPE_7B; break;
  982. default: type = LLM_TYPE_UNKNOWN;
  983. }
  984. } break;
  985. case LLM_ARCH_ORION:
  986. {
  987. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  988. switch (hparams.n_layer) {
  989. case 40: type = LLM_TYPE_14B; break;
  990. default: type = LLM_TYPE_UNKNOWN;
  991. }
  992. } break;
  993. case LLM_ARCH_INTERNLM2:
  994. {
  995. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  996. switch (hparams.n_layer) {
  997. case 32: type = LLM_TYPE_7B; break;
  998. case 48: type = LLM_TYPE_20B; break;
  999. default: type = LLM_TYPE_UNKNOWN;
  1000. }
  1001. } break;
  1002. case LLM_ARCH_GEMMA:
  1003. {
  1004. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1005. switch (hparams.n_layer) {
  1006. case 18: type = LLM_TYPE_2B; break;
  1007. case 28: type = LLM_TYPE_7B; break;
  1008. default: type = LLM_TYPE_UNKNOWN;
  1009. }
  1010. } break;
  1011. case LLM_ARCH_GEMMA2:
  1012. {
  1013. hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
1014. hparams.n_swa = 4096; // default value for Gemma 2
  1015. hparams.set_swa_pattern(2);
  1016. hparams.attn_soft_cap = true;
  1017. ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
  1018. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1019. ml.get_key(LLM_KV_ATTN_LOGIT_SOFTCAPPING, hparams.f_attn_logit_softcapping, false);
  1020. ml.get_key(LLM_KV_FINAL_LOGIT_SOFTCAPPING, hparams.f_final_logit_softcapping, false);
  1021. switch (hparams.n_layer) {
  1022. case 26: type = LLM_TYPE_2B; break;
  1023. case 42: type = LLM_TYPE_9B; break;
  1024. case 46: type = LLM_TYPE_27B; break;
  1025. default: type = LLM_TYPE_UNKNOWN;
  1026. }
  1027. // ref: https://github.com/google/gemma_pytorch/blob/014acb7ac4563a5f77c76d7ff98f31b568c16508/gemma/config.py#L173
  1028. hparams.f_attention_scale = type == LLM_TYPE_27B
  1029. ? 1.0f / std::sqrt(float(hparams.n_embd / hparams.n_head(0)))
  1030. : 1.0f / std::sqrt(float(hparams.n_embd_head_k));
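// per the referenced config, the 27B variant scales queries by n_embd / n_head while the smaller
// variants use the key head dimension; e.g. assuming n_embd = 4608 and 32 heads, the 27B scale
// would be 1/sqrt(144)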
  1031. } break;
  1032. case LLM_ARCH_GEMMA3:
  1033. {
  1034. hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
  1035. hparams.set_swa_pattern(6);
  1036. hparams.rope_freq_base_train_swa = 10000.0f;
  1037. hparams.rope_freq_scale_train_swa = 1.0f;
  1038. ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
  1039. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1040. switch (hparams.n_layer) {
  1041. case 18: type = LLM_TYPE_270M; break;
  1042. case 26: type = LLM_TYPE_1B; break;
  1043. case 34: type = LLM_TYPE_4B; break;
  1044. case 48: type = LLM_TYPE_12B; break;
  1045. case 62: type = LLM_TYPE_27B; break;
  1046. default: type = LLM_TYPE_UNKNOWN;
  1047. }
  1048. // ref: https://github.com/google/gemma_pytorch/blob/014acb7ac4563a5f77c76d7ff98f31b568c16508/gemma/config.py#L289
  1049. hparams.f_attention_scale = type == LLM_TYPE_27B
  1050. ? 1.0f / std::sqrt(float(hparams.n_embd / hparams.n_head(0)))
  1051. : 1.0f / std::sqrt(float(hparams.n_embd_head_k));
  1052. } break;
  1053. case LLM_ARCH_GEMMA3N:
  1054. {
  1055. hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
  1056. hparams.set_swa_pattern(5);
  1057. hparams.n_layer_kv_from_start = 20;
  1058. hparams.rope_freq_base_train_swa = 10000.0f;
  1059. hparams.rope_freq_scale_train_swa = 1.0f;
  1060. hparams.f_attention_scale = 1.0f;
  1061. ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
  1062. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1063. switch (hparams.n_layer) {
  1064. case 30: type = LLM_TYPE_E2B; break;
  1065. case 35: type = LLM_TYPE_E4B; break;
  1066. default: type = LLM_TYPE_UNKNOWN;
  1067. }
  1068. } break;
  1069. case LLM_ARCH_GEMMA_EMBEDDING:
  1070. {
  1071. hparams.swa_type = LLAMA_SWA_TYPE_SYMMETRIC;
  1072. hparams.set_swa_pattern(6);
  1073. hparams.causal_attn = false; // embeddings do not use causal attention
  1074. hparams.rope_freq_base_train_swa = 10000.0f;
  1075. hparams.rope_freq_scale_train_swa = 1.0f;
  1076. ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
  1077. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1078. ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
  1079. switch (hparams.n_layer) {
  1080. case 24: type = LLM_TYPE_0_3B; break;
  1081. default: type = LLM_TYPE_UNKNOWN;
  1082. }
  1083. hparams.f_attention_scale = 1.0f / std::sqrt(float(hparams.n_embd_head_k));
  1084. } break;
  1085. case LLM_ARCH_STARCODER2:
  1086. {
  1087. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  1088. switch (hparams.n_layer) {
  1089. case 30: type = LLM_TYPE_3B; break;
  1090. case 32: type = LLM_TYPE_7B; break;
  1091. case 40: type = LLM_TYPE_15B; break;
  1092. case 52: type = LLM_TYPE_20B; break; // granite
  1093. case 88: type = LLM_TYPE_34B; break; // granite
  1094. default: type = LLM_TYPE_UNKNOWN;
  1095. }
  1096. } break;
  1097. case LLM_ARCH_MAMBA:
  1098. {
  1099. ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv);
  1100. ml.get_key(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner);
  1101. ml.get_key(LLM_KV_SSM_STATE_SIZE, hparams.ssm_d_state);
  1102. ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
  1103. ml.get_key(LLM_KV_SSM_DT_B_C_RMS, hparams.ssm_dt_b_c_rms, false);
  1104. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1105. switch (hparams.n_layer) {
  1106. case 24:
  1107. switch (hparams.n_embd) {
  1108. case 768: type = LLM_TYPE_SMALL; break;
  1109. default: type = LLM_TYPE_UNKNOWN;
  1110. } break;
  1111. case 48:
  1112. switch (hparams.n_embd) {
  1113. case 1024: type = LLM_TYPE_MEDIUM; break;
  1114. case 1536: type = LLM_TYPE_LARGE; break;
  1115. case 2048: type = LLM_TYPE_XL; break;
  1116. default: type = LLM_TYPE_UNKNOWN;
  1117. } break;
  1118. case 64:
  1119. switch (hparams.n_embd) {
  1120. case 2560: type = LLM_TYPE_3B; break;
  1121. default: type = LLM_TYPE_UNKNOWN;
  1122. } break;
  1123. default: type = LLM_TYPE_UNKNOWN;
  1124. }
  1125. } break;
  1126. case LLM_ARCH_MAMBA2:
  1127. {
  1128. ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv);
  1129. ml.get_key(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner);
  1130. ml.get_key(LLM_KV_SSM_STATE_SIZE, hparams.ssm_d_state);
  1131. ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
  1132. ml.get_key(LLM_KV_SSM_GROUP_COUNT, hparams.ssm_n_group);
  1133. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1134. switch (hparams.n_layer) {
  1135. case 24:
  1136. switch (hparams.n_embd) {
  1137. case 768: type = LLM_TYPE_SMALL; break;
  1138. default: type = LLM_TYPE_UNKNOWN;
  1139. } break;
  1140. case 48:
  1141. switch (hparams.n_embd) {
  1142. case 1024: type = LLM_TYPE_MEDIUM; break;
  1143. case 1536: type = LLM_TYPE_LARGE; break;
  1144. case 2048: type = LLM_TYPE_XL; break;
  1145. default: type = LLM_TYPE_UNKNOWN;
  1146. } break;
  1147. case 64:
  1148. switch (hparams.n_embd) {
  1149. case 2560: type = LLM_TYPE_3B; break;
  1150. case 4096: type = LLM_TYPE_7B; break;
  1151. default: type = LLM_TYPE_UNKNOWN;
  1152. } break;
  1153. default: type = LLM_TYPE_UNKNOWN;
  1154. }
  1155. } break;
  1156. case LLM_ARCH_JAMBA:
  1157. {
  1158. ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv);
  1159. ml.get_key(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner);
  1160. ml.get_key(LLM_KV_SSM_STATE_SIZE, hparams.ssm_d_state);
  1161. ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
  1162. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1163. for (uint32_t i = 0; i < hparams.n_layer; ++i) {
  1164. hparams.recurrent_layer_arr[i] = hparams.n_head_kv(i) == 0;
  1165. }
  1166. switch (hparams.n_layer) {
1167. // TODO: Jamba layers are a bit heterogeneous, so naming this is hard.
  1168. case 12: // 900M 8x???M
  1169. case 32: // 51B 16x?B
  1170. default: type = LLM_TYPE_UNKNOWN;
  1171. }
  1172. } break;
  1173. case LLM_ARCH_XVERSE:
  1174. {
  1175. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1176. switch (hparams.n_layer) {
  1177. case 32: type = LLM_TYPE_7B; break;
  1178. case 40: type = LLM_TYPE_13B; break;
  1179. case 80: type = LLM_TYPE_65B; break;
  1180. default: type = LLM_TYPE_UNKNOWN;
  1181. }
  1182. } break;
  1183. case LLM_ARCH_COMMAND_R:
  1184. {
  1185. ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
  1186. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  1187. switch (hparams.n_layer) {
  1188. case 40: type = LLM_TYPE_35B; break;
  1189. default: type = LLM_TYPE_UNKNOWN;
  1190. }
  1191. } break;
  1192. case LLM_ARCH_COHERE2:
  1193. {
  1194. hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
  1195. hparams.set_swa_pattern(4);
  1196. ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
  1197. ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
  1198. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  1199. switch (hparams.n_layer) {
  1200. case 32: type = LLM_TYPE_8B; break;
  1201. default: type = LLM_TYPE_UNKNOWN;
  1202. }
  1203. } break;
  1204. case LLM_ARCH_DBRX:
  1205. {
  1206. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  1207. ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV, hparams.f_clamp_kqv);
  1208. switch (hparams.n_layer) {
  1209. case 40: type = LLM_TYPE_16x12B; break;
  1210. default: type = LLM_TYPE_UNKNOWN;
  1211. }
  1212. } break;
  1213. case LLM_ARCH_OLMO:
  1214. {
  1215. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  1216. ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV, hparams.f_clamp_kqv, false);
  1217. switch (hparams.n_layer) {
  1218. case 22: type = LLM_TYPE_1B; break;
  1219. case 32: type = LLM_TYPE_7B; break;
  1220. case 80: type = LLM_TYPE_70B; break;
  1221. default: type = LLM_TYPE_UNKNOWN;
  1222. }
  1223. } break;
  1224. case LLM_ARCH_OLMO2:
  1225. {
  1226. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1227. const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
  1228. if (found_swa && hparams.n_swa > 0) {
  1229. hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
  1230. hparams.set_swa_pattern(4);
  1231. } else {
  1232. hparams.swa_type = LLAMA_SWA_TYPE_NONE;
  1233. }
  1234. switch (hparams.n_layer) {
  1235. case 16: type = LLM_TYPE_1B; break;
  1236. case 32: type = LLM_TYPE_7B; break;
  1237. case 40: type = LLM_TYPE_13B; break;
  1238. case 64: type = LLM_TYPE_32B; break;
  1239. default: type = LLM_TYPE_UNKNOWN;
  1240. }
  1241. } break;
  1242. case LLM_ARCH_SEED_OSS:
  1243. {
  1244. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1245. switch (hparams.n_layer) {
  1246. case 64: type = LLM_TYPE_36B; break;
  1247. default: type = LLM_TYPE_UNKNOWN;
  1248. }
  1249. } break;
  1250. case LLM_ARCH_OLMOE:
  1251. {
  1252. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1253. switch (hparams.n_layer) {
  1254. case 16: type = LLM_TYPE_A1_7B; break;
  1255. default: type = LLM_TYPE_UNKNOWN;
  1256. }
  1257. } break;
  1258. case LLM_ARCH_OPENELM:
  1259. {
  1260. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1261. switch (hparams.n_layer) {
  1262. case 16: type = LLM_TYPE_270M; break;
  1263. case 20: type = LLM_TYPE_450M; break;
  1264. case 28: type = LLM_TYPE_1B; break;
  1265. case 36: type = LLM_TYPE_3B; break;
  1266. default: type = LLM_TYPE_UNKNOWN;
  1267. }
  1268. } break;
  1269. case LLM_ARCH_GPTNEOX:
  1270. {
  1271. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  1272. ml.get_key(LLM_KV_USE_PARALLEL_RESIDUAL, hparams.use_par_res);
  1273. switch (hparams.n_layer) {
  1274. case 6:
  1275. switch (hparams.n_ff()) {
  1276. case 512: type = LLM_TYPE_14M; break;
  1277. case 2048: type = LLM_TYPE_70M; break;
  1278. default: type = LLM_TYPE_UNKNOWN;
  1279. } break;
  1280. case 12:
  1281. switch (hparams.n_ff()) {
  1282. case 3072: type = LLM_TYPE_160M; break;
  1283. default: type = LLM_TYPE_UNKNOWN;
  1284. } break;
  1285. case 16:
  1286. switch (hparams.n_ff()) {
  1287. case 8192: type = LLM_TYPE_1B; break;
  1288. default: type = LLM_TYPE_UNKNOWN;
  1289. } break;
  1290. case 24:
  1291. switch (hparams.n_ff()) {
  1292. case 4096: type = LLM_TYPE_410M; break;
  1293. case 8192: type = LLM_TYPE_1_4B; break;
  1294. default: type = LLM_TYPE_UNKNOWN;
  1295. } break;
  1296. case 32:
  1297. switch (hparams.n_ff()) {
  1298. case 10240: type = LLM_TYPE_2_8B; break;
  1299. case 16384: type = LLM_TYPE_6_9B; break;
  1300. default: type = LLM_TYPE_UNKNOWN;
  1301. } break;
  1302. case 36:
  1303. switch (hparams.n_ff()) {
  1304. case 20480: type = LLM_TYPE_12B; break;
  1305. default: type = LLM_TYPE_UNKNOWN;
  1306. } break;
  1307. case 44:
  1308. switch (hparams.n_ff()) {
  1309. case 24576: type = LLM_TYPE_20B; break;
  1310. default: type = LLM_TYPE_UNKNOWN;
  1311. } break;
  1312. default: type = LLM_TYPE_UNKNOWN;
  1313. }
  1314. } break;
  1315. case LLM_ARCH_ARCTIC:
  1316. {
  1317. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1318. if (hparams.n_expert == 128) {
  1319. switch (hparams.n_layer) {
  1320. case 35: type = LLM_TYPE_10B_128x3_66B; break;
  1321. default: type = LLM_TYPE_UNKNOWN;
  1322. }
  1323. } else {
  1324. type = LLM_TYPE_UNKNOWN;
  1325. }
  1326. } break;
  1327. case LLM_ARCH_DEEPSEEK:
  1328. {
  1329. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1330. ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
  1331. ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
  1332. ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
  1333. ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
  1334. switch (hparams.n_layer) {
  1335. case 28: type = LLM_TYPE_20B; break;
  1336. default: type = LLM_TYPE_UNKNOWN;
  1337. }
  1338. } break;
  1339. case LLM_ARCH_DEEPSEEK2:
  1340. {
  1341. bool is_lite = (hparams.n_layer == 27);
  1342. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1343. ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
  1344. if (!is_lite) {
  1345. ml.get_key(LLM_KV_ATTENTION_Q_LORA_RANK, hparams.n_lora_q);
  1346. }
  1347. ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK, hparams.n_lora_kv);
  1348. ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH_MLA, hparams.n_embd_head_k_mla, false);
  1349. ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH_MLA, hparams.n_embd_head_v_mla, false);
  1350. ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
  1351. ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
  1352. ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
  1353. ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
  1354. ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false);
  1355. if (hparams.expert_gating_func == LLAMA_EXPERT_GATING_FUNC_TYPE_NONE) {
  1356. // for compatibility with existing DeepSeek V2 and V2.5 GGUFs
  1357. // that have no expert_gating_func model parameter set
  1358. hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX;
  1359. }
  1360. ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul, false);
  1361. switch (hparams.n_layer) {
  1362. case 27: type = LLM_TYPE_16B; break;
  1363. case 60: type = LLM_TYPE_236B; break;
  1364. case 61: type = LLM_TYPE_671B; break;
  1365. default: type = LLM_TYPE_UNKNOWN;
  1366. }
  1367. } break;
  1368. case LLM_ARCH_PLM:
  1369. {
  1370. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1371. ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK, hparams.n_lora_kv);
  1372. switch (hparams.n_layer) {
  1373. case 32: type = LLM_TYPE_1_8B; break;
  1374. default: type = LLM_TYPE_UNKNOWN;
  1375. }
  1376. } break;
  1377. case LLM_ARCH_CHATGLM:
  1378. {
  1379. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1380. switch (hparams.n_layer) {
  1381. case 28: {
  1382. if (hparams.n_head(0) == 16) {
  1383. type = LLM_TYPE_1_5B;
  1384. } else {
  1385. type = LLM_TYPE_6B;
  1386. }
  1387. } break;
  1388. case 40: {
  1389. if (hparams.n_head(0) == 24) {
  1390. type = LLM_TYPE_4B;
  1391. } else {
  1392. type = LLM_TYPE_9B;
  1393. }
  1394. } break;
  1395. default: type = LLM_TYPE_UNKNOWN;
  1396. }
  1397. } break;
  1398. case LLM_ARCH_GLM4:
  1399. {
  1400. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1401. switch (hparams.n_layer) {
  1402. case 40: type = LLM_TYPE_9B; break;
  1403. case 61: type = LLM_TYPE_32B; break;
  1404. default: type = LLM_TYPE_UNKNOWN;
  1405. }
  1406. } break;
  1407. case LLM_ARCH_GLM4_MOE:
  1408. {
  1409. ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
  1410. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1411. // MoE parameters
  1412. ml.get_key(LLM_KV_EXPERT_COUNT, hparams.n_expert);
  1413. ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used);
  1414. ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
  1415. ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead, false);
  1416. ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
  1417. ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
  1418. // Expert gating function (GLM-4.5 uses sigmoid)
  1419. ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false);
  1420. if (hparams.expert_gating_func == LLAMA_EXPERT_GATING_FUNC_TYPE_NONE) {
  1421. hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID;
  1422. }
  1423. // NextN/MTP parameters
  1424. ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.nextn_predict_layers, false);
1425. // TODO: update this if needed once MTP is implemented
  1426. hparams.n_layer_kv_from_start = hparams.n_layer - hparams.nextn_predict_layers;
  1427. switch (hparams.n_layer) {
  1428. case 47: type = LLM_TYPE_106B_A12B; break; // GLM-4.5-Air (46 layers + 1 NextN layer)
  1429. case 93: type = LLM_TYPE_355B_A32B; break; // GLM-4.5 (92 layers + 1 NextN layer)
  1430. default: type = LLM_TYPE_UNKNOWN;
  1431. }
  1432. } break;
  1433. case LLM_ARCH_BITNET:
  1434. {
  1435. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1436. switch (hparams.n_layer) {
  1437. case 26: type = LLM_TYPE_3B; break;
  1438. default: type = LLM_TYPE_UNKNOWN;
  1439. }
  1440. } break;
  1441. case LLM_ARCH_T5:
  1442. {
  1443. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1444. ml.get_key(LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, hparams.n_rel_attn_bkts);
  1445. uint32_t dec_start_token_id;
  1446. if (ml.get_key(LLM_KV_DECODER_START_TOKEN_ID, dec_start_token_id, false)) {
  1447. hparams.dec_start_token_id = dec_start_token_id;
  1448. }
  1449. hparams.dec_n_layer = hparams.n_layer;
  1450. ml.get_key(LLM_KV_DECODER_BLOCK_COUNT, hparams.dec_n_layer, false);
  1451. switch (hparams.n_layer) {
  1452. case 6: type = LLM_TYPE_60M; break; // t5-small
  1453. case 8: type = LLM_TYPE_80M; break; // flan-t5-small
  1454. case 12:
  1455. switch (hparams.n_ff()) {
  1456. case 3072: type = LLM_TYPE_220M; break; // t5-base
  1457. case 2048: type = LLM_TYPE_250M; break; // flan-t5-base
  1458. default: type = LLM_TYPE_UNKNOWN;
  1459. } break;
  1460. case 24:
  1461. switch (hparams.n_ff()) {
  1462. case 4096: type = LLM_TYPE_770M; break; // t5-large
  1463. case 2816: type = LLM_TYPE_780M; break; // flan-t5-large
  1464. case 16384: type = LLM_TYPE_3B; break; // t5-3b
  1465. case 5120: type = LLM_TYPE_3B; break; // flan-t5-xl
  1466. case 65536: type = LLM_TYPE_11B; break; // t5-11b
  1467. case 10240: type = LLM_TYPE_11B; break; // flan-t5-xxl
  1468. default: type = LLM_TYPE_UNKNOWN;
  1469. } break;
  1470. default: type = LLM_TYPE_UNKNOWN;
  1471. }
  1472. } break;
  1473. case LLM_ARCH_T5ENCODER:
  1474. {
  1475. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1476. ml.get_key(LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, hparams.n_rel_attn_bkts);
  1477. type = LLM_TYPE_UNKNOWN;
  1478. } break;
  1479. case LLM_ARCH_JAIS:
  1480. {
  1481. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  1482. ml.get_key(LLM_KV_ATTENTION_MAX_ALIBI_BIAS, hparams.f_max_alibi_bias);
  1483. switch (hparams.n_layer) {
  1484. case 24: type = LLM_TYPE_1_3B; break;
  1485. case 40: type = LLM_TYPE_13B; break;
  1486. /* TODO: add variants */
  1487. default: type = LLM_TYPE_UNKNOWN;
  1488. }
  1489. } break;
  1490. case LLM_ARCH_NEMOTRON:
  1491. {
  1492. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  1493. switch (hparams.n_layer) {
  1494. case 32: type = LLM_TYPE_4B; break;
  1495. default: type = LLM_TYPE_UNKNOWN;
  1496. }
  1497. } break;
  1498. case LLM_ARCH_NEMOTRON_H:
  1499. {
  1500. ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv);
  1501. ml.get_key(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner);
  1502. ml.get_key(LLM_KV_SSM_STATE_SIZE, hparams.ssm_d_state);
  1503. ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
  1504. ml.get_key(LLM_KV_SSM_GROUP_COUNT, hparams.ssm_n_group);
  1505. // A layer is recurrent IFF the n_head_kv value is set to 0 and
  1506. // the n_ff value is set to 0
  1507. for (uint32_t i = 0; i < hparams.n_layer; ++i) {
  1508. hparams.recurrent_layer_arr[i] = (hparams.n_head_kv(i) == 0 && hparams.n_ff(i) == 0);
  1509. }
  1510. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1511. switch (hparams.n_layer) {
  1512. case 56: type = LLM_TYPE_9B; break;
  1513. default: type = LLM_TYPE_UNKNOWN;
  1514. }
  1515. } break;
  1516. case LLM_ARCH_EXAONE:
  1517. {
  1518. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1519. switch (hparams.n_layer) {
  1520. case 32: type = LLM_TYPE_8B; break;
  1521. default: type = LLM_TYPE_UNKNOWN;
  1522. }
  1523. } break;
  1524. case LLM_ARCH_EXAONE4:
  1525. {
  1526. if (hparams.n_layer == 64) { // 32B
  1527. hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
  1528. hparams.n_swa = 4096;
  1529. hparams.set_swa_pattern(4);
  1530. }
  1531. ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
  1532. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1533. switch (hparams.n_layer) {
  1534. case 30: type = LLM_TYPE_1_2B; break;
  1535. case 64: type = LLM_TYPE_32B; break;
  1536. default: type = LLM_TYPE_UNKNOWN;
  1537. }
  1538. } break;
  1539. case LLM_ARCH_RWKV6:
  1540. case LLM_ARCH_RWKV6QWEN2:
  1541. {
  1542. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps, false);
  1543. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps, false);
  1544. ml.get_key(LLM_KV_WKV_HEAD_SIZE, hparams.wkv_head_size);
  1545. ml.get_key(LLM_KV_TIME_MIX_EXTRA_DIM, hparams.time_mix_extra_dim);
  1546. ml.get_key(LLM_KV_TIME_DECAY_EXTRA_DIM, hparams.time_decay_extra_dim);
  1547. ml.get_key(LLM_KV_RESCALE_EVERY_N_LAYERS, hparams.rescale_every_n_layers, false);
  1548. ml.get_key(LLM_KV_TOKEN_SHIFT_COUNT, hparams.token_shift_count, false);
  1549. switch (hparams.n_layer) {
  1550. case 24: type = LLM_TYPE_1_6B; break;
  1551. case 32:
  1552. switch (hparams.n_embd) {
  1553. case 2560: type = LLM_TYPE_3B; break;
  1554. case 4096: type = LLM_TYPE_7B; break;
  1555. default: type = LLM_TYPE_UNKNOWN;
  1556. } break;
  1557. case 61: type = LLM_TYPE_14B; break;
  1558. case 64: type = LLM_TYPE_32B; break;
  1559. default: type = LLM_TYPE_UNKNOWN;
  1560. }
  1561. } break;
  1562. case LLM_ARCH_RWKV7:
  1563. case LLM_ARCH_ARWKV7:
  1564. {
  1565. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps, false);
  1566. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps, false);
  1567. ml.get_key(LLM_KV_WKV_HEAD_SIZE, hparams.wkv_head_size);
  1568. ml.get_key(LLM_KV_ATTENTION_DECAY_LORA_RANK, hparams.n_lora_decay);
  1569. ml.get_key(LLM_KV_ATTENTION_ICLR_LORA_RANK, hparams.n_lora_iclr);
  1570. ml.get_key(LLM_KV_ATTENTION_VALUE_RESIDUAL_MIX_LORA_RANK, hparams.n_lora_value_res_mix);
  1571. ml.get_key(LLM_KV_ATTENTION_GATE_LORA_RANK, hparams.n_lora_gate, false);
  1572. ml.get_key(LLM_KV_TOKEN_SHIFT_COUNT, hparams.token_shift_count, false);
  1573. switch (hparams.n_layer) {
  1574. case 12:
  1575. switch (hparams.n_embd) {
  1576. case 768: type = LLM_TYPE_190M; break;
  1577. default: type = LLM_TYPE_UNKNOWN;
  1578. } break;
  1579. case 24:
  1580. switch (hparams.n_embd) {
  1581. case 1024: type = LLM_TYPE_450M; break;
  1582. case 2048: type = LLM_TYPE_1_5B; break;
  1583. default: type = LLM_TYPE_UNKNOWN;
  1584. } break;
  1585. case 28:
  1586. switch (hparams.n_embd) {
  1587. case 1536: type = LLM_TYPE_1_5B; break;
  1588. case 3584: type = LLM_TYPE_7B; break;
  1589. default: type = LLM_TYPE_UNKNOWN;
  1590. } break;
  1591. case 32:
  1592. switch (hparams.n_embd) {
  1593. case 2560: type = LLM_TYPE_2_9B; break;
  1594. case 4096: type = LLM_TYPE_7B; break;
  1595. default: type = LLM_TYPE_UNKNOWN;
  1596. } break;
  1597. case 61:
  1598. switch (hparams.n_embd) {
  1599. case 4096: type = LLM_TYPE_14B; break;
  1600. default: type = LLM_TYPE_UNKNOWN;
  1601. } break;
  1602. default: type = LLM_TYPE_UNKNOWN;
  1603. }
  1604. } break;
  1605. case LLM_ARCH_GRANITE:
  1606. case LLM_ARCH_GRANITE_MOE:
  1607. {
  1608. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1609. ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
  1610. ml.get_key(LLM_KV_RESIDUAL_SCALE, hparams.f_residual_scale);
  1611. ml.get_key(LLM_KV_EMBEDDING_SCALE, hparams.f_embedding_scale);
  1612. ml.get_key(LLM_KV_ATTENTION_SCALE, hparams.f_attention_scale);
  1613. // Granite uses rope_finetuned as a switch for rope, so default to true
  1614. bool rope_finetuned = true;
  1615. ml.get_key(LLM_KV_ROPE_SCALING_FINETUNED, rope_finetuned, false);
  1616. hparams.rope_finetuned = rope_finetuned;
  1617. switch (hparams.n_layer) {
  1618. case 32: type = LLM_TYPE_3B; break;
  1619. case 40: type = LLM_TYPE_3B; break;
  1620. // Add additional layer/vocab/etc checks here for other model sizes
  1621. default: type = LLM_TYPE_UNKNOWN;
  1622. }
  1623. // For Granite MoE Shared
  1624. ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, /* required */ false);
  1625. } break;
  1626. case LLM_ARCH_GRANITE_HYBRID:
  1627. {
  1628. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1629. ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale, /* required */ false);
  1630. ml.get_key(LLM_KV_RESIDUAL_SCALE, hparams.f_residual_scale, /* required */ false);
  1631. ml.get_key(LLM_KV_EMBEDDING_SCALE, hparams.f_embedding_scale, /* required */ false);
  1632. ml.get_key(LLM_KV_ATTENTION_SCALE, hparams.f_attention_scale, /* required */ false);
  1633. ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv);
  1634. ml.get_key(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner);
  1635. ml.get_key(LLM_KV_SSM_STATE_SIZE, hparams.ssm_d_state);
  1636. ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
  1637. ml.get_key(LLM_KV_SSM_GROUP_COUNT, hparams.ssm_n_group);
  1638. // Granite uses rope_finetuned as a switch for rope, so default to true
  1639. bool rope_finetuned = true;
  1640. ml.get_key(LLM_KV_ROPE_SCALING_FINETUNED, rope_finetuned, false);
  1641. hparams.rope_finetuned = rope_finetuned;
  1642. // A layer is recurrent IFF the n_head_kv value is set to 0
  1643. for (uint32_t i = 0; i < hparams.n_layer; ++i) {
  1644. hparams.recurrent_layer_arr[i] = hparams.n_head_kv(i) == 0;
  1645. }
  1646. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1647. switch (hparams.n_layer) {
  1648. // TODO: Add llm type label (not sure this is useful)
  1649. default: type = LLM_TYPE_UNKNOWN;
  1650. }
  1651. // For Granite MoE Shared
  1652. ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, /* required */ false);
  1653. } break;
  1654. case LLM_ARCH_QWEN3NEXT:
  1655. {
  1656. ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
  1657. ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false);
  1658. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1659. // Load linear attention (gated delta net) parameters
  1660. ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv);
  1661. ml.get_key(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner);
  1662. ml.get_key(LLM_KV_SSM_STATE_SIZE, hparams.ssm_d_state);
  1663. ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
  1664. ml.get_key(LLM_KV_SSM_GROUP_COUNT, hparams.ssm_n_group);
  1665. // Mark recurrent layers (linear attention layers)
  1666. for (uint32_t i = 0; i < hparams.n_layer; ++i) {
  1667. hparams.recurrent_layer_arr[i] = ((i + 1) % 4 != 0); // TODO: extract the magic 4 from "full_attention_interval"
  1668. }
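// with the interval of 4 above, layers 3, 7, 11, ... (0-based) use full attention and all other
// layers use the recurrent gated delta net path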
  1669. switch (hparams.n_layer) {
  1670. case 80: type = LLM_TYPE_80B_A3B; break;
  1671. default: type = LLM_TYPE_UNKNOWN;
  1672. }
  1673. } break;
  1674. case LLM_ARCH_CHAMELEON:
  1675. {
  1676. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1677. hparams.f_norm_eps = 1e-5; // eps for qk-norm, torch default
  1678. ml.get_key(LLM_KV_SWIN_NORM, hparams.swin_norm);
  1679. switch (hparams.n_layer) {
  1680. case 32: type = LLM_TYPE_7B; break;
  1681. case 48: type = LLM_TYPE_34B; break;
  1682. default: type = LLM_TYPE_UNKNOWN;
  1683. }
  1684. } break;
  1685. case LLM_ARCH_WAVTOKENIZER_DEC:
  1686. {
  1687. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  1688. ml.get_key(LLM_KV_ATTENTION_GROUPNORM_EPS, hparams.f_norm_group_eps);
  1689. ml.get_key(LLM_KV_ATTENTION_GROUPNORM_GROUPS, hparams.n_norm_groups);
  1690. ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
  1691. } break;
  1692. case LLM_ARCH_BAILINGMOE:
  1693. {
  1694. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1695. ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
  1696. ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
  1697. ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
  1698. ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
  1699. ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
  1700. switch (hparams.n_layer) {
  1701. case 28: type = LLM_TYPE_16B; break;
  1702. case 88: type = LLM_TYPE_290B; break;
  1703. default: type = LLM_TYPE_UNKNOWN;
  1704. }
  1705. } break;
  1706. case LLM_ARCH_DOTS1:
  1707. {
  1708. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1709. ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
  1710. ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
  1711. ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
  1712. ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
  1713. ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
  1714. ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false);
  1715. switch (hparams.n_layer) {
  1716. case 62: type = LLM_TYPE_142B; break;
  1717. default: type = LLM_TYPE_UNKNOWN;
  1718. }
  1719. } break;
  1720. case LLM_ARCH_ERNIE4_5:
  1721. case LLM_ARCH_ERNIE4_5_MOE:
  1722. {
  1723. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1724. if (arch == LLM_ARCH_ERNIE4_5_MOE) {
  1725. ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
  1726. ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false);
  1727. ml.get_key(LLM_KV_INTERLEAVE_MOE_LAYER_STEP, hparams.n_moe_layer_step);
  1728. ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
  1729. }
  1730. switch (hparams.n_layer) {
  1731. case 18: type = LLM_TYPE_0_3B; break;
  1732. case 28: type = LLM_TYPE_21B_A3B; break;
  1733. case 54: type = LLM_TYPE_300B_A47B; break;
  1734. default: type = LLM_TYPE_UNKNOWN;
  1735. }
  1736. } break;
  1737. case LLM_ARCH_FALCON_H1:
  1738. {
  1739. // Common parameters
  1740. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1741. // SSM parameters
  1742. ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv);
  1743. ml.get_key(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner);
  1744. ml.get_key(LLM_KV_SSM_STATE_SIZE, hparams.ssm_d_state);
  1745. ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
  1746. ml.get_key(LLM_KV_SSM_GROUP_COUNT, hparams.ssm_n_group);
  1747. std::fill(hparams.recurrent_layer_arr.begin(), hparams.recurrent_layer_arr.end(), true);
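// all layers are marked recurrent here: Falcon-H1 is a hybrid design where each block appears to
// carry an SSM branch alongside attention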
  1748. switch (hparams.n_layer) {
  1749. case 36:
  1750. type = LLM_TYPE_0_5B; break;
  1751. case 24:
  1752. type = LLM_TYPE_1_5B; break;
  1753. case 66:
  1754. type = LLM_TYPE_1B; break;
  1755. case 32:
  1756. type = LLM_TYPE_3B; break;
  1757. case 44:
  1758. type = LLM_TYPE_7B; break;
  1759. case 72:
  1760. type = LLM_TYPE_34B; break;
  1761. default:
  1762. type = LLM_TYPE_UNKNOWN;
  1763. }
  1764. } break;
  1765. case LLM_ARCH_HUNYUAN_MOE:
  1766. {
  1767. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1768. ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
  1769. ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp);
  1770. switch (hparams.n_layer) {
  1771. case 32: type = LLM_TYPE_A13B; break;
  1772. default: type = LLM_TYPE_UNKNOWN;
  1773. }
  1774. } break;
  1775. case LLM_ARCH_HUNYUAN_DENSE:
  1776. {
  1777. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1778. switch (hparams.n_embd) {
  1779. case 1024: type = LLM_TYPE_0_5B; break;
  1780. case 2048: type = LLM_TYPE_1_8B; break;
  1781. case 3072: type = LLM_TYPE_4B; break;
  1782. case 4096: type = LLM_TYPE_7B; break;
  1783. default: type = LLM_TYPE_UNKNOWN;
  1784. }
  1785. } break;
  1786. case LLM_ARCH_SMOLLM3:
  1787. {
  1788. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1789. hparams.n_no_rope_layer_step = 4;
  1790. switch (hparams.n_layer) {
  1791. case 36: type = LLM_TYPE_3B; break;
  1792. default: type = LLM_TYPE_UNKNOWN;
  1793. }
  1794. } break;
  1795. case LLM_ARCH_OPENAI_MOE:
  1796. {
  1797. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1798. ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
  1799. ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
  1800. hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
  1801. hparams.set_swa_pattern(2);
  1802. switch (hparams.n_layer) {
  1803. case 24: type = LLM_TYPE_20B; break;
  1804. case 36: type = LLM_TYPE_120B; break;
  1805. default: type = LLM_TYPE_UNKNOWN;
  1806. }
  1807. } break;
  1808. case LLM_ARCH_LFM2:
  1809. {
  1810. ml.get_key(LLM_KV_SHORTCONV_L_CACHE, hparams.n_shortconv_l_cache);
  1811. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1812. for (uint32_t il = 0; il < hparams.n_layer; ++il) {
  1813. hparams.recurrent_layer_arr[il] = hparams.n_head_kv(il) == 0;
  1814. }
  1815. switch (hparams.n_embd) {
  1816. case 1024: type = LLM_TYPE_350M; break;
  1817. case 1536: type = LLM_TYPE_700M; break;
  1818. case 2048: type = LLM_TYPE_1_2B; break;
  1819. default: type = LLM_TYPE_UNKNOWN;
  1820. }
  1821. } break;
  1822. case LLM_ARCH_SMALLTHINKER:
  1823. {
  1824. const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
  1825. if (found_swa && hparams.n_swa > 0) {
  1826. hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
  1827. hparams.n_swa = 4096;
  1828. hparams.set_swa_pattern(4, true);
  1829. } else {
  1830. hparams.swa_type = LLAMA_SWA_TYPE_NONE;
  1831. hparams.n_no_rope_layer_step = hparams.n_layer;
  1832. }
  1833. ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
  1834. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1835. ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false);
  1836. switch (hparams.n_layer) {
  1837. case 32: type = LLM_TYPE_4B; break;
  1838. case 52: type = LLM_TYPE_20B; break;
  1839. default: type = LLM_TYPE_UNKNOWN;
  1840. }
  1841. } break;
  1842. default: throw std::runtime_error("unsupported model architecture");
  1843. }
  1844. pimpl->n_bytes = ml.n_bytes;
  1845. pimpl->desc_str = arch_name() + " " + type_name() + " " + ml.ftype_name();
  1846. if (hparams.f_max_alibi_bias > 0.0f) {
  1847. hparams.use_alibi = true;
  1848. }
  1849. hparams.rope_type = llama_model_rope_type(this);
  1850. }
  1851. void llama_model::load_vocab(llama_model_loader & ml) {
  1852. const auto kv = LLM_KV(arch);
  1853. vocab.load(ml, kv);
  1854. }
  1855. bool llama_model::load_tensors(llama_model_loader & ml) {
  1856. const auto & split_mode = params.split_mode;
  1857. const auto & n_gpu_layers = params.n_gpu_layers;
  1858. const auto & use_mlock = params.use_mlock;
  1859. const auto & tensor_split = params.tensor_split;
  1860. const int n_layer = hparams.n_layer;
  1861. const bool use_mmap_buffer = true;
  1862. LLAMA_LOG_INFO("%s: loading model tensors, this can take a while... (mmap = %s)\n", __func__, ml.use_mmap ? "true" : "false");
  1863. // build a list of buffer types for the CPU and GPU devices
  1864. pimpl->cpu_buft_list = make_cpu_buft_list(devices, params.use_extra_bufts);
  1865. for (auto * dev : devices) {
  1866. buft_list_t buft_list = make_gpu_buft_list(dev, split_mode, tensor_split);
  1867. // add CPU buffer types as a fallback
  1868. buft_list.insert(buft_list.end(), pimpl->cpu_buft_list.begin(), pimpl->cpu_buft_list.end());
  1869. pimpl->gpu_buft_list.emplace(dev, std::move(buft_list));
  1870. }
  1871. // calculate the split points
  1872. bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + n_devices(), [](float x) { return x == 0.0f; });
  1873. std::vector<float> splits(n_devices());
  1874. if (all_zero) {
  1875. // default split, by free memory
  1876. for (size_t i = 0; i < n_devices(); ++i) {
  1877. ggml_backend_dev_t dev = devices[i];
  1878. size_t total;
  1879. size_t free;
  1880. ggml_backend_dev_memory(dev, &free, &total);
  1881. splits[i] = free;
  1882. }
  1883. } else {
  1884. std::copy(tensor_split, tensor_split + n_devices(), splits.begin());
  1885. }
  1886. // sum and normalize the splits to get the split points
  1887. float split_sum = 0.0f;
  1888. for (size_t i = 0; i < n_devices(); ++i) {
  1889. split_sum += splits[i];
  1890. splits[i] = split_sum;
  1891. }
  1892. for (size_t i = 0; i < n_devices(); ++i) {
  1893. splits[i] /= split_sum;
  1894. }
  1895. ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
  1896. if (cpu_dev == nullptr) {
  1897. throw std::runtime_error(format("%s: no CPU backend found", __func__));
  1898. }
  1899. const int i_gpu_start = std::max((int) hparams.n_layer - n_gpu_layers, (int) 0);
  1900. const int act_gpu_layers = devices.empty() ? 0 : std::min(n_gpu_layers, (int)n_layer + 1);
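// note: layers before i_gpu_start (and anything beyond the offload window) stay on the CPU; for the
// offloaded layers, the fractional position (il - i_gpu_start)/act_gpu_layers is looked up in the
// normalized split points with std::upper_bound, which picks the first device whose split point
// exceeds that fraction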
  1901. auto get_layer_buft_list = [&](int il) -> llama_model::impl::layer_dev {
  1902. const bool is_swa = il < (int) hparams.n_layer && hparams.is_swa(il);
  1903. if (il < i_gpu_start || (il - i_gpu_start) >= act_gpu_layers) {
  1904. LLAMA_LOG_DEBUG("load_tensors: layer %3d assigned to device %s, is_swa = %d\n", il, ggml_backend_dev_name(cpu_dev), is_swa);
  1905. return {cpu_dev, &pimpl->cpu_buft_list};
  1906. }
  1907. const int layer_gpu = std::upper_bound(splits.begin(), splits.begin() + n_devices(), float(il - i_gpu_start)/act_gpu_layers) - splits.begin();
  1908. auto * dev = devices.at(layer_gpu);
  1909. LLAMA_LOG_DEBUG("load_tensors: layer %3d assigned to device %s, is_swa = %d\n", il, ggml_backend_dev_name(dev), is_swa);
  1910. return {dev, &pimpl->gpu_buft_list.at(dev)};
  1911. };
  1912. // assign the input layer
  1913. // there is very little benefit to offloading the input layer, so always keep it on the CPU
  1914. pimpl->dev_input = { cpu_dev, &pimpl->cpu_buft_list };
  1915. // assign the repeating layers to the devices according to the splits
  1916. pimpl->dev_layer.resize(n_layer);
  1917. for (int il = 0; il < n_layer; ++il) {
  1918. pimpl->dev_layer[il] = get_layer_buft_list(il);
  1919. }
  1920. // assign the output layer
  1921. pimpl->dev_output = get_layer_buft_list(n_layer);
  1922. // one ggml context per buffer type
  1923. int max_n_tensors = ml.n_tensors;
  1924. max_n_tensors += 1; // duplicated output tensor
  1925. max_n_tensors += n_layer*2; // duplicated rope freq tensors
  1926. const size_t ctx_size = ggml_tensor_overhead()*max_n_tensors;
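// the contexts created below use no_alloc = true, so they only hold tensor metadata; ctx_size
// therefore only needs to cover the per-tensor bookkeeping overhead, not the weight data itself,
// which is allocated in backend buffers (or mapped from disk) later during loading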
  1927. std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
  1928. auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * {
  1929. auto it = ctx_map.find(buft);
  1930. if (it == ctx_map.end()) {
  1931. ggml_init_params params = {
  1932. /*.mem_size =*/ ctx_size,
  1933. /*.mem_buffer =*/ NULL,
  1934. /*.no_alloc =*/ true,
  1935. };
  1936. ggml_context * ctx = ggml_init(params);
  1937. if (!ctx) {
  1938. throw std::runtime_error(format("failed to create ggml context"));
  1939. }
  1940. ctx_map[buft] = ctx;
  1941. pimpl->ctxs.emplace_back(ctx);
  1942. return ctx;
  1943. }
  1944. return it->second;
  1945. };
  1946. const auto TENSOR_DUPLICATED = llama_model_loader::TENSOR_DUPLICATED;
  1947. const auto TENSOR_NOT_REQUIRED = llama_model_loader::TENSOR_NOT_REQUIRED;
  1948. const auto TENSOR_SKIP = llama_model_loader::TENSOR_SKIP;
  1949. // create tensors for the weights
  1950. {
  1951. // note: cast to int64_t since we will use these for the tensor dimensions
  1952. const int64_t n_head = hparams.n_head();
  1953. const int64_t n_head_kv = hparams.n_head_kv();
  1954. const int64_t n_embd = hparams.n_embd;
  1955. const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
  1956. const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();
  1957. const int64_t n_embd_head_k = hparams.n_embd_head_k;
  1958. const int64_t n_embd_head_v = hparams.n_embd_head_v;
  1959. const int64_t n_ff = hparams.n_ff();
  1960. const int64_t n_embd_gqa = n_embd_v_gqa;
  1961. const int64_t n_vocab = vocab.n_tokens();
  1962. const int64_t n_token_types = vocab.n_token_types();
  1963. const int64_t n_rot = hparams.n_rot;
  1964. const int64_t n_expert = hparams.n_expert;
  1965. const int64_t n_expert_used = hparams.n_expert_used;
  1966. const int64_t n_ctx_train = hparams.n_ctx_train;
  1967. if (n_expert > 0 && hparams.n_expert_used == 0) {
  1968. throw std::runtime_error("model has expert layers but no expert layers are used");
  1969. }
  1970. int n_moved_tensors = 0;
  1971. ggml_tensor * first_moved_tensor = nullptr;
  1972. ggml_backend_buffer_type_t first_moved_from_buft = nullptr;
  1973. ggml_backend_buffer_type_t first_moved_to_buft = nullptr;
  1974. auto create_tensor = [&](const LLM_TN_IMPL & tn, const std::initializer_list<int64_t> & ne, int flags) -> ggml_tensor * {
  1975. ggml_tensor * t_meta = ml.get_tensor_meta(tn.str().c_str());
  1976. if (!t_meta) {
  1977. if (flags & TENSOR_NOT_REQUIRED) {
  1978. return nullptr;
  1979. }
  1980. throw std::runtime_error(format("missing tensor '%s'", tn.str().c_str()));
  1981. }
// some models reuse the token embedding tensor as the output tensor, but since the two are used in
// different layers and with different ops, the tensor has to be duplicated
// to handle this, if the tensor is flagged as duplicated, assume it is being loaded as the output tensor
  1985. llm_tensor tn_tensor = tn.tensor;
  1986. if (tn.tensor == LLM_TENSOR_TOKEN_EMBD && flags & TENSOR_DUPLICATED) {
  1987. tn_tensor = LLM_TENSOR_OUTPUT;
  1988. }
  1989. llm_tensor_info info;
  1990. try {
  1991. info = llm_tensor_info_for(tn_tensor);
  1992. } catch (const std::out_of_range & e) {
  1993. throw std::runtime_error(format("missing tensor info mapping for %s", tn.str().c_str()));
  1994. }
  1995. // skip unused tensors
  1996. if (info.op == GGML_OP_NONE || flags & TENSOR_SKIP) {
  1997. const size_t nbytes = ggml_nbytes(t_meta);
  1998. LLAMA_LOG_WARN("model has unused tensor %s (size = %zu bytes) -- ignoring\n", tn.str().c_str(), nbytes);
  1999. ml.size_data -= nbytes;
  2000. ml.n_created++;
  2001. return nullptr;
  2002. }
  2003. // tensors with "bias" suffix are always used with GGML_OP_ADD or GGML_OP_ADD_ID
  2004. ggml_op op;
  2005. bool bias = tn.suffix != nullptr && strcmp(tn.suffix, "bias") == 0;
  2006. if (bias) {
  2007. if (info.op == GGML_OP_MUL_MAT_ID) {
  2008. op = GGML_OP_ADD_ID;
  2009. } else {
  2010. op = GGML_OP_ADD;
  2011. }
  2012. } else {
  2013. op = info.op;
  2014. }
  2015. // sanity checks
  2016. if (info.layer == LLM_TENSOR_LAYER_INPUT || info.layer == LLM_TENSOR_LAYER_OUTPUT) {
  2017. if (tn.bid != -1) {
  2018. GGML_ABORT("input/output layer tensor %s used with a layer number", tn.str().c_str());
  2019. }
  2020. } else {
  2021. if (tn.bid == -1) {
  2022. GGML_ABORT("repeating layer tensor %s used without a layer number", tn.str().c_str());
  2023. }
  2024. }
  2025. // select the buffer type for this tensor
  2026. buft_list_t * buft_list;
  2027. switch (info.layer) {
  2028. case LLM_TENSOR_LAYER_INPUT:
  2029. buft_list = pimpl->dev_input.buft_list;
  2030. break;
  2031. case LLM_TENSOR_LAYER_OUTPUT:
  2032. buft_list = pimpl->dev_output.buft_list;
  2033. break;
  2034. case LLM_TENSOR_LAYER_REPEATING:
  2035. buft_list = pimpl->dev_layer.at(tn.bid).buft_list;
  2036. break;
  2037. default:
  2038. GGML_ABORT("invalid layer %d for tensor %s", info.layer, tn.str().c_str());
  2039. }
  2040. ggml_backend_buffer_type_t buft = nullptr;
  2041. // check overrides
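// each override entry pairs a regex pattern with a target buffer type; patterns are checked in order
// against the tensor name (std::regex_search, so a partial match suffices) and the first match wins;
// overriding to the plain CPU buffer type re-runs the normal selection over the CPU buft list so that
// any extra CPU buffer types are still considered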
  2042. if (ml.tensor_buft_overrides) {
  2043. std::string tensor_name = tn.str();
  2044. for (const auto * overrides = ml.tensor_buft_overrides; overrides->pattern != nullptr; ++overrides) {
  2045. std::regex pattern(overrides->pattern);
  2046. if (std::regex_search(tensor_name, pattern)) {
  2047. if (overrides->buft == ggml_backend_cpu_buffer_type()) {
  2048. // when overriding to a CPU buffer, consider the extra buffer types
  2049. buft = select_weight_buft(hparams, t_meta, op, pimpl->cpu_buft_list);
  2050. } else {
  2051. buft = overrides->buft;
  2052. }
  2053. LLAMA_LOG_DEBUG("tensor %s (%zu MiB %s) buffer type overridden to %s\n",
  2054. tensor_name.c_str(),
  2055. ggml_nbytes(t_meta) / 1024 / 1024, ggml_type_name(t_meta->type),
  2056. ggml_backend_buft_name(buft));
  2057. break;
  2058. }
  2059. }
  2060. }
  2061. if (!buft) {
  2062. buft = select_weight_buft(hparams, t_meta, op, *buft_list);
  2063. if (!buft) {
  2064. throw std::runtime_error(format("failed to find a compatible buffer type for tensor %s", tn.str().c_str()));
  2065. }
  2066. }
  2067. // avoid using a host buffer when using mmap
  2068. auto * buft_dev = ggml_backend_buft_get_device(buft);
  2069. if (ml.use_mmap && buft_dev && buft == ggml_backend_dev_host_buffer_type(buft_dev)) {
  2070. auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
  2071. if (!cpu_dev) {
  2072. throw std::runtime_error("no CPU backend found");
  2073. }
  2074. buft = ggml_backend_dev_buffer_type(cpu_dev);
  2075. }
  2076. if (buft != buft_list->front().second) {
  2077. n_moved_tensors++;
  2078. if (!first_moved_tensor) {
  2079. first_moved_tensor = t_meta;
  2080. first_moved_from_buft = buft_list->front().second;
  2081. first_moved_to_buft = buft;
  2082. }
  2083. }
  2084. ggml_context * ctx = ctx_for_buft(buft);
  2085. // if duplicated, check if the original tensor was allocated in the same buffer type context and avoid creating a new one
  2086. if (flags & TENSOR_DUPLICATED) {
  2087. ggml_tensor * t = ggml_get_tensor(ctx, tn.str().c_str());
  2088. if (t) {
  2089. return t;
  2090. }
  2091. }
  2092. return ml.create_tensor(ctx, tn, ne, flags);
  2093. };
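// summary of the flag semantics implemented above:
//   TENSOR_NOT_REQUIRED - return nullptr instead of throwing when the tensor is missing from the file
//   TENSOR_DUPLICATED   - reuse an existing tensor from the same context if present (e.g. token_embd reused as output)
//   TENSOR_SKIP / unused op - log a warning, discount the data size and return nullptr
// typical call, as used below:
//   output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);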
  2094. layers.resize(n_layer);
  2095. // TODO: move to a separate function
  2096. const auto tn = LLM_TN(arch);
  2097. switch (arch) {
  2098. case LLM_ARCH_LLAMA:
  2099. case LLM_ARCH_REFACT:
  2100. case LLM_ARCH_MINICPM:
  2101. case LLM_ARCH_GRANITE:
  2102. case LLM_ARCH_GRANITE_MOE:
  2103. {
  2104. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2105. // output
  2106. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2107. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  2108. // if output is NULL, init from the input tok embed
  2109. if (output == NULL) {
  2110. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  2111. }
  2112. for (int i = 0; i < n_layer; ++i) {
  2113. auto & layer = layers[i];
  2114. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2115. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
  2116. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  2117. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
  2118. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
  2119. // optional bias tensors
  2120. layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2121. layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
  2122. layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
  2123. layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2124. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  2125. if (hparams.rope_scaling_type_train == LLAMA_ROPE_SCALING_TYPE_LONGROPE) {
  2126. layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
  2127. layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
  2128. }
  2129. else {
  2130. layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
  2131. }
  2132. if (n_expert == 0) {
  2133. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  2134. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  2135. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  2136. // optional MLP bias
  2137. layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
  2138. layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2139. layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
  2140. } else {
  2141. layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
  2142. layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, TENSOR_NOT_REQUIRED);
  2143. layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert}, 0);
  2144. layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
  2145. // For Granite MoE Shared
  2146. if (hparams.n_ff_shexp > 0) {
  2147. layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, 0);
  2148. layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, 0);
  2149. layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {hparams.n_ff_shexp, n_embd}, 0);
  2150. }
  2151. }
  2152. }
  2153. } break;
  2154. case LLM_ARCH_QWEN3NEXT:
  2155. {
  2156. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
  2157. // output
  2158. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
  2159. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED);
  2160. // if output is NULL, init from the input tok embed
  2161. if (output == NULL) {
  2162. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, TENSOR_DUPLICATED);
  2163. }
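// if the GGUF does not provide an explicit per-expert feed-forward size, fall back to assuming that the
// total feed-forward width is split evenly across the experts that are used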
  2164. const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
  2165. // Calculate dimensions from hyperparameters
  2166. const int64_t head_k_dim = hparams.ssm_d_state;
  2167. const int64_t head_v_dim = hparams.ssm_d_state;
  2168. const int64_t n_k_heads = hparams.ssm_n_group;
  2169. const int64_t n_v_heads = hparams.ssm_dt_rank;
  2170. const int64_t key_dim = head_k_dim * n_k_heads;
  2171. const int64_t value_dim = head_v_dim * n_v_heads;
  2172. const int64_t conv_dim = key_dim * 2 + value_dim;
  2173. // Calculate projection sizes
  2174. const int64_t qkvz_projection_size = key_dim * 2 + value_dim * 2;
  2175. const int64_t ba_projection_size = n_v_heads * 2;
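// reading of the dimension arithmetic above (inferred from the tensor names, not spelled out in the metadata):
// ssm_in projects to q, k, v plus a gate of the same width as v (key_dim*2 + value_dim*2), the causal conv
// runs only over the concatenated q/k/v part (key_dim*2 + value_dim), and the b/a projection yields two
// per-head scalars (beta and alpha) for each of the n_v_heads value heads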
  2176. for (int i = 0; i < n_layer; ++i) {
  2177. auto & layer = layers[i];
  2178. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);
  2179. layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), { n_embd }, 0);
  2180. if (!hparams.is_recurrent(i)) {
  2181. // Attention layers
  2182. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd_head_k * n_head }, 0);
  2183. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_k_gqa }, 0);
  2184. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_v_gqa }, 0);
  2185. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, 0);
  2186. // Q/K normalization for attention layers
  2187. layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), { n_embd_head_k }, 0);
  2188. layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), { n_embd_head_k }, 0);
  2189. // attn gate
  2190. layer.wq_gate = create_tensor(tn(LLM_TENSOR_ATTN_GATE, "weight", i), { n_embd, n_embd_head_k * n_head }, 0);
  2191. } else {
  2192. // Linear attention (gated delta net) specific tensors
  2193. // Create tensors with calculated dimensions
  2194. layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), { n_embd, qkvz_projection_size }, 0);
  2195. layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), { hparams.ssm_d_conv, conv_dim }, 0);
  2196. layer.ssm_dt = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), { hparams.ssm_dt_rank }, 0);
  2197. layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), { hparams.ssm_dt_rank }, 0);
  2198. layer.ssm_beta_alpha = create_tensor(tn(LLM_TENSOR_SSM_BETA_ALPHA, "weight", i), { n_embd, ba_projection_size }, 0);
  2199. layer.ssm_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", i), { head_v_dim }, 0);
  2200. layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), { value_dim, n_embd }, 0);
  2201. }
  2202. layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), { n_embd, n_expert }, 0);
  2203. layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, 0);
  2204. layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff_exp, n_embd, n_expert }, 0);
  2205. layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, 0);
  2206. // Shared experts
  2207. layer.ffn_gate_inp_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP_SHEXP, "weight", i), { n_embd }, 0);
  2208. layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), { n_embd, hparams.n_ff_shexp }, 0);
  2209. layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), { n_embd, hparams.n_ff_shexp }, 0);
  2210. layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { hparams.n_ff_shexp, n_embd }, 0);
  2211. }
  2212. }
  2213. break;
case LLM_ARCH_LLADA:
{
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
// output
output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED);
// if output is NULL, init from the input tok embed
if (output == NULL) {
output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, TENSOR_DUPLICATED);
}
for (int i = 0; i < n_layer; ++i) {
auto & layer = layers[i];
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);
// Use separate Q, K, V projections without bias, matching LLaDALlamaBlock
layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd_head_k * n_head }, 0);
layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_k_gqa }, 0);
layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_v_gqa }, 0);
// No bias for QKV projections as per config: include_bias=false, include_qkv_bias=false
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, 0);
layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), { n_embd }, TENSOR_NOT_REQUIRED);
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd }, 0);
layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), { n_rot / 2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), { n_embd, n_ff }, 0);
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd }, 0);
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, n_ff }, 0);
// optional MLP bias
layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), { n_ff }, TENSOR_NOT_REQUIRED);
layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), { n_embd }, TENSOR_NOT_REQUIRED);
layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), { n_ff }, TENSOR_NOT_REQUIRED);
}
}
break;
  2252. case LLM_ARCH_LLADA_MOE:
  2253. {
  2254. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2255. // output
  2256. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2257. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  2258. GGML_ASSERT(n_expert > 0 && "n_expert must be > 0 for llada-moe");
  2259. GGML_ASSERT(n_expert_used > 0 && "n_expert_used must be > 0 for llada-moe");
  2260. for (int i = 0; i < n_layer; ++i) {
  2261. auto & layer = layers[i];
  2262. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2263. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  2264. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  2265. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  2266. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  2267. layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
  2268. layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
  2269. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  2270. layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
  2271. const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
  2272. layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
  2273. layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
  2274. layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
  2275. }
  2276. } break;
  2277. case LLM_ARCH_LLAMA4:
  2278. {
  2279. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2280. // output
  2281. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2282. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  2283. // if output is NULL, init from the input tok embed
  2284. if (output == NULL) {
  2285. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  2286. }
  2287. for (int i = 0; i < n_layer; ++i) {
  2288. bool is_moe_layer = hparams.n_moe_layer_step > 0 && (i + 1) % hparams.n_moe_layer_step == 0;
  2289. auto & layer = layers[i];
  2290. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2291. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
  2292. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  2293. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
  2294. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
  2295. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  2296. layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
  2297. if (is_moe_layer) {
  2298. int n_ff_exp = hparams.n_ff_exp;
  2299. layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
  2300. layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, 0);
  2301. layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff_exp, n_embd, n_expert}, 0);
  2302. layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, 0);
  2303. // Shared expert
  2304. const int64_t n_ff_shexp = n_ff_exp;
  2305. layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), { n_embd, n_ff_shexp}, 0);
  2306. layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp, n_embd }, 0);
  2307. layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), { n_embd, n_ff_shexp}, 0);
  2308. } else {
  2309. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  2310. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  2311. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  2312. }
  2313. }
  2314. } break;
  2315. case LLM_ARCH_DECI:
  2316. {
  2317. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2318. // output
  2319. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2320. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  2321. // if output is NULL, init from the input tok embed
  2322. if (output == NULL) {
  2323. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  2324. }
  2325. for (int i = 0; i < n_layer; ++i) {
  2326. auto & layer = layers[i];
  2327. const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(i);
  2328. const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(i);
  2329. const int64_t n_embd_gqa = hparams.n_embd_v_gqa(i);
  2330. const int64_t n_ff = hparams.n_ff(i);
  2331. const int64_t n_head = hparams.n_head(i);
  2332. const int64_t n_head_kv = hparams.n_head_kv(i);
  2333. if (n_head_kv == 0 && n_head > 0) {
  2334. // linear attention for DeciLMCausalModel
  2335. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2336. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  2337. }
  2338. else if (n_head_kv > 0) {
  2339. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2340. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
  2341. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  2342. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
  2343. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
  2344. }
  2345. // optional bias tensors
  2346. layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2347. layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
  2348. layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
  2349. layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2350. if (n_ff > 0) {
  2351. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  2352. }
  2353. if (hparams.rope_scaling_type_train == LLAMA_ROPE_SCALING_TYPE_LONGROPE) {
  2354. layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
  2355. layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
  2356. }
  2357. else {
  2358. layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
  2359. }
  2360. if (n_ff > 0) {
  2361. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  2362. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  2363. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  2364. }
  2365. // optional MLP bias
  2366. layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
  2367. layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2368. layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
  2369. }
  2370. } break;
  2371. case LLM_ARCH_MINICPM3:
  2372. {
  2373. const int64_t n_embd_head_qk_rope = hparams.n_rot;
  2374. const int64_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
  2375. const int64_t q_lora_rank = hparams.n_lora_q;
  2376. const int64_t kv_lora_rank = hparams.n_lora_kv;
  2377. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2378. // output
  2379. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2380. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  2381. // if output is NULL, init from the input tok embed
  2382. if (output == NULL) {
  2383. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  2384. }
  2385. for (int i = 0; i < n_layer; ++i) {
  2386. auto & layer = layers[i];
  2387. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2388. layer.attn_q_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_A_NORM, "weight", i), {q_lora_rank}, 0);
  2389. layer.attn_kv_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_NORM, "weight", i), {kv_lora_rank}, 0);
  2390. layer.wq_a = create_tensor(tn(LLM_TENSOR_ATTN_Q_A, "weight", i), {n_embd, q_lora_rank}, 0);
  2391. layer.wq_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_B, "weight", i), {q_lora_rank, n_head * n_embd_head_k}, 0);
  2392. layer.wkv_a_mqa = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + (n_embd_head_qk_rope)}, 0);
  2393. layer.wkv_b = create_tensor(tn(LLM_TENSOR_ATTN_KV_B, "weight", i), {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)}, 0);
  2394. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_head * ( n_embd_head_v), n_embd}, 0);
  2395. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  2396. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  2397. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  2398. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  2399. layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), { n_embd_head_qk_rope/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
  2400. layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), { n_embd_head_qk_rope/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
  2401. }
  2402. } break;
  2403. case LLM_ARCH_GROK:
  2404. {
  2405. if (n_expert == 0) {
  2406. throw std::runtime_error("Grok model cannot have zero experts");
  2407. }
  2408. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2409. // output
  2410. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2411. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  2412. // if output is NULL, init from the input tok embed
  2413. if (output == NULL) {
  2414. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  2415. }
  2416. const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff/* / n_expert_used*/; // grok-1 n_ff_exp == n_ff
  2417. for (int i = 0; i < n_layer; ++i) {
  2418. auto & layer = layers[i];
  2419. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2420. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  2421. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  2422. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  2423. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  2424. layer.attn_out_norm = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}, 0);
  2425. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  2426. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED);
  2427. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, TENSOR_NOT_REQUIRED);
  2428. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED);
  2429. layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
  2430. layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, TENSOR_NOT_REQUIRED);
  2431. layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
  2432. layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, 0);
  2433. layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2434. if (!layer.ffn_post_norm) {
  2435. layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
  2436. }
  2437. }
  2438. } break;
  2439. case LLM_ARCH_DBRX:
  2440. {
  2441. if (n_expert == 0) {
  2442. throw std::runtime_error("DBRX model cannot have zero experts");
  2443. }
  2444. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2445. // output
  2446. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2447. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  2448. for (int i = 0; i < n_layer; ++i) {
  2449. auto & layer = layers[i];
  2450. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2451. layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
  2452. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  2453. layer.attn_out_norm = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}, 0);
  2454. layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
  2455. layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
  2456. layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff, n_embd, n_expert}, 0);
  2457. layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
  2458. }
  2459. } break;
  2460. case LLM_ARCH_BAICHUAN:
  2461. {
  2462. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2463. {
  2464. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2465. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  2466. }
  2467. for (int i = 0; i < n_layer; ++i) {
  2468. auto & layer = layers[i];
  2469. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2470. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  2471. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  2472. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  2473. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  2474. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  2475. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  2476. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  2477. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  2478. }
  2479. } break;
  2480. case LLM_ARCH_FALCON:
  2481. {
  2482. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2483. // output
  2484. {
  2485. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2486. output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
  2487. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  2488. if (!output) {
  2489. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); // needs to be on GPU
  2490. }
  2491. }
  2492. for (int i = 0; i < n_layer; ++i) {
  2493. auto & layer = layers[i];
  2494. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2495. layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
  2496. layer.attn_norm_2 = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2497. layer.attn_norm_2_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2498. layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
  2499. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  2500. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  2501. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  2502. }
  2503. } break;
  2504. case LLM_ARCH_STARCODER:
  2505. {
  2506. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2507. pos_embd = create_tensor(tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, n_ctx_train}, 0);
  2508. // output
  2509. {
  2510. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2511. output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
  2512. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  2513. if (!output) {
  2514. // needs to be on GPU
  2515. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  2516. }
  2517. }
  2518. for (int i = 0; i < n_layer; ++i) {
  2519. auto & layer = layers[i];
  2520. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2521. layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
  2522. layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
  2523. layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
  2524. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  2525. layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
  2526. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  2527. layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
  2528. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
  2529. layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
  2530. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  2531. layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
  2532. }
  2533. } break;
  2534. case LLM_ARCH_BERT:
  2535. case LLM_ARCH_NOMIC_BERT:
  2536. case LLM_ARCH_NOMIC_BERT_MOE:
  2537. case LLM_ARCH_JINA_BERT_V3:
  2538. {
  2539. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2540. type_embd = create_tensor(tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_token_types}, TENSOR_NOT_REQUIRED);
  2541. if (arch == LLM_ARCH_BERT) {
  2542. pos_embd = create_tensor(tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, n_ctx_train}, 0);
  2543. cls = create_tensor(tn(LLM_TENSOR_CLS, "weight"), {n_embd, n_embd}, TENSOR_NOT_REQUIRED);
  2544. cls_b = create_tensor(tn(LLM_TENSOR_CLS, "bias"), {n_embd}, TENSOR_NOT_REQUIRED);
  2545. cls_out = create_tensor(tn(LLM_TENSOR_CLS_OUT, "weight"), {n_embd, hparams.n_cls_out}, TENSOR_NOT_REQUIRED);
  2546. cls_out_b = create_tensor(tn(LLM_TENSOR_CLS_OUT, "bias"), {hparams.n_cls_out}, TENSOR_NOT_REQUIRED);
  2547. }
  2548. tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0);
  2549. tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}, 0);
  2550. for (int i = 0; i < n_layer; ++i) {
  2551. auto & layer = layers[i];
  2552. layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
  2553. layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
  2554. if (!layer.wqkv) {
  2555. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  2556. layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, 0);
  2557. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  2558. layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, 0);
  2559. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  2560. layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, 0);
  2561. }
  2562. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  2563. layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2564. layer.attn_out_norm = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}, 0);
  2565. layer.attn_out_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "bias", i), {n_embd}, 0);
  2566. if (hparams.moe_every_n_layers > 0 && i % hparams.moe_every_n_layers == 1) {
  2567. layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff, n_expert}, 0);
  2568. layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert}, 0);
  2569. layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
  2570. } else {
  2571. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  2572. layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
  2573. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
  2574. layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2575. if (arch == LLM_ARCH_NOMIC_BERT) {
  2576. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  2577. }
  2578. }
  2579. layer.layer_out_norm = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd}, 0);
  2580. layer.layer_out_norm_b = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "bias", i), {n_embd}, 0);
  2581. }
  2582. } break;
  2583. case LLM_ARCH_NEO_BERT:
  2584. {
  2585. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2586. cls = create_tensor(tn(LLM_TENSOR_CLS, "weight"), {n_embd, n_embd}, TENSOR_NOT_REQUIRED);
  2587. cls_b = create_tensor(tn(LLM_TENSOR_CLS, "bias"), {n_embd}, TENSOR_NOT_REQUIRED);
  2588. cls_out = create_tensor(tn(LLM_TENSOR_CLS_OUT, "weight"), {n_embd, hparams.n_cls_out}, TENSOR_NOT_REQUIRED);
  2589. cls_out_b = create_tensor(tn(LLM_TENSOR_CLS_OUT, "bias"), {hparams.n_cls_out}, TENSOR_NOT_REQUIRED);
  2590. output_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2591. for (int i = 0; i < n_layer; ++i) {
  2592. auto & layer = layers[i];
  2593. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2594. layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
  2595. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  2596. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  2597. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff*2}, 0);
  2598. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
  2599. }
  2600. } break;
  2601. case LLM_ARCH_JINA_BERT_V2:
  2602. {
  2603. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); // word_embeddings
  2604. type_embd = create_tensor(tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_token_types}, 0); // token_type_embeddings
  2605. tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0); // LayerNorm
tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}, 0); // LayerNorm bias
  2607. cls = create_tensor(tn(LLM_TENSOR_CLS, "weight"), {n_embd, 1}, TENSOR_NOT_REQUIRED);
  2608. cls_b = create_tensor(tn(LLM_TENSOR_CLS, "bias"), {1}, TENSOR_NOT_REQUIRED);
  2609. for (int i = 0; i < n_layer; ++i) {
  2610. auto & layer = layers[i]; // JinaBertLayer
  2611. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  2612. layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, 0);
  2613. layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2614. layer.attn_q_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2615. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  2616. layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, 0);
  2617. layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2618. layer.attn_k_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2619. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  2620. layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, 0);
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0); // output dense
layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0); // output dense bias
layer.attn_out_norm = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}, 0); // output norm
  2624. layer.attn_out_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "bias", i), {n_embd}, 0);
  2625. layer.attn_norm_2 = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2626. layer.attn_norm_2_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2627. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED);
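// when there is no separate gate tensor, the up projection is twice as wide -- presumably the gate and up
// halves are packed into a single tensor and split at runtime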
  2628. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, layer.ffn_gate ? n_ff : n_ff * 2}, 0);
  2629. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
  2630. layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
  2631. layer.layer_out_norm = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd}, 0);
  2632. layer.layer_out_norm_b = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "bias", i), {n_embd}, 0);
  2633. }
  2634. } break;
  2635. case LLM_ARCH_BLOOM:
  2636. {
  2637. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2638. tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0);
  2639. tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}, 0);
  2640. // output
  2641. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2642. output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
  2643. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  2644. // if output is NULL, init from the input tok embed
  2645. if (output == NULL) {
  2646. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  2647. }
  2648. for (int i = 0; i < n_layer; ++i) {
  2649. auto & layer = layers[i];
  2650. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2651. layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
  2652. layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
  2653. layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
  2654. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  2655. layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
  2656. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  2657. layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
  2658. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
  2659. layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
  2660. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  2661. layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
  2662. }
  2663. } break;
  2664. case LLM_ARCH_MPT:
  2665. {
  2666. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2667. pos_embd = create_tensor(tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, n_ctx_train}, TENSOR_NOT_REQUIRED);
  2668. // output
  2669. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2670. output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, TENSOR_NOT_REQUIRED);
  2671. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  2672. if (!output) {
  2673. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); // needs to be on GPU
  2674. }
  2675. for (int i = 0; i < n_layer; ++i) {
  2676. auto & layer = layers[i];
  2677. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2678. layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2679. layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
  2680. layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
  2681. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  2682. layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2683. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  2684. layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2685. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
  2686. layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2687. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  2688. layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
  2689. layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2690. layer.attn_q_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2691. layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2692. layer.attn_k_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2693. // AWQ ScaleActivation layer
  2694. layer.ffn_act = create_tensor(tn(LLM_TENSOR_FFN_ACT, "scales", i), {n_ff}, TENSOR_NOT_REQUIRED);
  2695. }
  2696. } break;
  2697. case LLM_ARCH_STABLELM:
  2698. {
  2699. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2700. // output
  2701. output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
  2702. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2703. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  2704. for (int i = 0; i < n_layer; ++i) {
  2705. auto & layer = layers[i];
  2706. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2707. layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
  2708. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  2709. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  2710. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  2711. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  2712. // optional bias tensors, present in Stable LM 2 1.6B
  2713. layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2714. layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
  2715. layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
  2716. // optional q and k layernorms, present in StableLM 2 12B
  2717. layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k, n_head}, TENSOR_NOT_REQUIRED);
  2718. layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k, n_head_kv}, TENSOR_NOT_REQUIRED);
  2719. // optional FFN norm, not present in StableLM 2 12B which uses parallel residual
  2720. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2721. layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2722. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  2723. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  2724. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  2725. }
  2726. } break;
  2727. case LLM_ARCH_QWEN:
  2728. {
  2729. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2730. // output
  2731. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2732. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  2733. for (int i = 0; i < n_layer; ++i) {
  2734. auto & layer = layers[i];
  2735. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2736. layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd*3}, 0);
  2737. layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd*3}, 0);
  2738. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  2739. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  2740. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff/2}, 0);
  2741. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff/2, n_embd}, 0);
  2742. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff/2}, 0);
  2743. }
  2744. } break;
case LLM_ARCH_QWEN2:
case LLM_ARCH_QWEN2VL:
case LLM_ARCH_DREAM:
    {
        tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
        // output
        output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
        output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
        output_b = create_tensor(tn(LLM_TENSOR_OUTPUT, "bias"), {n_vocab}, TENSOR_NOT_REQUIRED);
        // if output is NULL, init from the input tok embed
        if (output == NULL) {
            output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
        }
        for (int i = 0; i < n_layer; ++i) {
            auto & layer = layers[i];
            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
            layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
            layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
            // bias tensors (required for these architectures)
            layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, 0);
            layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, 0);
            layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, 0);
            layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
            layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
        }
    } break;
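// Qwen2-MoE layers pair routed experts (ffn_*_exps) with a shared-expert branch (ffn_*_shexp) gated by ffn_gate_inp_shexp.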
case LLM_ARCH_QWEN2MOE:
    {
        tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
        // output
        output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
        output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
        for (int i = 0; i < n_layer; ++i) {
            auto & layer = layers[i];
            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
            layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
            layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
            // optional bias tensors
            layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
            layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
            layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
            layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
            layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
            if (n_expert == 0) {
                throw std::runtime_error("n_expert must be > 0 for QWEN2MOE");
            }
            if (n_expert_used == 0) {
                throw std::runtime_error("n_expert_used must be > 0 for QWEN2MOE");
            }
            // MoE branch
            const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
            layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, 0);
            layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
            layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, 0);
            // Shared expert branch
            const int64_t n_ff_shexp = hparams.n_ff_shexp ? hparams.n_ff_shexp : n_ff;
            layer.ffn_gate_inp_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP_SHEXP, "weight", i), {n_embd}, 0);
            layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_shexp}, 0);
            layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp, n_embd}, 0);
            layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_shexp}, 0);
        }
    } break;
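// Qwen3 drops the attention bias tensors and instead loads per-head Q/K norm weights of size n_embd_head_k.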
case LLM_ARCH_QWEN3:
    {
        tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
        // output
        output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
        output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
        // if output is NULL, init from the input tok embed
        if (output == NULL) {
            output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
        }
        for (int i = 0; i < n_layer; ++i) {
            auto & layer = layers[i];
            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
            layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
            layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
            layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
            layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
            layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
            layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
        }
    } break;
case LLM_ARCH_QWEN3MOE:
    {
        tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
        // output
        output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
        output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
        // if output is NULL, init from the input tok embed
        if (output == NULL) {
            output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
        }
        for (int i = 0; i < n_layer; ++i) {
            auto & layer = layers[i];
            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
            layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
            layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
            layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
            layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
            layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
            layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
            if (n_expert == 0) {
                throw std::runtime_error("n_expert must be > 0 for QWEN3MOE");
            }
            if (n_expert_used == 0) {
                throw std::runtime_error("n_expert_used must be > 0 for QWEN3MOE");
            }
            // MoE branch
            const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
            layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, 0);
            layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
            layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, 0);
        }
    } break;
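// Phi-2 checkpoints may store attention either as a fused QKV tensor or as separate Q/K/V tensors; the fused form is tried first and the split form is the fallback.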
case LLM_ARCH_PHI2:
    {
        tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
        // output
        output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
        output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
        output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
        output_b = create_tensor(tn(LLM_TENSOR_OUTPUT, "bias"), {n_vocab}, 0);
        for (int i = 0; i < n_layer; ++i) {
            auto & layer = layers[i];
            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
            layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
            layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
            layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
            if (layer.wqkv == nullptr) {
                layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
                layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, 0);
                layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
                layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, 0);
                layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
                layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, 0);
            }
            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
            layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
            layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
            layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
            layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
        }
    } break;
case LLM_ARCH_PHI3:
    {
        tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
        // output
        output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
        output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
        // if output is NULL, init from the input tok embed
        if (output == NULL) {
            output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
        }
        for (int i = 0; i < n_layer; ++i) {
            auto & layer = layers[i];
            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);
            layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), { n_embd, n_embd + 2 * n_embd_gqa }, TENSOR_NOT_REQUIRED);
            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd, n_embd }, 0);
            layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd }, 0);
            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd }, 0);
            layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, 2 * n_ff }, 0);
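            // optional RoPE frequency-factor tensors for long/short contexts; stored once in the file, so layers after the first mark them as duplicated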
            layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), { n_rot/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
            layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), { n_rot/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
        }
    } break;
case LLM_ARCH_PHIMOE:
    {
        const int64_t n_embd_head = n_embd / n_head;
        tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
        // output
        output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
        output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
        output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab }, 0);
        output_b = create_tensor(tn(LLM_TENSOR_OUTPUT, "bias"), { n_vocab }, 0);
        for (int i = 0; i < n_layer; ++i) {
            auto & layer = layers[i];
            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);
            layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), { n_embd }, 0);
            layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), { n_embd, n_embd + 2 * n_embd_gqa }, TENSOR_NOT_REQUIRED);
            if (layer.wqkv == nullptr) {
                layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
                layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, 0);
                layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
                layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, 0);
                layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
                layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, 0);
            }
            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd, n_embd }, 0);
            layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), { n_embd }, 0);
            layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd }, 0);
            layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), { n_embd }, 0);
            layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
            layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
            layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff, n_embd, n_expert}, 0);
            layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
            layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), { n_embd_head/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
            layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), { n_embd_head/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
        }
    } break;
case LLM_ARCH_PLAMO:
    {
        tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
        // output
        output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
        output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
        for (int i = 0; i < n_layer; ++i) {
            auto & layer = layers[i];
            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
            layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
            layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
            layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
        }
    } break;
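// PLaMo-2 is a hybrid model: hparams.is_recurrent(i) selects Mamba-style SSM layers, while the remaining layers use attention with a fused QKV projection.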
case LLM_ARCH_PLAMO2:
    {
        const uint32_t d_conv = hparams.ssm_d_conv;
        const uint32_t d_state = hparams.ssm_d_state;
        const uint32_t num_heads = hparams.ssm_dt_rank;
        const uint32_t intermediate_size = hparams.ssm_d_inner;
        const uint32_t head_dim = intermediate_size / num_heads;
        const uint32_t qk_dim = head_dim;
        const uint32_t v_dim = head_dim;
        const int64_t num_attention_heads = hparams.n_head();
        const int64_t q_num_heads = num_attention_heads;
        const int64_t dt_dim = std::max(64, int(hparams.n_embd / 16));
        tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
        // output
        output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
        output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
        // if output is NULL, init from the input tok embed
        if (output == NULL) {
            output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
        }
        for (int i = 0; i < n_layer; ++i) {
            auto & layer = layers[i];
            bool is_mamba_layer = hparams.is_recurrent(i);
            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
            if (is_mamba_layer) {
                layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), {n_embd, 2 * intermediate_size}, 0);
                layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {d_conv, intermediate_size}, 0);
                layer.ssm_x = create_tensor(tn(LLM_TENSOR_SSM_X, "weight", i), {intermediate_size, dt_dim + 2*d_state}, 0);
                layer.ssm_dt = create_tensor(tn(LLM_TENSOR_SSM_DT, "weight", i), {dt_dim, num_heads}, 0);
                layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {num_heads}, 0);
                layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {num_heads}, 0);
                layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {num_heads}, 0);
                layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {intermediate_size, n_embd}, 0);
                layer.ssm_dt_norm = create_tensor(tn(LLM_TENSOR_SSM_DT_NORM, i), {dt_dim}, 0);
                layer.ssm_b_norm = create_tensor(tn(LLM_TENSOR_SSM_B_NORM, i), {d_state}, 0);
                layer.ssm_c_norm = create_tensor(tn(LLM_TENSOR_SSM_C_NORM, i), {d_state}, 0);
            } else {
                const int64_t num_key_value_heads = hparams.n_head_kv(i);
                const int64_t k_num_heads = num_key_value_heads;
                const int64_t v_num_heads = num_key_value_heads;
                const int64_t q_proj_dim = q_num_heads * qk_dim;
                const int64_t k_proj_dim = k_num_heads * qk_dim;
                const int64_t v_proj_dim = v_num_heads * v_dim;
                layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, q_proj_dim + k_proj_dim + v_proj_dim}, 0);
                layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {head_dim, num_attention_heads}, 0);
                layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {head_dim, k_num_heads}, 0);
                layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {q_num_heads * v_dim, n_embd}, 0);
            }
            // All layers have post-attention norm, FFN norm, and FFN tensors
            layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, i), {n_embd}, 0);
            layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
            layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff * 2}, 0);
            layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, i), {n_embd}, 0);
        }
    } break;
case LLM_ARCH_GPT2:
    {
        tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
        pos_embd = create_tensor(tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, n_ctx_train}, 0);
        // output
        output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
        output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
        output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
        // if output is NULL, init from the input tok embed
        if (output == NULL) {
            output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
        }
        for (int i = 0; i < n_layer; ++i) {
            auto & layer = layers[i];
            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
            layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
            layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
            layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
            layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
            layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
            layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
            layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
            layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
            layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
        }
    } break;
case LLM_ARCH_CODESHELL:
    {
        tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
        // if tok embd is NULL, init from output
        if (tok_embd == NULL) {
            tok_embd = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
        }
        // output
        output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
        output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
        output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
        for (int i = 0; i < n_layer; ++i) {
            auto & layer = layers[i];
            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
            layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
            layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
            layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
            layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
            layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
            layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
            layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
            layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
            layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
        }
    } break;
case LLM_ARCH_ORION:
    {
        tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
        output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
        output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
        output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
        for (int i = 0; i < n_layer; ++i) {
            auto & layer = layers[i];
            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
            layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
            layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
            layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
            layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
            layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
            layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
        }
    } break;
case LLM_ARCH_INTERNLM2:
    {
        tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
        // output
        output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
        output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
        for (int i = 0; i < n_layer; ++i) {
            auto & layer = layers[i];
            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
            // layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
            layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
            layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
            layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
            layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
        }
    } break;
case LLM_ARCH_GEMMA:
    {
        tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
        // output
        output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); // same as tok_embd, duplicated to allow offloading
        for (int i = 0; i < n_layer; ++i) {
            auto & layer = layers[i];
            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
            layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
            layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
            layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
            layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
        }
    } break;
case LLM_ARCH_GEMMA2:
    {
        tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
        // output
        output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); // same as tok_embd, duplicated to allow offloading
        for (int i = 0; i < n_layer; ++i) {
            auto & layer = layers[i];
            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
            layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
            layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
            layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
            layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
            layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
            layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
        }
    } break;
case LLM_ARCH_GEMMA3:
case LLM_ARCH_GEMMA_EMBEDDING:
    {
        tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
        // output
        output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
        output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
        // if output is NULL, init from the input tok embed
        if (output == NULL) {
            output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
        }
        for (int i = 0; i < n_layer; ++i) {
            auto & layer = layers[i];
            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
            layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
            layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
            layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
            layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
            layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
            layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
            layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
            layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
        }
    } break;
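// Gemma-3n adds per-layer token embeddings plus the AltUp and LAuReL projection tensors on top of the usual attention + FFN stack.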
case LLM_ARCH_GEMMA3N:
    {
        const int64_t n_altup = hparams.n_altup;
        const int64_t laurel_rank = hparams.laurel_rank;
        const int64_t n_embd_altup = hparams.n_embd_altup;
        output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
        // if output is NULL, init from the input tok embed
        if (output == NULL) {
            output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
        }
        tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
        tok_embd_per_layer = create_tensor(tn(LLM_TENSOR_PER_LAYER_TOKEN_EMBD, "weight"), {n_embd_altup * n_layer, n_vocab}, 0);
        altup_proj = create_tensor(tn(LLM_TENSOR_ALTUP_PROJ, "weight"), {n_embd, n_embd, n_altup - 1}, 0);
        altup_unembd_proj = create_tensor(tn(LLM_TENSOR_ALTUP_UNEMBD_PROJ, "weight"), {n_embd, n_embd, n_altup - 1}, 0);
        per_layer_model_proj = create_tensor(tn(LLM_TENSOR_PER_LAYER_MODEL_PROJ, "weight"), {n_embd, n_embd_altup * n_layer}, 0);
        per_layer_proj_norm = create_tensor(tn(LLM_TENSOR_PER_LAYER_PROJ_NORM, "weight"), {n_embd_altup}, 0);
        output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
        for (int i = 0; i < n_layer; ++i) {
            auto & layer = layers[i];
            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
            layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
            layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
            layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
            layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
            layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
            layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
            layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
            layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
            // altup & laurel
            layer.per_layer_inp_gate = create_tensor(tn(LLM_TENSOR_PER_LAYER_INP_GATE, "weight", i), {n_embd, n_embd_altup}, 0);
            layer.per_layer_proj = create_tensor(tn(LLM_TENSOR_PER_LAYER_PROJ, "weight", i), {n_embd_altup, n_embd}, 0);
            layer.per_layer_post_norm = create_tensor(tn(LLM_TENSOR_PER_LAYER_POST_NORM, "weight", i), {n_embd}, 0);
            layer.altup_correct_coef = create_tensor(tn(LLM_TENSOR_ALTUP_CORRECT_COEF, "weight", i), {n_altup, n_altup}, 0);
            layer.altup_correct_scale = create_tensor(tn(LLM_TENSOR_ALTUP_CORRECT_SCALE, "weight", i), {n_embd}, 0);
            layer.altup_predict_coef = create_tensor(tn(LLM_TENSOR_ALTUP_PREDICT_COEF, "weight", i), {n_altup, n_altup * n_altup}, 0);
            layer.altup_router = create_tensor(tn(LLM_TENSOR_ALTUP_ROUTER, "weight", i), {n_embd, n_altup}, 0);
            layer.altup_router_norm = create_tensor(tn(LLM_TENSOR_ALTUP_ROUTER_NORM, "weight", i), {n_embd}, 0);
            layer.laurel_l = create_tensor(tn(LLM_TENSOR_LAUREL_L, "weight", i), {n_embd, laurel_rank}, 0);
            layer.laurel_r = create_tensor(tn(LLM_TENSOR_LAUREL_R, "weight", i), {laurel_rank, n_embd}, 0);
            layer.laurel_post_norm = create_tensor(tn(LLM_TENSOR_LAUREL_POST_NORM, "weight", i), {n_embd}, 0);
        }
    } break;
case LLM_ARCH_STARCODER2:
    {
        tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
        // output
        output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
        output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
        output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
        // if output is NULL, init from the input tok embed
        if (output == NULL) {
            output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
        }
        for (int i = 0; i < n_layer; ++i) {
            auto & layer = layers[i];
            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
            layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
            layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
            layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
            // attention bias tensors (required)
            layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, 0);
            layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, 0);
            layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, 0);
            layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
            layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
            layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
            layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
            // FFN bias tensors (required)
            layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
            layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
        }
    } break;
case LLM_ARCH_MAMBA:
    {
        const int64_t d_conv = hparams.ssm_d_conv;
        const int64_t d_inner = hparams.ssm_d_inner;
        const int64_t d_state = hparams.ssm_d_state;
        const int64_t dt_rank = hparams.ssm_dt_rank;
        // only an expansion factor of 2 is supported for now
        if (2 * n_embd != d_inner) {
            throw std::runtime_error("only an expansion factor of 2 is supported for now");
        }
        tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
        // output
        output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
        output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
        // if output is NULL, init from the input tok embed, duplicated to allow offloading
        if (output == NULL) {
            output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
        }
        for (int i = 0; i < n_layer; ++i) {
            auto & layer = layers[i];
            // norm
            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
            layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), {n_embd, 2*d_inner}, 0);
            layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {d_conv, d_inner}, 0);
            layer.ssm_conv1d_b = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "bias", i), {d_inner}, 0);
            layer.ssm_x = create_tensor(tn(LLM_TENSOR_SSM_X, "weight", i), {d_inner, dt_rank + 2*d_state}, 0);
            layer.ssm_dt = create_tensor(tn(LLM_TENSOR_SSM_DT, "weight", i), {dt_rank, d_inner}, 0);
            layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {d_inner}, 0);
            // no "weight" suffix for these
            layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {d_state, d_inner}, 0);
            layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {d_inner}, 0);
            // out_proj
            layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {d_inner, n_embd}, 0);
        }
    } break;
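// Mamba-2: A and D are per-head scalars ({1, n_head}) and the SSM output norm is applied per group ({d_inner / n_group, n_group}).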
case LLM_ARCH_MAMBA2:
    {
        const int64_t d_conv = hparams.ssm_d_conv;
        const int64_t d_inner = hparams.ssm_d_inner;
        const int64_t d_state = hparams.ssm_d_state;
        const int64_t n_head = hparams.ssm_dt_rank;
        const int64_t n_group = hparams.ssm_n_group;
        const int64_t d_in_proj = 2*d_inner + 2*n_group*d_state + n_head;
        // only an expansion factor of 2 is supported for now
        GGML_ASSERT(2 * n_embd == d_inner);
        tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
        // output
        {
            output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
            output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
            // if output is NULL, init from the input tok embed, duplicated to allow offloading
            if (output == NULL) {
                output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
            }
        }
        for (int i = 0; i < n_layer; ++i) {
            auto & layer = layers[i];
            // norm
            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
            layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), {n_embd, d_in_proj}, 0);
            layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {d_conv, d_inner + 2*n_group*d_state}, 0);
            layer.ssm_conv1d_b = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "bias", i), {d_inner + 2*n_group*d_state}, 0);
            layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {n_head}, 0);
            // no "weight" suffix for these
            layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {1, n_head}, 0);
            layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {1, n_head}, 0);
            layer.ssm_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", i), {d_inner / n_group, n_group}, 0);
            // out_proj
            layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {d_inner, n_embd}, 0);
        }
    } break;
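// Jamba interleaves Mamba and attention layers: n_head_kv(i) == 0 marks an SSM layer, and each layer's FFN is MoE only when a router (ffn_gate_inp) is present.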
  3346. case LLM_ARCH_JAMBA:
  3347. {
  3348. const int64_t d_conv = hparams.ssm_d_conv;
  3349. const int64_t d_inner = hparams.ssm_d_inner;
  3350. const int64_t d_state = hparams.ssm_d_state;
  3351. const int64_t dt_rank = hparams.ssm_dt_rank;
  3352. // only an expansion factor of 2 is supported for now
  3353. GGML_ASSERT(2 * n_embd == d_inner);
  3354. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  3355. // output
  3356. {
  3357. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  3358. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  3359. // if output is NULL, init from the input tok embed, duplicated to allow offloading
  3360. if (output == NULL) {
  3361. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  3362. }
  3363. }
  3364. for (int i = 0; i < n_layer; ++i) {
  3365. const int64_t n_head_kv = hparams.n_head_kv(i);
  3366. const int64_t n_embd_gqa = hparams.n_embd_v_gqa(i);
  3367. auto & layer = layers[i];
  3368. // norm
  3369. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  3370. if (n_head_kv == 0) {
  3371. // Mamba layer
  3372. layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), {n_embd, 2*d_inner}, 0);
  3373. layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {d_conv, d_inner}, 0);
  3374. layer.ssm_conv1d_b = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "bias", i), {d_inner}, 0);
  3375. layer.ssm_x = create_tensor(tn(LLM_TENSOR_SSM_X, "weight", i), {d_inner, dt_rank + 2*d_state}, 0);
  3376. layer.ssm_dt_norm = create_tensor(tn(LLM_TENSOR_SSM_DT_NORM, "weight", i), {dt_rank}, 0);
  3377. layer.ssm_dt = create_tensor(tn(LLM_TENSOR_SSM_DT, "weight", i), {dt_rank, d_inner}, 0);
  3378. layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {d_inner}, 0);
  3379. layer.ssm_b_norm = create_tensor(tn(LLM_TENSOR_SSM_B_NORM, "weight", i), {d_state}, 0);
  3380. layer.ssm_c_norm = create_tensor(tn(LLM_TENSOR_SSM_C_NORM, "weight", i), {d_state}, 0);
  3381. // no "weight" suffix for these
  3382. layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {d_state, d_inner}, 0);
  3383. layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {d_inner}, 0);
  3384. // out_proj
  3385. layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {d_inner, n_embd}, 0);
  3386. } else {
  3387. // Attention layers
  3388. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  3389. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  3390. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  3391. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  3392. }
  3393. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  3394. layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, TENSOR_NOT_REQUIRED);
  3395. if (layer.ffn_gate_inp) {
  3396. // MoE
  3397. layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
  3398. layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff, n_embd, n_expert}, 0);
  3399. layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
  3400. } else {
  3401. // FFN (no MoE)
  3402. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  3403. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
  3404. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  3405. }
  3406. }
  3407. } break;
  3408. case LLM_ARCH_GRANITE_HYBRID:
  3409. {
  3410. // mamba2 Mixer SSM params
  3411. // NOTE: int64_t for tensor dimensions
  3412. const int64_t d_conv = hparams.ssm_d_conv;
  3413. const int64_t d_inner = hparams.ssm_d_inner;
  3414. const int64_t d_state = hparams.ssm_d_state;
  3415. const int64_t n_ssm_head = hparams.ssm_dt_rank;
  3416. const int64_t n_group = hparams.ssm_n_group;
  3417. const int64_t d_in_proj = 2*d_inner + 2*n_group*d_state + n_ssm_head;
  3418. // only an expansion factor of 2 is supported for now
  3419. GGML_ASSERT(2 * n_embd == d_inner);
  3420. // embeddings
  3421. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  3422. // output
  3423. {
  3424. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  3425. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  3426. // if output is NULL, init from the input tok embed, duplicated to allow offloading
  3427. if (output == NULL) {
  3428. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  3429. }
  3430. }
  3431. for (int i = 0; i < n_layer; ++i) {
  3432. auto & layer = layers[i];
  3433. // norm
  3434. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  3435. if (hparams.is_recurrent(i)) {
  3436. // ssm layers
  3437. layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), {n_embd, d_in_proj}, 0);
  3438. layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {d_conv, d_inner + 2*n_group*d_state}, 0);
  3439. layer.ssm_conv1d_b = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "bias", i), {d_inner + 2*n_group*d_state}, TENSOR_NOT_REQUIRED);
  3440. layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {n_ssm_head}, 0);
  3441. // no "weight" suffix for these
  3442. layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {1, n_ssm_head}, 0);
  3443. layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {1, n_ssm_head}, 0);
  3444. layer.ssm_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", i), {d_inner / n_group, n_group}, 0);
  3445. // out_proj
  3446. layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {d_inner, n_embd}, 0);
  3447. } else {
  3448. // attention layers (with optional bias)
  3449. const int64_t n_head_i = hparams.n_head(i);
  3450. const int64_t n_embd_k_gqa_i = hparams.n_embd_k_gqa(i);
  3451. const int64_t n_embd_v_gqa_i = hparams.n_embd_v_gqa(i);
  3452. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head_i}, 0);
  3453. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa_i}, 0);
  3454. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa_i}, 0);
  3455. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head_i, n_embd}, 0);
  3456. layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  3457. layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_k_gqa_i}, TENSOR_NOT_REQUIRED);
  3458. layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_v_gqa_i}, TENSOR_NOT_REQUIRED);
  3459. layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  3460. }
  3461. // feed forward (w/ optional biases)
  3462. if (n_expert > 0) {
  3463. // MoE FFN
  3464. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  3465. layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
  3466. layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
  3467. layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, TENSOR_NOT_REQUIRED);
  3468. layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert}, 0);
  3469. layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
  3470. // For Granite MoE Shared
  3471. if (hparams.n_ff_shexp > 0) {
  3472. layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, 0);
  3473. layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, 0);
  3474. layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {hparams.n_ff_shexp, n_embd}, 0);
  3475. }
  3476. } else {
  3477. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  3478. layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
  3479. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  3480. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  3481. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  3482. layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
  3483. layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  3484. layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
  3485. }
  3486. }
  3487. } break;
  3488. case LLM_ARCH_XVERSE:
  3489. {
  3490. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  3491. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  3492. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  3493. for (int i = 0; i < n_layer; ++i) {
  3494. auto & layer = layers[i];
  3495. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  3496. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  3497. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  3498. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  3499. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  3500. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  3501. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  3502. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  3503. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  3504. }
  3505. } break;
  3506. case LLM_ARCH_COMMAND_R:
  3507. {
  3508. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  3509. // output
  3510. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  3511. // init output from the input tok embed
  3512. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  3513. for (int i = 0; i < n_layer; ++i) {
  3514. auto & layer = layers[i];
  3515. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  3516. if (n_layer >= 64){
  3517. layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k, n_head}, 0);
  3518. layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k, n_head_kv}, 0);
  3519. }
  3520. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  3521. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  3522. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  3523. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  3524. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  3525. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  3526. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  3527. }
  3528. } break;
  3529. case LLM_ARCH_COHERE2:
  3530. {
  3531. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
  3532. // output
  3533. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
  3534. // init output from the input tok embed
output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, TENSOR_DUPLICATED);
  3537. for (int i = 0; i < n_layer; ++i) {
  3538. auto & layer = layers[i];
  3539. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);
  3540. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd }, 0);
  3541. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_gqa }, 0);
  3542. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_gqa }, 0);
  3543. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd, n_embd }, 0);
  3544. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), { n_embd, n_ff }, 0);
  3545. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd }, 0);
  3546. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, n_ff }, 0);
  3547. }
  3548. }
  3549. break;
  3550. case LLM_ARCH_OLMO: // adapted from LLM_ARCH_LLAMA with norm params removed
  3551. {
  3552. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  3553. // output
  3554. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  3555. // if output is NULL, init from the input tok embed
  3556. if (output == NULL) {
  3557. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  3558. }
  3559. for (int i = 0; i < n_layer; ++i) {
  3560. auto & layer = layers[i];
  3561. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  3562. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  3563. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  3564. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  3565. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  3566. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  3567. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  3568. }
  3569. } break;
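// OLMo-2 normalizes after attention and FFN (attn_post_norm / ffn_post_norm) instead of before, and adds Q/K norms; no pre-attention or pre-FFN norm tensors are loaded.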
  3570. case LLM_ARCH_OLMO2:
  3571. {
  3572. const int64_t n_embd_head = n_embd / n_head;
  3573. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  3574. // output
  3575. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  3576. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  3577. for (int i = 0; i < n_layer; ++i) {
  3578. auto & layer = layers[i];
  3579. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  3580. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  3581. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  3582. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
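// Q/K RMSNorm: the Q norm spans the full n_embd projection, the K norm spans the GQA width (n_head_kv * n_embd_head)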
  3583. layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, 0);
  3584. layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_head_kv * n_embd_head}, 0);
  3585. layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
  3586. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  3587. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  3588. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  3589. layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
  3590. }
  3591. } break;
  3592. case LLM_ARCH_SEED_OSS:
  3593. {
  3594. const uint32_t head_dim = hparams.n_embd_head_k;
  3595. const int64_t n_qo_dim = n_head * head_dim;
  3596. const int64_t n_kv_dim = n_head_kv * head_dim;
  3597. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  3598. // output
  3599. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  3600. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  3601. // if output is NULL, init from the input tok embed
  3602. if (output == NULL) {
  3603. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  3604. }
  3605. for (int i = 0; i < n_layer; ++i) {
  3606. auto & layer = layers[i];
  3607. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_qo_dim}, 0);
  3608. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_kv_dim}, 0);
  3609. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_kv_dim}, 0);
  3610. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_qo_dim, n_embd}, 0);
  3611. layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_qo_dim}, TENSOR_NOT_REQUIRED);
  3612. layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_kv_dim}, TENSOR_NOT_REQUIRED);
  3613. layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_kv_dim}, TENSOR_NOT_REQUIRED);
  3614. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  3615. layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
  3616. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  3617. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  3618. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  3619. }
  3620. } break;
  3621. case LLM_ARCH_OLMOE:
  3622. {
  3623. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  3624. // output
  3625. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  3626. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  3627. for (int i = 0; i < n_layer; ++i) {
  3628. auto & layer = layers[i];
  3629. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  3630. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  3631. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  3632. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  3633. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  3634. layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, 0);
  3635. layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, 0);
  3636. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  3637. layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
  3638. if (n_expert == 0) {
  3639. throw std::runtime_error("n_expert must be > 0");
  3640. }
  3641. if (n_expert_used == 0) {
  3642. throw std::runtime_error("n_expert_used must be > 0");
  3643. }
  3644. // MoE branch
  3645. layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
  3646. layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff, n_embd, n_expert}, 0);
  3647. layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
  3648. }
  3649. } break;
  3650. case LLM_ARCH_OPENELM:
  3651. {
  3652. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  3653. // output
  3654. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  3655. // init output from the input tok embed
  3656. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  3657. for (int i = 0; i < n_layer; ++i) {
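// OpenELM uses per-layer head counts and FFN sizes; the fused QKV width below covers n_head Q heads plus n_head_kv K and V head groups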
  3658. const int64_t n_head = hparams.n_head(i);
  3659. const int64_t n_head_qkv = 2*hparams.n_head_kv(i) + n_head;
  3660. const int64_t n_ff = hparams.n_ff(i);
  3661. auto & layer = layers[i];
  3662. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  3663. layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_head_qkv*n_embd_head_k}, 0);
  3664. layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
  3665. layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
  3666. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_head*n_embd_head_k, n_embd}, 0);
  3667. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  3668. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  3669. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
  3670. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  3671. }
  3672. } break;
  3673. case LLM_ARCH_GPTNEOX:
  3674. {
  3675. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  3676. // output
  3677. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  3678. output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
  3679. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  3680. for (int i = 0; i < n_layer; ++i) {
  3681. auto & layer = layers[i];
  3682. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  3683. layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
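// fused QKV projection: n_embd query columns plus K and V at the GQA width, hence n_embd + 2*n_embd_gqa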
  3684. layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
  3685. layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
  3686. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  3687. layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
  3688. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  3689. layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
  3690. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
  3691. layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
  3692. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  3693. layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
  3694. }
  3695. } break;
  3696. case LLM_ARCH_ARCTIC:
  3697. {
  3698. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  3699. // output
  3700. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  3701. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  3702. // if output is NULL, init from the input tok embed
  3703. if (output == NULL) {
  3704. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  3705. }
  3706. for (int i = 0; i < n_layer; ++i) {
  3707. auto & layer = layers[i];
  3708. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  3709. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  3710. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  3711. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  3712. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  3713. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
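// the per-layer dense FFN keeps the hidden size at n_embd; only the expert FFNs below use n_ff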
  3714. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_embd}, 0);
  3715. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_embd, n_embd}, 0);
  3716. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_embd}, 0);
  3717. layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
  3718. layer.ffn_norm_exps = create_tensor(tn(LLM_TENSOR_FFN_NORM_EXPS, "weight", i), {n_embd}, 0);
  3719. layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, false);
  3720. layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert}, 0);
  3721. layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
  3722. }
  3723. } break;
  3724. case LLM_ARCH_DEEPSEEK:
  3725. {
  3726. const int64_t n_ff_exp = hparams.n_ff_exp;
  3727. const int64_t n_expert_shared = hparams.n_expert_shared;
  3728. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  3729. // output
  3730. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  3731. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  3732. for (int i = 0; i < n_layer; ++i) {
  3733. auto & layer = layers[i];
  3734. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  3735. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  3736. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  3737. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  3738. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  3739. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
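// the first n_layer_dense_lead layers use a dense FFN; the remaining layers are MoE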
  3740. if (i < (int) hparams.n_layer_dense_lead) {
  3741. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  3742. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  3743. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  3744. } else {
  3745. layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
  3746. if (n_expert == 0) {
  3747. throw std::runtime_error("n_expert must be > 0");
  3748. }
  3749. if (n_expert_used == 0) {
  3750. throw std::runtime_error("n_expert_used must be > 0");
  3751. }
  3752. // MoE branch
  3753. layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
  3754. layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
  3755. layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
  3756. // Shared expert branch
  3757. layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
  3758. layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_exp * n_expert_shared, n_embd}, 0);
  3759. layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
  3760. }
  3761. }
  3762. } break;
case LLM_ARCH_DEEPSEEK2:
    {
        // the lite variant (identified by its 27 layers) has no Q LoRA pair: it uses a single wq instead of wq_a/wq_b
        const bool is_lite = (hparams.n_layer == 27);
        const bool is_mla = (hparams.n_embd_head_k_mla != 0 && hparams.n_embd_head_v_mla != 0);
        // note: these are the actual head sizes you get when treating as MHA or after "decompression" using wv_b for MLA
        const int64_t n_embd_head_k_mla = is_mla ? hparams.n_embd_head_k_mla : hparams.n_embd_head_k;
        const int64_t n_embd_head_v_mla = is_mla ? hparams.n_embd_head_v_mla : hparams.n_embd_head_v;
        const int64_t n_embd_head_qk_rope = hparams.n_rot;
        const int64_t n_embd_head_qk_nope = n_embd_head_k_mla - n_embd_head_qk_rope;
        const int64_t q_lora_rank = hparams.n_lora_q;
        const int64_t kv_lora_rank = hparams.n_lora_kv;
        const int64_t n_ff_exp = hparams.n_ff_exp;
        const int64_t n_expert_shared = hparams.n_expert_shared;
        tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
        // output
        output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
        output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
        for (int i = 0; i < n_layer; ++i) {
            auto & layer = layers[i];
            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
            if (!is_lite) {
                layer.attn_q_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_A_NORM, "weight", i), {q_lora_rank}, 0);
            }
            layer.attn_kv_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_NORM, "weight", i), {kv_lora_rank}, 0);
            if (!is_lite) {
                layer.wq_a = create_tensor(tn(LLM_TENSOR_ATTN_Q_A, "weight", i), {n_embd, q_lora_rank}, 0);
                layer.wq_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_B, "weight", i), {q_lora_rank, n_head * n_embd_head_k_mla}, 0);
            } else {
                layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_head * n_embd_head_k_mla}, 0);
            }
            layer.wkv_a_mqa = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + n_embd_head_qk_rope}, 0);
            // note: only old, legacy GGUF files store the unsplit wkv_b tensor; newer conversions provide the split wk_b/wv_b
            if (is_mla) {
                layer.wk_b = create_tensor(tn(LLM_TENSOR_ATTN_K_B, "weight", i), {n_embd_head_qk_nope, kv_lora_rank, n_head}, 0);
                layer.wv_b = create_tensor(tn(LLM_TENSOR_ATTN_V_B, "weight", i), {kv_lora_rank, n_embd_head_v_mla, n_head}, 0);
            } else {
                layer.wkv_b = create_tensor(tn(LLM_TENSOR_ATTN_KV_B, "weight", i), {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v_mla)}, 0);
            }
            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_head * n_embd_head_v_mla, n_embd}, 0);
            layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
            if (i < (int) hparams.n_layer_dense_lead) {
                layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
                layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
                layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
            } else {
                layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
                layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, TENSOR_NOT_REQUIRED);
                if (n_expert == 0) {
                    throw std::runtime_error("n_expert must be > 0");
                }
                if (n_expert_used == 0) {
                    throw std::runtime_error("n_expert_used must be > 0");
                }
                // MoE branch
                layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
                layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
                layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
                // Shared expert branch
                layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
                layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_exp * n_expert_shared, n_embd}, 0);
                layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
            }
        }
    } break;
  3827. case LLM_ARCH_PLM:
  3828. {
  3829. const int64_t n_embd_head_qk_rope = hparams.n_rot;
  3830. const int64_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
  3831. const int64_t kv_lora_rank = hparams.n_lora_kv;
  3832. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  3833. // output
  3834. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  3835. // output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  3836. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  3837. for (int i = 0; i < n_layer; ++i) {
  3838. auto & layer = layers[i];
  3839. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  3840. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
  3841. layer.wkv_a_mqa = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + (n_embd_head_qk_rope)}, 0);
  3842. layer.attn_kv_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_NORM, "weight", i), {kv_lora_rank}, 0);
  3843. layer.wkv_b = create_tensor(tn(LLM_TENSOR_ATTN_KV_B, "weight", i), {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)}, 0);
  3844. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_head * ( n_embd_head_v), n_embd}, 0);
  3845. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  3846. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  3847. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  3848. }
  3849. } break;
  3850. case LLM_ARCH_BITNET:
  3851. {
  3852. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  3853. // output
  3854. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  3855. for (int i = 0; i < n_layer; ++i) {
  3856. auto & layer = layers[i];
  3857. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  3858. layer.attn_sub_norm = create_tensor(tn(LLM_TENSOR_ATTN_SUB_NORM, "weight", i), {n_embd}, 0);
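// BitNet stores an optional one-element scale tensor next to each quantized projection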
  3859. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  3860. layer.wq_scale = create_tensor(tn(LLM_TENSOR_ATTN_Q, "scale", i), {1}, TENSOR_NOT_REQUIRED);
  3861. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  3862. layer.wk_scale = create_tensor(tn(LLM_TENSOR_ATTN_K, "scale", i), {1}, TENSOR_NOT_REQUIRED);
  3863. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  3864. layer.wv_scale = create_tensor(tn(LLM_TENSOR_ATTN_V, "scale", i), {1}, TENSOR_NOT_REQUIRED);
  3865. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  3866. layer.wo_scale = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "scale", i), {1}, TENSOR_NOT_REQUIRED);
  3867. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  3868. layer.ffn_sub_norm = create_tensor(tn(LLM_TENSOR_FFN_SUB_NORM, "weight", i), {n_ff}, 0);
  3869. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  3870. layer.ffn_gate_scale = create_tensor(tn(LLM_TENSOR_FFN_GATE, "scale", i), {1}, TENSOR_NOT_REQUIRED);
  3871. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
  3872. layer.ffn_down_scale = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "scale", i), {1}, TENSOR_NOT_REQUIRED);
  3873. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  3874. layer.ffn_up_scale = create_tensor(tn(LLM_TENSOR_FFN_UP, "scale", i), {1}, TENSOR_NOT_REQUIRED);
  3875. }
  3876. } break;
  3877. case LLM_ARCH_T5:
  3878. {
  3879. const auto n_rel_attn_bkts = hparams.n_rel_attn_bkts;
  3880. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  3881. // output
  3882. output_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_OUTPUT_NORM, "weight"), {n_embd}, 0);
  3883. output_norm = create_tensor(tn(LLM_TENSOR_DEC_OUTPUT_NORM, "weight"), {n_embd}, 0);
  3884. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  3885. // if output is NULL, init from the input tok embed
  3886. if (output == NULL) {
  3887. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  3888. }
  3889. // n_layer: number of encoder_layers
  3890. // dec_n_layer: number of decoder_layers
  3891. const int dec_n_layer = hparams.dec_n_layer;
  3892. if (dec_n_layer > n_layer) {
  3893. layers.resize(dec_n_layer);
  3894. }
  3895. // load encoder layers
  3896. for (int i = 0; i < n_layer; ++i) {
  3897. auto & layer = layers[i];
  3898. layer.attn_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_NORM, "weight", i), {n_embd}, 0);
  3899. layer.attn_rel_b_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_REL_B, "weight", i), {n_head, n_rel_attn_bkts}, TENSOR_NOT_REQUIRED);
  3900. layer.wq_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_Q, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  3901. layer.wk_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  3902. layer.wv_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
  3903. layer.wo_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_OUT, "weight", i), {n_embd_v_gqa, n_embd}, 0);
  3904. layer.ffn_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_NORM, "weight", i), {n_embd}, 0);
  3905. layer.ffn_gate_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_GATE, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED);
  3906. layer.ffn_down_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  3907. layer.ffn_up_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  3908. }
  3909. // load decoder layers
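// note: decoder tensors reuse the same layers[] entries, resized above when dec_n_layer > n_layer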
  3910. for (int i = 0; i < dec_n_layer; ++i) {
  3911. auto & layer = layers[i];
  3912. layer.attn_norm = create_tensor(tn(LLM_TENSOR_DEC_ATTN_NORM, "weight", i), {n_embd}, 0);
  3913. layer.attn_rel_b = create_tensor(tn(LLM_TENSOR_DEC_ATTN_REL_B, "weight", i), {n_head, n_rel_attn_bkts}, TENSOR_NOT_REQUIRED);
  3914. layer.wq = create_tensor(tn(LLM_TENSOR_DEC_ATTN_Q, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  3915. layer.wk = create_tensor(tn(LLM_TENSOR_DEC_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  3916. layer.wv = create_tensor(tn(LLM_TENSOR_DEC_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
  3917. layer.wo = create_tensor(tn(LLM_TENSOR_DEC_ATTN_OUT, "weight", i), {n_embd_v_gqa, n_embd}, 0);
  3918. layer.attn_norm_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_NORM, "weight", i), {n_embd}, 0);
  3919. // this tensor seems to be unused in HF transformers implementation
  3920. layer.attn_rel_b_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_REL_B, "weight", i), {n_head, n_rel_attn_bkts}, TENSOR_NOT_REQUIRED);
  3921. layer.wq_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_Q, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  3922. layer.wk_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  3923. layer.wv_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
  3924. layer.wo_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_OUT, "weight", i), {n_embd_v_gqa, n_embd}, 0);
  3925. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_DEC_FFN_NORM, "weight", i), {n_embd}, 0);
  3926. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_DEC_FFN_GATE, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED);
  3927. layer.ffn_down = create_tensor(tn(LLM_TENSOR_DEC_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  3928. layer.ffn_up = create_tensor(tn(LLM_TENSOR_DEC_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  3929. }
  3930. } break;
  3931. case LLM_ARCH_T5ENCODER:
  3932. {
  3933. const auto n_rel_attn_bkts = hparams.n_rel_attn_bkts;
  3934. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  3935. // output
  3936. output_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_OUTPUT_NORM, "weight"), {n_embd}, 0);
  3937. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  3938. // if output is NULL, init from the input tok embed
  3939. if (output == NULL) {
  3940. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  3941. }
  3942. for (int i = 0; i < n_layer; ++i) {
  3943. auto & layer = layers[i];
  3944. layer.attn_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_NORM, "weight", i), {n_embd}, 0);
  3945. layer.attn_rel_b_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_REL_B, "weight", i), {n_head, n_rel_attn_bkts}, TENSOR_NOT_REQUIRED);
  3946. layer.wq_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_Q, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  3947. layer.wk_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  3948. layer.wv_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
  3949. layer.wo_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_OUT, "weight", i), {n_embd_v_gqa, n_embd}, 0);
  3950. layer.ffn_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_NORM, "weight", i), {n_embd}, 0);
  3951. layer.ffn_gate_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_GATE, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED);
  3952. layer.ffn_down_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  3953. layer.ffn_up_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  3954. }
  3955. } break;
  3956. case LLM_ARCH_JAIS:
  3957. {
  3958. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  3959. // output
  3960. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  3961. output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
  3962. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  3963. for (int i = 0; i < n_layer; ++i) {
  3964. auto & layer = layers[i];
  3965. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  3966. layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
  3967. layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
  3968. layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
  3969. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  3970. layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
  3971. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  3972. layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
  3973. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
  3974. layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
  3975. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  3976. layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, 0);
  3977. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  3978. layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
  3979. }
  3980. } break;
  3981. case LLM_ARCH_CHATGLM:
  3982. {
  3983. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  3984. // output
  3985. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  3986. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  3987. // if output is NULL, init from the input tok embed
  3988. if (output == NULL) {
  3989. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  3990. }
  3991. for (int i = 0; i < n_layer; ++i) {
  3992. auto & layer = layers[i];
  3993. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
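// prefer the fused QKV tensor; fall back to split Q/K/V (with optional biases) when it is absent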
  3994. layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
  3995. layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
  3996. if (layer.wqkv == nullptr) {
  3997. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
  3998. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  3999. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
  4000. layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  4001. layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
  4002. layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
  4003. }
  4004. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  4005. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
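// no separate gate tensor: ffn_up packs the gate and up projections, hence n_ff * 2 columns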
  4006. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff * 2}, 0);
  4007. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
  4008. }
  4009. } break;
  4010. case LLM_ARCH_GLM4:
  4011. {
  4012. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  4013. // output
  4014. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  4015. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  4016. // if output is NULL, init from the input tok embed
  4017. if (output == NULL) {
  4018. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  4019. }
  4020. for (int i = 0; i < n_layer; ++i) {
  4021. auto & layer = layers[i];
  4022. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  4023. layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
  4024. layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
  4025. if (layer.wqkv == nullptr) {
  4026. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
  4027. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  4028. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
  4029. layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  4030. layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
  4031. layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
  4032. }
  4033. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  4034. layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
  4035. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  4036. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  4037. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff * 2}, 0);
  4038. layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
  4039. }
  4040. } break;
case LLM_ARCH_GLM4_MOE:
    {
        const int64_t n_expert = hparams.n_expert;
        const int64_t n_expert_used = hparams.n_expert_used;
        const int64_t n_expert_shared = hparams.n_expert_shared;
        GGML_ASSERT(hparams.n_expert > 0 && "n_expert must be > 0 for GLM4_MOE MoE layers");
        GGML_ASSERT(hparams.n_expert_used > 0 && "n_expert_used must be > 0 for GLM4_MOE MoE layers");
        tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
        // output
        output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
        output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED);
        // if output is NULL, init from the input tok embed
        if (output == NULL) {
            output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, TENSOR_DUPLICATED);
        }
        // Load ALL tensors including NextN layer to satisfy total tensor count
        // but only PROCESS up to last layer (skipping final NextN layer) in forward pass
        for (int i = 0; i < n_layer; ++i) {
            int flags = 0;
            if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
                // skip all tensors in the NextN layers
                flags |= TENSOR_SKIP;
            }
            auto & layer = layers[i];
            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, flags);
            // GLM-style attention with bias terms
            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd_head_k * n_head }, flags);
            layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_k_gqa }, flags);
            layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_v_gqa }, flags);
            layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), { n_embd_head_k * n_head }, flags);
            layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), { n_embd_k_gqa }, flags);
            layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), { n_embd_v_gqa }, flags);
            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, flags);
            // K/Q norm tensors (optional for GLM-4.5 355B variant)
            layer.attn_q_norm = create_tensor(
                tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), { n_embd_head_k }, TENSOR_NOT_REQUIRED | flags);
            layer.attn_k_norm = create_tensor(
                tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), { n_embd_head_k }, TENSOR_NOT_REQUIRED | flags);
            layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), { n_embd }, flags);
            // Check if this layer uses MoE or dense FFN based on n_layer_dense_lead
            // GLM 4.5 uses hybrid architecture: layer 0 is dense, layers 1+ are MoE
            const bool use_moe = (static_cast<uint32_t>(i) >= hparams.n_layer_dense_lead);
            if (use_moe) {
                // MoE layers
                layer.ffn_gate_inp =
                    create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), { n_embd, n_expert }, flags);
                layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), { n_expert }, flags);
                // MoE branch
                const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
                layer.ffn_gate_exps = create_tensor(
                    tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, flags);
                layer.ffn_down_exps = create_tensor(
                    tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff_exp, n_embd, n_expert }, flags);
                layer.ffn_up_exps = create_tensor(
                    tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, flags);
                // Shared expert
                if (n_expert_shared > 0) {
                    const int64_t n_ff_shexp = n_ff_exp * n_expert_shared;
                    layer.ffn_gate_shexp = create_tensor(
                        tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), { n_embd, n_ff_shexp }, flags);
                    layer.ffn_down_shexp = create_tensor(
                        tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_shexp, n_embd }, flags);
                    layer.ffn_up_shexp = create_tensor(
                        tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), { n_embd, n_ff_shexp }, flags);
                }
            } else {
                // Dense layers (first k layers) - GLM uses separate gate/up projections
                layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), { n_embd, n_ff }, flags);
                layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd }, flags);
                layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, n_ff }, flags);
            }
            // NextN/MTP tensors (preserved but unused) - conditionally load for last nextn_predict_layers
            if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
                layer.nextn.eh_proj = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "weight", i), { 2 * n_embd, n_embd }, flags);
                layer.nextn.embed_tokens = create_tensor(tn(LLM_TENSOR_NEXTN_EMBED_TOKENS, "weight", i), { n_embd, n_vocab }, flags);
                layer.nextn.enorm = create_tensor(tn(LLM_TENSOR_NEXTN_ENORM, "weight", i), { n_embd }, flags);
                layer.nextn.hnorm = create_tensor(tn(LLM_TENSOR_NEXTN_HNORM, "weight", i), { n_embd }, flags);
                layer.nextn.shared_head_head = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "weight", i), { n_embd, n_vocab }, flags);
                layer.nextn.shared_head_norm = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "weight", i), { n_embd }, flags);
            }
        }
    } break;
  4124. case LLM_ARCH_NEMOTRON:
  4125. {
  4126. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  4127. // output
  4128. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  4129. output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
  4130. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  4131. for (int i = 0; i < n_layer; ++i) {
  4132. auto & layer = layers[i];
  4133. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  4134. layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
  4135. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  4136. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  4137. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  4138. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  4139. // optional bias tensors
  4140. layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  4141. layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
  4142. layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
  4143. layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  4144. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  4145. layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
  4146. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  4147. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  4148. // optional MLP bias
  4149. layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  4150. layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
  4151. }
  4152. } break;
  4153. case LLM_ARCH_NEMOTRON_H:
  4154. {
  4155. // mamba2 Mixer SSM params
  4156. // NOTE: int64_t for tensor dimensions
  4157. const int64_t d_conv = hparams.ssm_d_conv;
  4158. const int64_t d_inner = hparams.ssm_d_inner;
  4159. const int64_t d_state = hparams.ssm_d_state;
  4160. const int64_t n_ssm_head = hparams.ssm_dt_rank;
  4161. const int64_t n_group = hparams.ssm_n_group;
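// width of the fused ssm_in projection: z and x streams (2*d_inner), B and C groups (2*n_group*d_state), plus per-head dt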
  4162. const int64_t d_in_proj = 2*d_inner + 2*n_group*d_state + n_ssm_head;
  4163. // embeddings
  4164. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  4165. // output
  4166. {
  4167. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  4168. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  4169. // if output is NULL, init from the input tok embed, duplicated to allow offloading
  4170. if (output == NULL) {
  4171. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  4172. }
  4173. }
  4174. for (int i = 0; i < n_layer; ++i) {
  4175. auto & layer = layers[i];
  4176. // all blocks use the attn norm
  4177. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  4178. if (hparams.is_recurrent(i)) {
  4179. // ssm layers
  4180. layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), {n_embd, d_in_proj}, 0);
  4181. layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {d_conv, d_inner + 2*n_group*d_state}, 0);
  4182. layer.ssm_conv1d_b = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "bias", i), {d_inner + 2*n_group*d_state}, TENSOR_NOT_REQUIRED);
  4183. layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {n_ssm_head}, 0);
  4184. // no "weight" suffix for these
  4185. layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {1, n_ssm_head}, 0);
  4186. layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {1, n_ssm_head}, 0);
  4187. layer.ssm_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", i), {d_inner / n_group, n_group}, 0);
  4188. // out_proj
  4189. layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {d_inner, n_embd}, 0);
  4190. } else if (hparams.n_ff(i) == 0) {
  4191. // attention layers (with optional bias)
  4192. const int64_t n_head_i = hparams.n_head(i);
  4193. const int64_t n_embd_k_gqa_i = hparams.n_embd_k_gqa(i);
  4194. const int64_t n_embd_v_gqa_i = hparams.n_embd_v_gqa(i);
  4195. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head_i}, 0);
  4196. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa_i}, 0);
  4197. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa_i}, 0);
  4198. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head_i, n_embd}, 0);
  4199. layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  4200. layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_k_gqa_i}, TENSOR_NOT_REQUIRED);
  4201. layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_v_gqa_i}, TENSOR_NOT_REQUIRED);
  4202. layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  4203. } else {
  4204. // mlp layers
  4205. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { hparams.n_ff(i), n_embd}, 0);
  4206. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, hparams.n_ff(i)}, 0);
  4207. layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  4208. layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {hparams.n_ff(i)}, TENSOR_NOT_REQUIRED);
  4209. }
  4210. }
  4211. } break;
  4212. case LLM_ARCH_EXAONE:
  4213. {
  4214. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  4215. // output
  4216. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  4217. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  4218. // if output is NULL, init from the input tok embed
  4219. if (output == NULL) {
  4220. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  4221. }
  4222. for (int i = 0; i < n_layer; ++i) {
  4223. auto & layer = layers[i];
  4224. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  4225. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
  4226. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  4227. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
  4228. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
  4229. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
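// rope_freqs is optional and stored once; layers after the first reference the same tensor (TENSOR_DUPLICATED)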
  4230. layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
  4231. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  4232. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  4233. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  4234. }
  4235. } break;
  4236. case LLM_ARCH_EXAONE4:
  4237. {
  4238. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  4239. // output
  4240. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  4241. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  4242. // if output is NULL, init from the input tok embed
  4243. if (output == NULL) {
  4244. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  4245. }
  4246. for (int i = 0; i < n_layer; ++i) {
  4247. auto & layer = layers[i];
  4248. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
  4249. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  4250. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
  4251. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  4252. layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
  4253. layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
  4254. layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
  4255. layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
  4256. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  4257. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  4258. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  4259. layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
  4260. }
  4261. } break;
  4262. case LLM_ARCH_RWKV6:
  4263. {
  4264. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  4265. // Block 0, LN0
  4266. tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0);
  4267. tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}, 0);
  4268. // output
  4269. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  4270. output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
  4271. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  4272. const int time_mix_extra_dim = hparams.time_mix_extra_dim;
  4273. const int time_decay_extra_dim = hparams.time_decay_extra_dim;
  4274. const int head_size = hparams.wkv_head_size;
  4275. const int attn_hidden_size = n_embd;
  4276. const int ffn_size = hparams.n_ff_arr[0];
  4277. for (int i = 0; i < n_layer; ++i) {
  4278. auto & layer = layers[i];
  4279. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  4280. layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
  4281. layer.attn_norm_2 = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, 0);
  4282. layer.attn_norm_2_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, 0);
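// the low-rank time-mix projections produce 5 outputs, one per lerp target (w, k, v, r, g)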
  4283. layer.time_mix_w1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W1, "weight", i), {n_embd, time_mix_extra_dim * 5}, 0);
  4284. layer.time_mix_w2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W2, "weight", i), {time_mix_extra_dim, n_embd, 5}, 0);
  4285. layer.time_mix_lerp_x = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_X, "weight", i), {n_embd, 1, 1}, 0);
  4286. layer.time_mix_lerp_w = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_W, "weight", i), {n_embd, 1, 1}, TENSOR_NOT_REQUIRED);
  4287. layer.time_mix_lerp_k = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_K, "weight", i), {n_embd, 1, 1}, TENSOR_NOT_REQUIRED);
  4288. layer.time_mix_lerp_v = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_V, "weight", i), {n_embd, 1, 1}, TENSOR_NOT_REQUIRED);
  4289. layer.time_mix_lerp_r = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_R, "weight", i), {n_embd, 1, 1}, TENSOR_NOT_REQUIRED);
  4290. layer.time_mix_lerp_g = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_G, "weight", i), {n_embd, 1, 1}, TENSOR_NOT_REQUIRED);
  4291. layer.time_mix_lerp_fused = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_FUSED, "weight", i), {n_embd, 1, 1, 5}, TENSOR_NOT_REQUIRED);
  4292. GGML_ASSERT(!(layer.time_mix_lerp_fused == NULL && layer.time_mix_lerp_w == NULL));
  4293. layer.time_mix_first = create_tensor(tn(LLM_TENSOR_TIME_MIX_FIRST, "weight", i), {head_size, n_embd / head_size}, 0);
  4294. layer.time_mix_decay = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY, "weight", i), {n_embd}, 0);
  4295. layer.time_mix_decay_w1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY_W1, "weight", i), {n_embd, time_decay_extra_dim}, 0);
  4296. layer.time_mix_decay_w2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY_W2, "weight", i), {time_decay_extra_dim, attn_hidden_size}, 0);
  4297. layer.time_mix_key = create_tensor(tn(LLM_TENSOR_TIME_MIX_KEY, "weight", i), {attn_hidden_size, n_embd}, 0);
  4298. layer.time_mix_value = create_tensor(tn(LLM_TENSOR_TIME_MIX_VALUE, "weight", i), {attn_hidden_size, n_embd}, 0);
  4299. layer.time_mix_receptance = create_tensor(tn(LLM_TENSOR_TIME_MIX_RECEPTANCE, "weight", i), {attn_hidden_size, n_embd}, 0);
  4300. layer.time_mix_gate = create_tensor(tn(LLM_TENSOR_TIME_MIX_GATE, "weight", i), {attn_hidden_size, n_embd}, 0);
  4301. layer.time_mix_ln = create_tensor(tn(LLM_TENSOR_TIME_MIX_LN, "weight", i), {n_embd}, 0);
  4302. layer.time_mix_ln_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_LN, "bias", i), {n_embd}, 0);
  4303. layer.time_mix_output = create_tensor(tn(LLM_TENSOR_TIME_MIX_OUTPUT, "weight", i), {n_embd, attn_hidden_size}, 0);
  4304. layer.channel_mix_lerp_k = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_LERP_K, "weight", i), {n_embd, 1, 1}, 0);
  4305. layer.channel_mix_lerp_r = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_LERP_R, "weight", i), {n_embd, 1, 1}, 0);
  4306. layer.channel_mix_key = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_KEY, "weight", i), {n_embd, ffn_size}, 0);
  4307. layer.channel_mix_value = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_VALUE, "weight", i), {ffn_size, n_embd}, 0);
  4308. layer.channel_mix_receptance = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_RECEPTANCE, "weight", i), {n_embd, n_embd}, 0);
  4309. }
  4310. } break;
  4311. case LLM_ARCH_RWKV6QWEN2:
  4312. {
  4313. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  4314. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  4315. output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, TENSOR_NOT_REQUIRED);
  4316. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  4317. const int time_mix_extra_dim = hparams.time_mix_extra_dim;
  4318. const int time_decay_extra_dim = hparams.time_decay_extra_dim;
  4319. const int head_size = hparams.wkv_head_size;
  4320. const int attn_hidden_size = n_embd;
  4321. const int n_head_kv = hparams.n_head_kv();
  4322. int attn_key_value_size;
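// K/V projections follow the GQA head count when it differs from the number of attention heads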
  4323. if (n_head_kv == 0 || attn_hidden_size / head_size == n_head_kv) {
  4324. attn_key_value_size = attn_hidden_size;
  4325. } else {
  4326. attn_key_value_size = n_head_kv * head_size;
  4327. }
  4328. for (int i = 0; i < n_layer; ++i) {
  4329. auto & layer = layers[i];
  4330. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  4331. layer.time_mix_w1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W1, "weight", i), {n_embd, time_mix_extra_dim * 5}, 0);
  4332. layer.time_mix_w2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W2, "weight", i), {time_mix_extra_dim, n_embd, 5}, 0);
  4333. layer.time_mix_lerp_x = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_X, "weight", i), {n_embd, 1, 1}, 0);
  4334. layer.time_mix_lerp_fused = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_FUSED, "weight", i), {n_embd, 1, 1, 5}, 0);
  4335. layer.time_mix_first = create_tensor(tn(LLM_TENSOR_TIME_MIX_FIRST, "weight", i), {head_size, n_embd / head_size}, TENSOR_NOT_REQUIRED);
  4336. layer.time_mix_decay = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY, "weight", i), {n_embd}, 0);
  4337. layer.time_mix_decay_w1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY_W1, "weight", i), {n_embd, time_decay_extra_dim}, 0);
  4338. layer.time_mix_decay_w2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY_W2, "weight", i), {time_decay_extra_dim, attn_hidden_size}, 0);
  4339. layer.time_mix_key = create_tensor(tn(LLM_TENSOR_TIME_MIX_KEY, "weight", i), {n_embd, attn_key_value_size}, 0);
  4340. layer.time_mix_value = create_tensor(tn(LLM_TENSOR_TIME_MIX_VALUE, "weight", i), {n_embd, attn_key_value_size}, 0);
  4341. layer.time_mix_receptance = create_tensor(tn(LLM_TENSOR_TIME_MIX_RECEPTANCE, "weight", i), {attn_hidden_size, n_embd}, 0);
  4342. layer.time_mix_gate = create_tensor(tn(LLM_TENSOR_TIME_MIX_GATE, "weight", i), {attn_hidden_size, n_embd}, 0);
  4343. // optional bias tensors
  4344. layer.time_mix_key_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_KEY, "bias", i), {attn_key_value_size}, TENSOR_NOT_REQUIRED);
  4345. layer.time_mix_value_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_VALUE, "bias", i), {attn_key_value_size}, TENSOR_NOT_REQUIRED);
  4346. layer.time_mix_receptance_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_RECEPTANCE, "bias", i), {attn_hidden_size}, TENSOR_NOT_REQUIRED);
  4347. layer.time_mix_output = create_tensor(tn(LLM_TENSOR_TIME_MIX_OUTPUT, "weight", i), {n_embd, attn_hidden_size}, 0);
  4348. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  4349. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  4350. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  4351. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  4352. }
  4353. } break;
  4354. case LLM_ARCH_RWKV7:
  4355. {
  4356. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  4357. // Block 0, LN0
  4358. tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0);
  4359. tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}, 0);
  4360. // output
  4361. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  4362. output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
  4363. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  4364. const int n_lora_decay = hparams.n_lora_decay;
  4365. const int n_lora_iclr = hparams.n_lora_iclr;
  4366. const int n_lora_value_res_mix = hparams.n_lora_value_res_mix;
  4367. const int n_lora_gate = hparams.n_lora_gate;
  4368. const int attn_hidden_size = n_embd;
  4369. const int ffn_size = hparams.n_ff_arr[0];
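// each RWKV7 block uses small low-rank projections: w* for the decay, a* for the in-context
// learning rate, v* for the value-residual mix (layers > 0) and g* for the output gate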
  4370. for (int i = 0; i < n_layer; ++i) {
  4371. auto & layer = layers[i];
  4372. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  4373. layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
  4374. layer.attn_norm_2 = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, 0);
  4375. layer.attn_norm_2_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, 0);
  4376. layer.time_mix_w0 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W0, "weight", i), {n_embd}, 0);
  4377. layer.time_mix_w1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W1, "weight", i), {n_embd, n_lora_decay}, 0);
  4378. layer.time_mix_w2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W2, "weight", i), {n_lora_decay, n_embd}, 0);
  4379. layer.time_mix_a0 = create_tensor(tn(LLM_TENSOR_TIME_MIX_A0, "weight", i), {n_embd}, 0);
  4380. layer.time_mix_a1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_A1, "weight", i), {n_embd, n_lora_iclr}, 0);
  4381. layer.time_mix_a2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_A2, "weight", i), {n_lora_iclr, n_embd}, 0);
  4382. if (i == 0) {
// layer 0 provides the value for the value-residual mix, so its v0/v1/v2 are loaded but not used
  4384. layer.time_mix_v0 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V0, "weight", i), {n_embd}, 0);
  4385. layer.time_mix_v1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V1, "weight", i), {n_embd, n_lora_iclr}, 0);
  4386. layer.time_mix_v2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V2, "weight", i), {n_lora_iclr, n_embd}, 0);
  4387. } else {
  4388. layer.time_mix_v0 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V0, "weight", i), {n_embd}, 0);
  4389. layer.time_mix_v1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V1, "weight", i), {n_embd, n_lora_value_res_mix}, 0);
  4390. layer.time_mix_v2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V2, "weight", i), {n_lora_value_res_mix, n_embd}, 0);
  4391. }
  4392. layer.time_mix_g1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_G1, "weight", i), {n_embd, n_lora_gate}, 0);
  4393. layer.time_mix_g2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_G2, "weight", i), {n_lora_gate, n_embd}, 0);
  4394. layer.time_mix_lerp_fused = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_FUSED, "weight", i), {n_embd, 1, 1, 6}, 0);
  4395. layer.time_mix_k_k = create_tensor(tn(LLM_TENSOR_TIME_MIX_K_K, "weight", i), {attn_hidden_size}, 0);
  4396. layer.time_mix_k_a = create_tensor(tn(LLM_TENSOR_TIME_MIX_K_A, "weight", i), {attn_hidden_size}, 0);
  4397. layer.time_mix_r_k = create_tensor(tn(LLM_TENSOR_TIME_MIX_R_K, "weight", i), {attn_hidden_size}, 0);
  4398. layer.time_mix_key = create_tensor(tn(LLM_TENSOR_TIME_MIX_KEY, "weight", i), {attn_hidden_size, n_embd}, 0);
  4399. layer.time_mix_value = create_tensor(tn(LLM_TENSOR_TIME_MIX_VALUE, "weight", i), {attn_hidden_size, n_embd}, 0);
  4400. layer.time_mix_receptance = create_tensor(tn(LLM_TENSOR_TIME_MIX_RECEPTANCE, "weight", i), {attn_hidden_size, n_embd}, 0);
  4401. layer.time_mix_ln = create_tensor(tn(LLM_TENSOR_TIME_MIX_LN, "weight", i), {n_embd}, 0);
  4402. layer.time_mix_ln_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_LN, "bias", i), {n_embd}, 0);
  4403. layer.time_mix_output = create_tensor(tn(LLM_TENSOR_TIME_MIX_OUTPUT, "weight", i), {n_embd, attn_hidden_size}, 0);
  4404. layer.channel_mix_lerp_k = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_LERP_K, "weight", i), {n_embd, 1, 1}, 0);
  4405. layer.channel_mix_key = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_KEY, "weight", i), {n_embd, ffn_size}, 0);
  4406. layer.channel_mix_value = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_VALUE, "weight", i), {ffn_size, n_embd}, 0);
  4407. }
  4408. } break;
  4409. case LLM_ARCH_ARWKV7:
  4410. {
  4411. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  4412. // output
  4413. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  4414. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  4415. const int n_lora_decay = hparams.n_lora_decay;
  4416. const int n_lora_iclr = hparams.n_lora_iclr;
  4417. const int n_lora_value_res_mix = hparams.n_lora_value_res_mix;
  4418. const int n_lora_gate = hparams.n_lora_gate;
  4419. const int attn_hidden_size = n_embd;
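// same time-mix layout as RWKV7 above, but with a standard gated FFN instead of channel-mix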
  4420. for (int i = 0; i < n_layer; ++i) {
  4421. auto & layer = layers[i];
  4422. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  4423. layer.time_mix_w0 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W0, "weight", i), {n_embd}, 0);
  4424. layer.time_mix_w1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W1, "weight", i), {n_embd, n_lora_decay}, 0);
  4425. layer.time_mix_w2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W2, "weight", i), {n_lora_decay, n_embd}, 0);
  4426. layer.time_mix_a0 = create_tensor(tn(LLM_TENSOR_TIME_MIX_A0, "weight", i), {n_embd}, 0);
  4427. layer.time_mix_a1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_A1, "weight", i), {n_embd, n_lora_iclr}, 0);
  4428. layer.time_mix_a2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_A2, "weight", i), {n_lora_iclr, n_embd}, 0);
  4429. if (i == 0) {
// layer 0 provides the value for the value-residual mix, so its v0/v1/v2 are loaded but not used
  4431. layer.time_mix_v0 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V0, "weight", i), {n_embd}, 0);
  4432. layer.time_mix_v1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V1, "weight", i), {n_embd, n_lora_iclr}, 0);
  4433. layer.time_mix_v2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V2, "weight", i), {n_lora_iclr, n_embd}, 0);
  4434. } else {
  4435. layer.time_mix_v0 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V0, "weight", i), {n_embd}, 0);
  4436. layer.time_mix_v1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V1, "weight", i), {n_embd, n_lora_value_res_mix}, 0);
  4437. layer.time_mix_v2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V2, "weight", i), {n_lora_value_res_mix, n_embd}, 0);
  4438. }
// ARWKV models may not have gate tensors
layer.time_mix_g1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_G1, "weight", i), {n_embd, n_lora_gate}, TENSOR_NOT_REQUIRED);
layer.time_mix_g2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_G2, "weight", i), {n_lora_gate, n_embd}, TENSOR_NOT_REQUIRED);
try {
layer.time_mix_lerp_fused = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_FUSED, "weight", i), {n_embd, 1, 1, 6}, 0);
} catch(std::runtime_error & e) {
// models without the gate store a 5-component fused lerp instead of 6
layer.time_mix_lerp_fused = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_FUSED, "weight", i), {n_embd, 1, 1, 5}, 0);
}
  4447. layer.time_mix_k_k = create_tensor(tn(LLM_TENSOR_TIME_MIX_K_K, "weight", i), {attn_hidden_size}, 0);
  4448. layer.time_mix_k_a = create_tensor(tn(LLM_TENSOR_TIME_MIX_K_A, "weight", i), {attn_hidden_size}, 0);
  4449. layer.time_mix_r_k = create_tensor(tn(LLM_TENSOR_TIME_MIX_R_K, "weight", i), {attn_hidden_size}, 0);
  4450. layer.time_mix_key = create_tensor(tn(LLM_TENSOR_TIME_MIX_KEY, "weight", i), {attn_hidden_size, n_embd}, 0);
  4451. layer.time_mix_value = create_tensor(tn(LLM_TENSOR_TIME_MIX_VALUE, "weight", i), {attn_hidden_size, n_embd}, 0);
  4452. layer.time_mix_receptance = create_tensor(tn(LLM_TENSOR_TIME_MIX_RECEPTANCE, "weight", i), {attn_hidden_size, n_embd}, 0);
  4453. layer.time_mix_ln = create_tensor(tn(LLM_TENSOR_TIME_MIX_LN, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
  4454. layer.time_mix_ln_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_LN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  4455. layer.time_mix_output = create_tensor(tn(LLM_TENSOR_TIME_MIX_OUTPUT, "weight", i), {n_embd, attn_hidden_size}, 0);
  4456. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  4457. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  4458. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  4459. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  4460. }
  4461. } break;
  4462. case LLM_ARCH_CHAMELEON:
  4463. {
  4464. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  4465. // output
  4466. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  4467. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  4468. // if output is NULL, init from the input tok embed
  4469. if (output == NULL) {
  4470. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  4471. }
  4472. for (int i = 0; i < n_layer; ++i) {
  4473. auto & layer = layers[i];
  4474. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  4475. layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k, n_head}, 0);
  4476. layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k, n_head_kv}, 0);
  4477. layer.attn_q_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {n_embd_head_k, n_head}, TENSOR_NOT_REQUIRED);
  4478. layer.attn_k_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {n_embd_head_k, n_head_kv}, TENSOR_NOT_REQUIRED);
  4479. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  4480. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  4481. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  4482. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  4483. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  4484. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  4485. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  4486. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  4487. }
  4488. } break;
  4489. case LLM_ARCH_WAVTOKENIZER_DEC:
  4490. {
  4491. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {hparams.n_embd_features, n_vocab}, 0);
  4492. conv1d = create_tensor(tn(LLM_TENSOR_CONV1D, "weight"), {7, hparams.n_embd_features, hparams.posnet.n_embd}, 0);
  4493. conv1d_b = create_tensor(tn(LLM_TENSOR_CONV1D, "bias"), {1, hparams.posnet.n_embd}, 0);
  4494. // posnet
  4495. {
  4496. const int64_t n_embd = hparams.posnet.n_embd;
  4497. for (uint32_t i = 0; i < hparams.posnet.n_layer; ++i) {
  4498. auto & layer = layers[i].posnet;
  4499. // posnet:
  4500. //
  4501. // - resnet
  4502. // - resnet
  4503. // - attn
  4504. // - resnet
  4505. // - resnet
  4506. // - norm
  4507. //
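// (cases 0, 1, 3 and 4 below are the resnet blocks, case 2 the attention block, case 5 the final norm)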
  4508. switch (i) {
  4509. case 0:
  4510. case 1:
  4511. case 3:
  4512. case 4:
  4513. {
  4514. layer.norm1 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "weight", i), {1, n_embd}, 0);
  4515. layer.norm1_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "bias", i), {1, n_embd}, 0);
  4516. layer.conv1 = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "weight", i), {3, n_embd, n_embd}, 0);
  4517. layer.conv1_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "bias", i), {1, n_embd}, 0);
  4518. layer.norm2 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "weight", i), {1, n_embd}, 0);
  4519. layer.norm2_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "bias", i), {1, n_embd}, 0);
  4520. layer.conv2 = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "weight", i), {3, n_embd, n_embd}, 0);
  4521. layer.conv2_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "bias", i), {1, n_embd}, 0);
  4522. } break;
  4523. case 2:
  4524. {
  4525. layer.attn_norm = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "weight", i), {1, n_embd}, 0);
  4526. layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "bias", i), {1, n_embd}, 0);
  4527. layer.attn_q = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_Q, "weight", i), {1, n_embd, n_embd}, 0);
  4528. layer.attn_q_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_Q, "bias", i), {1, n_embd}, 0);
  4529. layer.attn_k = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_K, "weight", i), {1, n_embd, n_embd}, 0);
  4530. layer.attn_k_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_K, "bias", i), {1, n_embd}, 0);
  4531. layer.attn_v = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_V, "weight", i), {1, n_embd, n_embd}, 0);
  4532. layer.attn_v_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_V, "bias", i), {1, n_embd}, 0);
  4533. layer.attn_o = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_OUT, "weight", i), {1, n_embd, n_embd}, 0);
  4534. layer.attn_o_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_OUT, "bias", i), {1, n_embd}, 0);
  4535. } break;
  4536. case 5:
  4537. {
  4538. layer.norm = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "weight", i), {1, n_embd}, 0);
  4539. layer.norm_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "bias", i), {1, n_embd}, 0);
  4540. } break;
  4541. default: GGML_ABORT("unknown posnet layer");
  4542. };
  4543. }
  4544. }
  4545. GGML_ASSERT(hparams.posnet.n_embd == hparams.convnext.n_embd);
  4546. tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {hparams.posnet.n_embd}, 0);
  4547. tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {hparams.posnet.n_embd}, 0);
  4548. // convnext
  4549. {
  4550. const int64_t n_embd = hparams.convnext.n_embd;
  4551. for (uint32_t i = 0; i < hparams.convnext.n_layer; ++i) {
  4552. auto & layer = layers[i].convnext;
  4553. layer.dw = create_tensor(tn(LLM_TENSOR_CONVNEXT_DW, "weight", i), {7, 1, n_embd}, 0);
  4554. layer.dw_b = create_tensor(tn(LLM_TENSOR_CONVNEXT_DW, "bias", i), {1, n_embd}, 0);
  4555. layer.norm = create_tensor(tn(LLM_TENSOR_CONVNEXT_NORM, "weight", i), {n_embd}, 0);
  4556. layer.norm_b = create_tensor(tn(LLM_TENSOR_CONVNEXT_NORM, "bias", i), {n_embd}, 0);
  4557. layer.pw1 = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW1, "weight", i), {n_embd, n_ff}, 0);
  4558. layer.pw1_b = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW1, "bias", i), {n_ff}, 0);
  4559. layer.pw2 = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW2, "weight", i), {n_ff, n_embd}, 0);
  4560. layer.pw2_b = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW2, "bias", i), {n_embd}, 0);
  4561. layer.gamma = create_tensor(tn(LLM_TENSOR_CONVNEXT_GAMMA, "weight", i), {n_embd}, 0);
  4562. }
  4563. // output
  4564. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  4565. output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
  4566. }
  4567. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {hparams.convnext.n_embd, n_embd}, 0);
  4568. output_b = create_tensor(tn(LLM_TENSOR_OUTPUT, "bias"), {n_embd}, 0);
  4569. } break;
  4570. case LLM_ARCH_BAILINGMOE:
  4571. {
  4572. const int64_t n_ff_exp = hparams.n_ff_exp;
  4573. const int64_t n_expert_shared = hparams.n_expert_shared;
  4574. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  4575. // output
  4576. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  4577. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  4578. for (int i = 0; i < n_layer; ++i) {
  4579. auto & layer = layers[i];
  4580. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  4581. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_head * n_rot}, 0);
  4582. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_head_kv * n_rot}, 0);
  4583. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_head_kv * n_rot}, 0);
  4584. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_head * n_rot, n_embd}, 0);
  4585. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  4586. layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
  4587. if (n_expert == 0) {
  4588. throw std::runtime_error("n_expert must be > 0");
  4589. }
  4590. if (n_expert_used == 0) {
  4591. throw std::runtime_error("n_expert_used must be > 0");
  4592. }
  4593. layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
  4594. layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
  4595. layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
  4596. layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
  4597. layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_exp * n_expert_shared, n_embd}, 0);
  4598. layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
  4599. }
  4600. } break;
  4601. case LLM_ARCH_DOTS1:
  4602. {
  4603. const int64_t n_ff_exp = hparams.n_ff_exp;
  4604. const int64_t n_expert_shared = hparams.n_expert_shared;
  4605. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  4606. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  4607. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  4608. for (int i = 0; i < n_layer; ++i) {
  4609. auto & layer = layers[i];
  4610. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  4611. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
  4612. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
  4613. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
  4614. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
  4615. layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
  4616. layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
  4617. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  4618. if (i < (int) hparams.n_layer_dense_lead) {
  4619. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  4620. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  4621. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  4622. } else {
  4623. layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
  4624. layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, TENSOR_NOT_REQUIRED);
  4625. if (n_expert == 0) {
  4626. throw std::runtime_error("n_expert must be > 0");
  4627. }
  4628. if (n_expert_used == 0) {
  4629. throw std::runtime_error("n_expert_used must be > 0");
  4630. }
  4631. // MoE branch
  4632. layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
  4633. layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
  4634. layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
  4635. // Shared expert branch
  4636. layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
  4637. layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_exp * n_expert_shared, n_embd}, 0);
  4638. layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
  4639. }
  4640. }
  4641. } break;
  4642. case LLM_ARCH_ARCEE:
  4643. {
  4644. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  4645. // output
  4646. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  4647. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  4648. // if output is NULL, init from the input tok embed
  4649. if (output == NULL) {
  4650. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  4651. }
  4652. for (int i = 0; i < n_layer; ++i) {
  4653. auto & layer = layers[i];
  4654. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  4655. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
  4656. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  4657. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
  4658. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
  4659. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  4660. layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
  4661. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  4662. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  4663. }
  4664. } break;
  4665. case LLM_ARCH_ERNIE4_5:
  4666. case LLM_ARCH_ERNIE4_5_MOE:
  4667. {
  4668. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  4669. // output
  4670. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  4671. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  4672. // if output is NULL, init from the input tok embed
  4673. if (output == NULL) {
  4674. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  4675. }
  4676. for (int i = 0; i < n_layer; ++i) {
  4677. auto & layer = layers[i];
  4678. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  4679. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
  4680. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  4681. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  4682. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
  4683. // optional bias tensors
  4684. layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  4685. layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
  4686. layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
  4687. layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  4688. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  4689. if (arch == LLM_ARCH_ERNIE4_5_MOE && static_cast<uint32_t>(i) >= hparams.n_layer_dense_lead) { // MoE layers
  4690. int n_ff_exp = hparams.n_ff_exp;
  4691. layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
  4692. layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, TENSOR_NOT_REQUIRED);
  4693. layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, TENSOR_NOT_REQUIRED);
  4694. layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff_exp, n_embd, n_expert}, 0);
  4695. layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, 0);
  4696. // Shared expert (if present)
  4697. if (hparams.n_ff_shexp > 0) {
  4698. layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), { n_embd, hparams.n_ff_shexp}, 0);
  4699. layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {hparams.n_ff_shexp, n_embd }, 0);
  4700. layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), { n_embd, hparams.n_ff_shexp}, 0);
  4701. }
  4702. } else { // Dense layers
  4703. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  4704. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  4705. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  4706. }
  4707. }
  4708. } break;
  4709. case LLM_ARCH_FALCON_H1:
  4710. {
  4711. // Common
  4712. const int64_t hidden_size = hparams.n_embd; // hidden_size
  4713. // mamba2 Mixer SSM params
  4714. const int64_t ssm_conv_kernel_size = hparams.ssm_d_conv; // ssm_conv_kernel_size
  4715. const int64_t ssm_n_groups = hparams.ssm_n_group; // ssm_n_groups
  4716. const int64_t ssm_state_size = hparams.ssm_d_state; // ssm_state_size
  4717. const int64_t ssm_intermediate_size = hparams.ssm_d_inner; // TODO expand
  4718. const int64_t ssm_num_heads = hparams.ssm_dt_rank; // ssm_num_heads
  4719. const int64_t ssm_conv_dim = ssm_intermediate_size + 2 * ssm_n_groups * ssm_state_size;
  4720. const int64_t ssm_projection_size = ssm_intermediate_size + ssm_conv_dim + ssm_num_heads;
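// the usual mamba2 in_proj packing: z (d_inner), the conv input x/B/C (= ssm_conv_dim) and one dt per head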
  4721. // attn params
const int64_t attn_num_attention_head = hparams.n_head(0);
  4723. const int64_t attn_num_key_value_head = hparams.n_head_kv(0);
  4724. // ffn params
  4725. const int64_t ffn_intermediate_size = hparams.n_ff(0);
  4726. // embeddings
  4727. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {hidden_size, n_vocab}, 0);
  4728. // output
  4729. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {hidden_size, n_vocab}, TENSOR_NOT_REQUIRED);
  4730. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {hidden_size}, 0);
  4731. // if output is NULL, init from the input tok embed
  4732. if (output == NULL) {
  4733. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {hidden_size, n_vocab}, TENSOR_DUPLICATED);
  4734. }
  4735. for (int i = 0; i < n_layer; ++i) {
  4736. auto & layer = layers[i];
  4737. /*SSM LAYERS*/
  4738. // ssm in
  4739. layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), {hidden_size, ssm_projection_size}, 0);
  4740. // ssm 1d conv
  4741. layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {ssm_conv_kernel_size, ssm_conv_dim}, 0);
  4742. layer.ssm_conv1d_b = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "bias", i), {ssm_conv_dim}, TENSOR_NOT_REQUIRED);
  4743. // ssm_dt
  4744. layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {ssm_num_heads}, 0);
  4745. // no "weight" suffix for these
  4746. layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {1, ssm_num_heads}, 0);
  4747. layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {1, ssm_num_heads}, 0);
  4748. // ssm_norm
  4749. layer.ssm_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", i), {ssm_intermediate_size / ssm_n_groups, ssm_n_groups}, TENSOR_NOT_REQUIRED);
  4750. // out_proj
  4751. layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {ssm_intermediate_size, hidden_size}, 0);
  4752. /*ATTENTION LAYERS*/
  4753. // attention layers (with optional bias)
  4754. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {hidden_size, n_embd_head_k * attn_num_attention_head}, 0);
  4755. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {hidden_size, attn_num_key_value_head * n_embd_head_k}, 0);
  4756. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {hidden_size, attn_num_key_value_head * n_embd_head_v}, 0);
  4757. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * attn_num_attention_head, hidden_size}, 0);
  4758. layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {hidden_size}, TENSOR_NOT_REQUIRED);
  4759. layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {attn_num_key_value_head * n_embd_head_k}, TENSOR_NOT_REQUIRED);
  4760. layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {attn_num_key_value_head * n_embd_head_v}, TENSOR_NOT_REQUIRED);
  4761. layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {hidden_size}, TENSOR_NOT_REQUIRED);
  4762. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {hidden_size}, 0);
  4763. // feed forward (w/ optional biases)
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {hidden_size}, 0);
  4765. layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
  4766. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {hidden_size, ffn_intermediate_size}, 0);
  4767. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { ffn_intermediate_size, hidden_size}, 0);
  4768. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {hidden_size, ffn_intermediate_size}, 0);
  4769. layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {ffn_intermediate_size}, TENSOR_NOT_REQUIRED);
  4770. layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {hidden_size}, TENSOR_NOT_REQUIRED);
  4771. layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {ffn_intermediate_size}, TENSOR_NOT_REQUIRED);
  4772. }
  4773. } break;
  4774. case LLM_ARCH_HUNYUAN_MOE:
  4775. {
  4776. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  4777. // output
  4778. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  4779. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  4780. // if output is NULL, init from the input tok embed
  4781. if (output == NULL) {
  4782. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  4783. }
  4784. for (int i = 0; i < n_layer; ++i) {
  4785. auto & layer = layers[i];
  4786. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  4787. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
  4788. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  4789. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
  4790. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
  4791. layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
  4792. layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
  4793. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  4794. layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
  4795. layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
  4796. layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert}, 0);
  4797. layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
  4798. layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, 0);
  4799. layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, 0);
  4800. layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {hparams.n_ff_shexp, n_embd}, 0);
  4801. }
  4802. } break;
  4803. case LLM_ARCH_HUNYUAN_DENSE:
  4804. {
  4805. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  4806. // output
  4807. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  4808. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  4809. // if output is NULL, init from the input tok embed
  4810. if (output == NULL) {
  4811. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  4812. }
  4813. for (int i = 0; i < n_layer; ++i) {
  4814. auto & layer = layers[i];
  4815. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  4816. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
  4817. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  4818. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
  4819. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
  4820. layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
  4821. layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
  4822. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  4823. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  4824. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  4825. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  4826. }
  4827. } break;
  4828. case LLM_ARCH_SMOLLM3:
  4829. {
  4830. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  4831. // output
  4832. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  4833. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  4834. // if output is NULL, init from the input tok embed
  4835. if (output == NULL) {
  4836. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  4837. }
  4838. for (int i = 0; i < n_layer; ++i) {
  4839. auto & layer = layers[i];
  4840. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  4841. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
  4842. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  4843. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
  4844. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
  4845. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  4846. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  4847. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  4848. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  4849. }
  4850. } break;
  4851. case LLM_ARCH_OPENAI_MOE:
  4852. {
  4853. const int64_t n_ff_exp = hparams.n_ff_exp;
  4854. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  4855. // output
  4856. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  4857. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  4858. for (int i = 0; i < n_layer; ++i) {
  4859. auto & layer = layers[i];
  4860. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  4861. layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
  4862. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_head * n_rot}, 0);
  4863. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_head_kv * n_rot}, 0);
  4864. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_head_kv * n_rot}, 0);
  4865. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_head * n_rot, n_embd}, 0);
  4866. layer.attn_sinks = create_tensor(tn(LLM_TENSOR_ATTN_SINKS, "weight", i), {n_head}, 0);
  4867. layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), { n_embd, n_expert}, 0);
  4868. layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
  4869. layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
  4870. layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
  4871. // bias
  4872. layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_head * n_rot}, 0);
  4873. layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_head_kv * n_rot}, 0);
  4874. layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_head_kv * n_rot}, 0);
  4875. layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
  4876. layer.ffn_gate_inp_b = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "bias", i), {n_expert}, 0);
  4877. layer.ffn_gate_exps_b = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "bias", i), {n_ff_exp, n_expert}, 0);
  4878. layer.ffn_down_exps_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "bias", i), { n_embd, n_expert}, 0);
  4879. layer.ffn_up_exps_b = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "bias", i), {n_ff_exp, n_expert}, 0);
  4880. }
  4881. } break;
  4882. case LLM_ARCH_LFM2:
  4883. {
  4884. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  4885. tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0);
  4886. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  4887. if (output == NULL) {
  4888. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  4889. }
  4890. for (int i = 0; i < n_layer; ++i) {
  4891. auto & layer = layers[i];
  4892. // ffn is same for transformer and conv layers
  4893. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  4894. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  4895. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  4896. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  4897. // for operator_norm
  4898. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
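// attention layers load Q/K/V/O plus per-head q/k norms; recurrent layers load the short-conv triplet instead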
  4899. if (!hparams.is_recurrent(i)) {
  4900. layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
  4901. layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
  4902. GGML_ASSERT(n_embd_v_gqa == n_embd_k_gqa);
  4903. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  4904. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, hparams.n_embd_k_gqa(i)}, 0);
  4905. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, hparams.n_embd_v_gqa(i)}, 0);
  4906. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  4907. } else {
  4908. layer.shortconv.conv = create_tensor(tn(LLM_TENSOR_SHORTCONV_CONV, "weight", i), {hparams.n_shortconv_l_cache, n_embd}, 0);
  4909. layer.shortconv.in_proj = create_tensor(tn(LLM_TENSOR_SHORTCONV_INPROJ, "weight", i), {n_embd, 3 * n_embd}, 0);
  4910. layer.shortconv.out_proj = create_tensor(tn(LLM_TENSOR_SHORTCONV_OUTPROJ, "weight", i), {n_embd, n_embd}, 0);
  4911. }
  4912. }
  4913. } break;
  4914. case LLM_ARCH_SMALLTHINKER:
  4915. {
  4916. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
  4917. // output
  4918. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
  4919. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  4920. // if output is NULL, init from the input tok embed
  4921. if (output == NULL) {
  4922. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  4923. }
  4924. for (int i = 0; i < n_layer; ++i) {
  4925. auto & layer = layers[i];
  4926. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);
  4927. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd_head_k * n_head }, 0);
  4928. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_gqa }, 0);
  4929. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_gqa }, 0);
  4930. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, 0);
  4931. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd }, 0);
  4932. GGML_ASSERT(n_expert > 0 && "n_expert must be > 0 for SMALLTHINKER");
  4933. GGML_ASSERT(n_expert_used > 0 && "n_expert_used must be > 0 for SMALLTHINKER");
  4934. // MoE branch
  4935. const int64_t n_ff_exp = hparams.n_ff_exp;
  4936. layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), { n_embd, n_expert }, 0);
  4937. layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, 0);
  4938. layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff_exp, n_embd, n_expert }, 0);
  4939. layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, 0);
  4940. }
  4941. } break;
  4942. default:
  4943. throw std::runtime_error("unknown architecture");
  4944. }
  4945. if (n_moved_tensors > 0) {
  4946. LLAMA_LOG_DEBUG("%s: tensor '%s' (%s) (and %d others) cannot be used with preferred buffer type %s, using %s instead\n",
  4947. __func__, first_moved_tensor->name, ggml_type_name(first_moved_tensor->type), n_moved_tensors - 1,
  4948. ggml_backend_buft_name(first_moved_from_buft), ggml_backend_buft_name(first_moved_to_buft));
  4949. }
  4950. }
  4951. ml.done_getting_tensors();
  4952. ml.init_mappings(true, use_mlock ? &pimpl->mlock_mmaps : nullptr);
  4953. pimpl->mappings.reserve(ml.mappings.size());
  4954. // create the backend buffers
  4955. std::vector<std::pair<ggml_context *, llama_buf_map>> ctx_bufs;
  4956. ctx_bufs.reserve(ctx_map.size());
  4957. // Ensure we have enough capacity for the maximum backend buffer we will potentially create
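// (worst case: the mmap path below creates one buffer per (context, model file) pair)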
  4958. const size_t n_max_backend_buffer = ctx_map.size() * ml.files.size();
  4959. pimpl->bufs.reserve(n_max_backend_buffer);
  4960. for (auto & it : ctx_map) {
  4961. ggml_backend_buffer_type_t buft = it.first;
  4962. ggml_context * ctx = it.second;
  4963. // skip contexts without tensors
  4964. if (ggml_get_first_tensor(ctx) == nullptr) {
  4965. continue;
  4966. }
  4967. llama_buf_map buf_map;
  4968. buf_map.reserve(n_max_backend_buffer);
  4969. // check if it is possible to use buffer_from_host_ptr with this buffer type
  4970. ggml_backend_dev_t dev = ggml_backend_buft_get_device(buft);
  4971. if (!dev) {
  4972. // FIXME: workaround for CPU backend buft having a NULL device
  4973. dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
  4974. if (!dev) {
  4975. throw std::runtime_error(format("%s: no CPU backend found", __func__));
  4976. }
  4977. }
  4978. ggml_backend_dev_props props;
  4979. ggml_backend_dev_get_props(dev, &props);
  4980. bool buffer_from_host_ptr_supported = props.caps.buffer_from_host_ptr;
  4981. bool is_default_buft = buft == ggml_backend_dev_buffer_type(dev);
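// prefer wrapping the existing mmap region as a host-pointer buffer; otherwise allocate a regular backend buffer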
  4982. if (ml.use_mmap && use_mmap_buffer && buffer_from_host_ptr_supported && is_default_buft) {
  4983. for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
  4984. // only the mmap region containing the tensors in the model is mapped to the backend buffer
  4985. // this is important for metal with apple silicon: if the entire model could be mapped to a metal buffer, then we could just use metal for all layers
  4986. // this allows using partial offloading when the model size exceeds the metal buffer size, but not the RAM size
  4987. void * addr = nullptr;
  4988. size_t first, last; // NOLINT
  4989. ml.get_mapping_range(&first, &last, &addr, idx, ctx);
  4990. if (first >= last) {
  4991. continue;
  4992. }
  4993. const size_t max_size = ggml_get_max_tensor_size(ctx);
  4994. ggml_backend_buffer_t buf = ggml_backend_dev_buffer_from_host_ptr(dev, (char *) addr + first, last - first, max_size);
  4995. if (buf == nullptr) {
  4996. throw std::runtime_error(format("unable to allocate %s buffer", ggml_backend_buft_name(buft)));
  4997. }
  4998. pimpl->bufs.emplace_back(buf);
  4999. buf_map.emplace(idx, buf);
  5000. }
  5001. }
  5002. else {
  5003. ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
  5004. if (buf == nullptr) {
  5005. throw std::runtime_error(format("unable to allocate %s buffer", ggml_backend_buft_name(buft)));
  5006. }
  5007. pimpl->bufs.emplace_back(buf);
  5008. if (use_mlock && ggml_backend_buffer_is_host(buf)) {
  5009. pimpl->mlock_bufs.emplace_back(new llama_mlock);
  5010. auto & mlock_buf = pimpl->mlock_bufs.back();
  5011. mlock_buf->init (ggml_backend_buffer_get_base(buf));
  5012. mlock_buf->grow_to(ggml_backend_buffer_get_size(buf));
  5013. }
  5014. for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
  5015. buf_map.emplace(idx, buf);
  5016. }
  5017. }
  5018. if (pimpl->bufs.empty()) {
  5019. throw std::runtime_error("failed to allocate buffer");
  5020. }
  5021. for (auto & buf : buf_map) {
  5022. // indicate that this buffer contains weights
  5023. // this is used by ggml_backend_sched to improve op scheduling: ops that use a weight are preferably scheduled to the backend that contains the weight
  5024. ggml_backend_buffer_set_usage(buf.second, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
  5025. }
  5026. ctx_bufs.emplace_back(ctx, buf_map);
  5027. }
  5028. if (llama_supports_gpu_offload()) {
  5029. const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
  5030. LLAMA_LOG_INFO("%s: offloading %d repeating layers to GPU\n", __func__, n_gpu);
  5031. if (n_gpu_layers > (int) hparams.n_layer) {
  5032. LLAMA_LOG_INFO("%s: offloading output layer to GPU\n", __func__);
  5033. }
  5034. const int max_backend_supported_layers = hparams.n_layer + 1;
  5035. const int max_offloadable_layers = hparams.n_layer + 1;
  5036. LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n", __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers);
  5037. }
  5038. // print memory requirements per buffer type
  5039. for (auto & buf : pimpl->bufs) {
  5040. LLAMA_LOG_INFO("%s: %12s model buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf.get()), ggml_backend_buffer_get_size(buf.get()) / 1024.0 / 1024.0);
  5041. }
  5042. // populate tensors_by_name
  5043. for (auto & ctx : pimpl->ctxs) {
  5044. for (auto * cur = ggml_get_first_tensor(ctx.get()); cur != NULL; cur = ggml_get_next_tensor(ctx.get(), cur)) {
  5045. tensors_by_name.emplace_back(ggml_get_name(cur), cur);
  5046. }
  5047. }
  5048. // load tensor data
  5049. for (auto & it : ctx_bufs) {
  5050. ggml_context * ctx = it.first;
  5051. auto & bufs = it.second;
  5052. if (!ml.load_all_data(ctx, bufs, use_mlock ? &pimpl->mlock_mmaps : NULL, params.progress_callback, params.progress_callback_user_data)) {
  5053. return false;
  5054. }
  5055. }
  5056. if (use_mmap_buffer) {
  5057. for (auto & mapping : ml.mappings) {
  5058. pimpl->mappings.emplace_back(std::move(mapping));
  5059. }
  5060. }
  5061. return true;
  5062. }
  5063. std::string llama_model::arch_name() const {
  5064. return llm_arch_name(arch);
  5065. }
  5066. std::string llama_model::type_name() const {
  5067. return llm_type_name(type);
  5068. }
  5069. std::string llama_model::desc() const {
  5070. return pimpl->desc_str;
  5071. }
  5072. size_t llama_model::size() const {
  5073. return pimpl->n_bytes;
  5074. }
  5075. size_t llama_model::n_tensors() const {
  5076. return tensors_by_name.size();
  5077. }
  5078. size_t llama_model::n_devices() const {
  5079. return devices.size();
  5080. }
  5081. uint64_t llama_model::n_elements() const {
  5082. return pimpl->n_elements;
  5083. }
  5084. void llama_model::print_info() const {
  5085. const std::string rope_scaling_type = llama_rope_scaling_type_name(hparams.rope_scaling_type_train);
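// helper: print a per-layer hyperparameter as a single value if it is constant across layers,
// otherwise as a bracketed list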
  5086. auto print_f = [](const std::function<uint32_t(uint32_t)> & f, uint32_t n) {
  5087. bool is_var = false;
  5088. std::vector<uint32_t> v;
  5089. for (uint32_t i = 0; i < n; ++i) {
  5090. v.push_back(f(i));
  5091. if (v[i] != v[0]) {
  5092. is_var = true;
  5093. }
  5094. }
  5095. std::stringstream ss;
  5096. if (is_var) {
  5097. ss << "[";
  5098. for (uint32_t i = 0; i < n; ++i) {
  5099. ss << v[i];
  5100. if (i < n - 1) {
  5101. ss << ", ";
  5102. }
  5103. }
  5104. ss << "]";
  5105. } else {
  5106. ss << v[0];
  5107. }
  5108. return ss.str();
  5109. };
  5110. // hparams
  5111. LLAMA_LOG_INFO("%s: arch = %s\n", __func__, arch_name().c_str());
  5112. LLAMA_LOG_INFO("%s: vocab_only = %d\n", __func__, hparams.vocab_only);
  5113. if (!hparams.vocab_only) {
  5114. LLAMA_LOG_INFO("%s: n_ctx_train = %u\n", __func__, hparams.n_ctx_train);
  5115. LLAMA_LOG_INFO("%s: n_embd = %u\n", __func__, hparams.n_embd);
  5116. LLAMA_LOG_INFO("%s: n_layer = %u\n", __func__, hparams.n_layer);
  5117. LLAMA_LOG_INFO("%s: n_head = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_head(il); }, hparams.n_layer).c_str());
  5118. LLAMA_LOG_INFO("%s: n_head_kv = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_head_kv(il); }, hparams.n_layer).c_str());
  5119. LLAMA_LOG_INFO("%s: n_rot = %u\n", __func__, hparams.n_rot);
  5120. LLAMA_LOG_INFO("%s: n_swa = %u\n", __func__, hparams.n_swa);
  5121. LLAMA_LOG_INFO("%s: is_swa_any = %u\n", __func__, hparams.is_swa_any());
  5122. LLAMA_LOG_INFO("%s: n_embd_head_k = %u\n", __func__, hparams.n_embd_head_k);
  5123. LLAMA_LOG_INFO("%s: n_embd_head_v = %u\n", __func__, hparams.n_embd_head_v);
  5124. LLAMA_LOG_INFO("%s: n_gqa = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_gqa(il); }, hparams.n_layer).c_str());
  5125. LLAMA_LOG_INFO("%s: n_embd_k_gqa = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_embd_k_gqa(il); }, hparams.n_layer).c_str());
  5126. LLAMA_LOG_INFO("%s: n_embd_v_gqa = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_embd_v_gqa(il); }, hparams.n_layer).c_str());
  5127. LLAMA_LOG_INFO("%s: f_norm_eps = %.1e\n", __func__, hparams.f_norm_eps);
  5128. LLAMA_LOG_INFO("%s: f_norm_rms_eps = %.1e\n", __func__, hparams.f_norm_rms_eps);
  5129. LLAMA_LOG_INFO("%s: f_clamp_kqv = %.1e\n", __func__, hparams.f_clamp_kqv);
  5130. LLAMA_LOG_INFO("%s: f_max_alibi_bias = %.1e\n", __func__, hparams.f_max_alibi_bias);
  5131. LLAMA_LOG_INFO("%s: f_logit_scale = %.1e\n", __func__, hparams.f_logit_scale);
  5132. LLAMA_LOG_INFO("%s: f_attn_scale = %.1e\n", __func__, hparams.f_attention_scale);
  5133. LLAMA_LOG_INFO("%s: n_ff = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_ff(il); }, hparams.n_layer).c_str());
  5134. LLAMA_LOG_INFO("%s: n_expert = %u\n", __func__, hparams.n_expert);
  5135. LLAMA_LOG_INFO("%s: n_expert_used = %u\n", __func__, hparams.n_expert_used);
  5136. LLAMA_LOG_INFO("%s: causal attn = %d\n", __func__, hparams.causal_attn);
  5137. LLAMA_LOG_INFO("%s: pooling type = %d\n", __func__, hparams.pooling_type);
  5138. LLAMA_LOG_INFO("%s: rope type = %d\n", __func__, hparams.rope_type);
  5139. LLAMA_LOG_INFO("%s: rope scaling = %s\n", __func__, rope_scaling_type.c_str());
  5140. LLAMA_LOG_INFO("%s: freq_base_train = %.1f\n", __func__, hparams.rope_freq_base_train);
  5141. LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train);
  5142. LLAMA_LOG_INFO("%s: n_ctx_orig_yarn = %u\n", __func__, hparams.n_ctx_orig_yarn);
  5143. LLAMA_LOG_INFO("%s: rope_finetuned = %s\n", __func__, hparams.rope_finetuned ? "yes" : "unknown");
  5144. if (!classifier_labels.empty()) {
  5145. LLAMA_LOG_INFO("%s: n_cls_out = %u\n", __func__, hparams.n_cls_out);
  5146. size_t i = 0;
  5147. for (auto label : classifier_labels) {
  5148. LLAMA_LOG_INFO("%s: cls_label[%2zu] = %s\n", __func__, i++, label.c_str());
  5149. }
  5150. }
  5151. }
  5152. if (arch == LLM_ARCH_MAMBA ||
  5153. arch == LLM_ARCH_MAMBA2 ||
  5154. arch == LLM_ARCH_JAMBA ||
  5155. arch == LLM_ARCH_FALCON_H1 ||
  5156. arch == LLM_ARCH_PLAMO2 ||
  5157. arch == LLM_ARCH_GRANITE_HYBRID ||
  5158. arch == LLM_ARCH_NEMOTRON_H ||
  5159. arch == LLM_ARCH_QWEN3NEXT) {
        LLAMA_LOG_INFO("%s: ssm_d_conv = %u\n", __func__, hparams.ssm_d_conv);
        LLAMA_LOG_INFO("%s: ssm_d_inner = %u\n", __func__, hparams.ssm_d_inner);
        LLAMA_LOG_INFO("%s: ssm_d_state = %u\n", __func__, hparams.ssm_d_state);
        LLAMA_LOG_INFO("%s: ssm_dt_rank = %u\n", __func__, hparams.ssm_dt_rank);
        LLAMA_LOG_INFO("%s: ssm_n_group = %u\n", __func__, hparams.ssm_n_group);
        LLAMA_LOG_INFO("%s: ssm_dt_b_c_rms = %d\n", __func__, hparams.ssm_dt_b_c_rms);
    }

    LLAMA_LOG_INFO("%s: model type = %s\n", __func__, type_name().c_str());
    if (pimpl->n_elements >= 1e12) {
        LLAMA_LOG_INFO("%s: model params = %.2f T\n", __func__, pimpl->n_elements*1e-12);
    } else if (pimpl->n_elements >= 1e9) {
        LLAMA_LOG_INFO("%s: model params = %.2f B\n", __func__, pimpl->n_elements*1e-9);
    } else if (pimpl->n_elements >= 1e6) {
        LLAMA_LOG_INFO("%s: model params = %.2f M\n", __func__, pimpl->n_elements*1e-6);
    } else {
        LLAMA_LOG_INFO("%s: model params = %.2f K\n", __func__, pimpl->n_elements*1e-3);
    }

    // general kv
    LLAMA_LOG_INFO("%s: general.name = %s\n", __func__, name.c_str());

    if (arch == LLM_ARCH_DEEPSEEK) {
        LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
        LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
        LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
        LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
    }

    if (arch == LLM_ARCH_DEEPSEEK2) {
        LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
        LLAMA_LOG_INFO("%s: n_lora_q = %d\n", __func__, hparams.n_lora_q);
        LLAMA_LOG_INFO("%s: n_lora_kv = %d\n", __func__, hparams.n_lora_kv);
        LLAMA_LOG_INFO("%s: n_embd_head_k_mla = %d\n", __func__, hparams.n_embd_head_k_mla);
        LLAMA_LOG_INFO("%s: n_embd_head_v_mla = %d\n", __func__, hparams.n_embd_head_v_mla);
        LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
        LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
        LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
        LLAMA_LOG_INFO("%s: expert_weights_norm = %d\n", __func__, hparams.expert_weights_norm);
        LLAMA_LOG_INFO("%s: expert_gating_func = %s\n", __func__, llama_expert_gating_func_name((llama_expert_gating_func_type) hparams.expert_gating_func));
        LLAMA_LOG_INFO("%s: rope_yarn_log_mul = %.4f\n", __func__, hparams.rope_yarn_log_mul);
    }

    if (arch == LLM_ARCH_QWEN2MOE) {
        LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
        LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
    }

    if (arch == LLM_ARCH_QWEN3MOE || arch == LLM_ARCH_OPENAI_MOE) {
        LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
    }

    if (arch == LLM_ARCH_MINICPM ||
        arch == LLM_ARCH_GRANITE ||
        arch == LLM_ARCH_GRANITE_MOE ||
        arch == LLM_ARCH_GRANITE_HYBRID) {
        LLAMA_LOG_INFO("%s: f_embedding_scale = %f\n", __func__, hparams.f_embedding_scale);
        LLAMA_LOG_INFO("%s: f_residual_scale = %f\n", __func__, hparams.f_residual_scale);
        LLAMA_LOG_INFO("%s: f_attention_scale = %f\n", __func__, hparams.f_attention_scale);
        LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
    }

    if (arch == LLM_ARCH_BAILINGMOE) {
        LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
        LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
        LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
        LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
        LLAMA_LOG_INFO("%s: expert_weights_norm = %d\n", __func__, hparams.expert_weights_norm);
    }

    if (arch == LLM_ARCH_SMALLTHINKER) {
        LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
        LLAMA_LOG_INFO("%s: expert_gating_func = %s\n", __func__, llama_expert_gating_func_name((llama_expert_gating_func_type) hparams.expert_gating_func));
    }

    vocab.print_info();
}
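// accessors for the backend device assigned to a given layer and to the output tensors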
ggml_backend_dev_t llama_model::dev_layer(int il) const {
    return pimpl->dev_layer.at(il).dev;
}

ggml_backend_dev_t llama_model::dev_output() const {
    return pimpl->dev_output.dev;
}
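// check whether the op produced by `fn` is supported by device `dev` when its inputs
// live in buffer type `buft` (built in a dummy no-alloc context, nothing is computed)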
template<typename F>
static bool buft_supported(ggml_backend_buffer_type_t buft, ggml_backend_dev_t dev, F & fn) {
    ggml_init_params params = {
        /*.mem_size   =*/ ggml_tensor_overhead()*8,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ true,
    };

    ggml_context_ptr ctx { ggml_init(params) };
    if (!ctx) {
        throw std::runtime_error(format("failed to create ggml context"));
    }

    ggml_backend_buffer_ptr buf { ggml_backend_buft_alloc_buffer(buft, 0) };
    ggml_tensor * op_tensor = fn(ctx.get());
    for (int i = 0; i < GGML_MAX_SRC; i++) {
        if (op_tensor->src[i] != nullptr) {
            assert(op_tensor->src[i]->buffer == nullptr);
            op_tensor->src[i]->buffer = buf.get();
        }
    }

    bool op_supported = ggml_backend_dev_supports_op(dev, op_tensor);

    return op_supported;
}
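// return the first buffer type in the list whose device supports the op built by `fn`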
template<typename F>
static ggml_backend_buffer_type_t select_buft(const buft_list_t & buft_list, const F & fn) {
    for (const auto & cur : buft_list) {
        ggml_backend_dev_t cur_dev = cur.first;
        ggml_backend_buffer_type_t cur_buft = cur.second;
        if (buft_supported(cur_buft, cur_dev, fn)) {
            return cur_buft;
        }
    }

    throw std::runtime_error(format("no suitable buffer type found"));
}
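// select a buffer type for layer `il` by probing a representative op (an F32 add of two n_embd vectors)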
ggml_backend_buffer_type_t llama_model::select_buft(int il) const {
    return ::select_buft(
            *pimpl->dev_layer.at(il).buft_list,
            [&](ggml_context * ctx) {
                ggml_tensor * cur = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hparams.n_embd);
                ggml_tensor * layer_dir = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hparams.n_embd);
                return ggml_add(ctx, cur, layer_dir);
            });
}

bool llama_model::has_tensor_overrides() const {
    return pimpl->has_tensor_overrides;
}
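// linear search of the loaded tensors by name; returns nullptr if the tensor is not present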
const ggml_tensor * llama_model::get_tensor(const char * name) const {
    auto it = std::find_if(tensors_by_name.begin(), tensors_by_name.end(),
            [name](const std::pair<std::string, ggml_tensor *> & it) {
                return it.first == name;
            });
    if (it == tensors_by_name.end()) {
        return nullptr;
    }

    return it->second;
}
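// RoPE frequency base/scale: sliding-window-attention layers use the training-time SWA values,
// full-attention layers use the values from the context params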
float llama_model::get_rope_freq_base (const llama_cparams & cparams, int il) const {
    return hparams.is_swa(il) ? hparams.rope_freq_base_train_swa : cparams.rope_freq_base;
}

float llama_model::get_rope_freq_scale(const llama_cparams & cparams, int il) const {
    return hparams.is_swa(il) ? hparams.rope_freq_scale_train_swa : cparams.rope_freq_scale;
}

ggml_tensor * llama_model::get_rope_factors(const llama_cparams & cparams, int il) const {
    const uint32_t n_ctx_per_seq = cparams.n_ctx / cparams.n_seq_max;

    // choose long/short freq factors based on the context size
    if (layers[il].rope_freqs != nullptr) {
        return layers[il].rope_freqs;
    }

    if (n_ctx_per_seq > hparams.n_ctx_orig_yarn) {
        return layers[il].rope_long;
    }

    return layers[il].rope_short;
}
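// graph builders: each llm_build_* struct below constructs the ggml compute graph for one architecture family.
// llm_build_llama covers the standard LLaMA-style decoder: RMSNorm, RoPE, grouped-query attention,
// SwiGLU feed-forward, with an optional MoE branch per layer.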
struct llm_build_llama : public llm_graph_context {
    llm_build_llama(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
        const int64_t n_embd_head = hparams.n_embd_head_v;

        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
        GGML_ASSERT(n_embd_head == hparams.n_rot);

        ggml_tensor * cur;
        ggml_tensor * inpL;

        inpL = build_inp_embd(model.tok_embd);

        // inp_pos - contains the positions
        ggml_tensor * inp_pos = build_inp_pos();

        auto * inp_attn = build_attn_inp_kv();

        const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;

        ggml_tensor * inp_out_ids = build_inp_out_ids();
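
        // per layer: RMSNorm -> self-attention (optional RoPE frequency factors and QK norm) -> residual
        //            -> RMSNorm -> dense FFN or MoE -> residual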
        for (int il = 0; il < n_layer; ++il) {
            ggml_tensor * inpSA = inpL;

            // norm
            cur = build_norm(inpL,
                    model.layers[il].attn_norm, NULL,
                    LLM_NORM_RMS, il);
            cb(cur, "attn_norm", il);

            // self-attention
            {
                // rope freq factors for llama3; may return nullptr for llama2 and other models
                ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);

                // compute Q and K and RoPE them
                ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
                cb(Qcur, "Qcur", il);
                if (model.layers[il].bq) {
                    Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
                    cb(Qcur, "Qcur", il);
                }

                ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
                cb(Kcur, "Kcur", il);
                if (model.layers[il].bk) {
                    Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
                    cb(Kcur, "Kcur", il);
                }

                ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
                cb(Vcur, "Vcur", il);
                if (model.layers[il].bv) {
                    Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
                    cb(Vcur, "Vcur", il);
                }

                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,    n_tokens);
                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
                Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);

                Qcur = ggml_rope_ext(
                        ctx0, Qcur, inp_pos, rope_factors,
                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                        ext_factor, attn_factor, beta_fast, beta_slow
                        );

                Kcur = ggml_rope_ext(
                        ctx0, Kcur, inp_pos, rope_factors,
                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                        ext_factor, attn_factor, beta_fast, beta_slow
                        );

                cb(Qcur, "Qcur", il);
                cb(Kcur, "Kcur", il);
                cb(Vcur, "Vcur", il);

                if (hparams.use_kq_norm) {
                    // Llama4TextL2Norm
                    Qcur = ggml_rms_norm(ctx0, Qcur, hparams.f_norm_rms_eps);
                    Kcur = ggml_rms_norm(ctx0, Kcur, hparams.f_norm_rms_eps);
                    cb(Qcur, "Qcur_normed", il);
                    cb(Kcur, "Kcur_normed", il);
                }

                cur = build_attn(inp_attn,
                        model.layers[il].wo, model.layers[il].bo,
                        Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
                cb(cur, "attn_out", il);
            }
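
            // on the last layer, keep only the rows for which outputs were requested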
            if (il == n_layer - 1 && inp_out_ids) {
                cur   = ggml_get_rows(ctx0,   cur, inp_out_ids);
                inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
            }

            ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
            cb(ffn_inp, "ffn_inp", il);

            // feed-forward network (non-MoE)
            if (model.layers[il].ffn_gate_inp == nullptr) {
                cur = build_norm(ffn_inp,
                        model.layers[il].ffn_norm, NULL,
                        LLM_NORM_RMS, il);
                cb(cur, "ffn_norm", il);

                cur = build_ffn(cur,
                        model.layers[il].ffn_up,   model.layers[il].ffn_up_b,   NULL,
                        model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
                        model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
                        NULL,
                        LLM_FFN_SILU, LLM_FFN_PAR, il);
                cb(cur, "ffn_out", il);
            } else {
                // MoE branch
                cur = build_norm(ffn_inp,
                        model.layers[il].ffn_norm, NULL,
                        LLM_NORM_RMS, il);
                cb(cur, "ffn_norm", il);

                cur = build_moe_ffn(cur,
                        model.layers[il].ffn_gate_inp,
                        model.layers[il].ffn_up_exps,
                        model.layers[il].ffn_gate_exps,
                        model.layers[il].ffn_down_exps,
                        nullptr,
                        n_expert, n_expert_used,
                        LLM_FFN_SILU, true,
                        false, 0.0,
                        LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
                        il);
                cb(cur, "ffn_moe_out", il);
            }
            cur = ggml_add(ctx0, cur, ffn_inp);
            cb(cur, "ffn_out", il);

            cur = build_cvec(cur, il);
            cb(cur, "l_out", il);

            // input for next layer
            inpL = cur;
        }
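
        // final RMSNorm and LM head projection over the selected output rows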
        cur = inpL;

        cur = build_norm(cur,
                model.output_norm, NULL,
                LLM_NORM_RMS, -1);

        cb(cur, "result_norm", -1);
        res->t_embd = cur;

        // lm_head
        cur = build_lora_mm(model.output, cur);

        cb(cur, "result_output", -1);
        res->t_logits = cur;

        ggml_build_forward_expand(gf, cur);
    }
};
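// llm_build_llama_iswa: Llama 4 style variant with interleaved sliding-window attention; RoPE is skipped
// on every n_no_rope_layer_step-th layer (where the temperature-tuned attention scale is applied instead),
// and the MoE branch adds a shared-expert FFN on top of the routed experts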
  5434. struct llm_build_llama_iswa : public llm_graph_context {
  5435. llm_build_llama_iswa(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  5436. const int64_t n_embd_head = hparams.n_embd_head_v;
  5437. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  5438. GGML_ASSERT(n_embd_head == hparams.n_rot);
  5439. ggml_tensor * cur;
  5440. ggml_tensor * inpL;
  5441. inpL = build_inp_embd(model.tok_embd);
  5442. // inp_pos - contains the positions
  5443. ggml_tensor * inp_pos = build_inp_pos();
  5444. // temperature tuning
  5445. ggml_tensor * inp_attn_scale = nullptr;
  5446. inp_attn_scale = build_inp_attn_scale();
  5447. auto * inp_attn = build_attn_inp_kv_iswa();
  5448. const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
  5449. ggml_tensor * inp_out_ids = build_inp_out_ids();
  5450. for (int il = 0; il < n_layer; ++il) {
  5451. ggml_tensor * inpSA = inpL;
  5452. const bool use_rope = hparams.n_no_rope_layer_step > 0 &&
  5453. (il + 1) % hparams.n_no_rope_layer_step != 0;
  5454. // norm
  5455. cur = build_norm(inpL,
  5456. model.layers[il].attn_norm, NULL,
  5457. LLM_NORM_RMS, il);
  5458. cb(cur, "attn_norm", il);
  5459. // self-attention
  5460. {
  5461. // rope freq factors for llama3; may return nullptr for llama2 and other models
  5462. ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
  5463. // compute Q and K and RoPE them
  5464. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  5465. cb(Qcur, "Qcur", il);
  5466. if (model.layers[il].bq) {
  5467. Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
  5468. cb(Qcur, "Qcur", il);
  5469. }
  5470. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  5471. cb(Kcur, "Kcur", il);
  5472. if (model.layers[il].bk) {
  5473. Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
  5474. cb(Kcur, "Kcur", il);
  5475. }
  5476. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  5477. cb(Vcur, "Vcur", il);
  5478. if (model.layers[il].bv) {
  5479. Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
  5480. cb(Vcur, "Vcur", il);
  5481. }
  5482. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  5483. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  5484. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  5485. if (use_rope) {
  5486. Qcur = ggml_rope_ext(
  5487. ctx0, Qcur, inp_pos, rope_factors,
  5488. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  5489. ext_factor, attn_factor, beta_fast, beta_slow
  5490. );
  5491. Kcur = ggml_rope_ext(
  5492. ctx0, Kcur, inp_pos, rope_factors,
  5493. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  5494. ext_factor, attn_factor, beta_fast, beta_slow
  5495. );
  5496. } else if (inp_attn_scale) {
  5497. Qcur = ggml_mul(ctx0, Qcur, inp_attn_scale);
  5498. }
  5499. cb(Qcur, "Qcur", il);
  5500. cb(Kcur, "Kcur", il);
  5501. cb(Vcur, "Vcur", il);
  5502. if (use_rope && hparams.use_kq_norm) {
  5503. // Llama4TextL2Norm
  5504. Qcur = ggml_rms_norm(ctx0, Qcur, hparams.f_norm_rms_eps);
  5505. Kcur = ggml_rms_norm(ctx0, Kcur, hparams.f_norm_rms_eps);
  5506. cb(Qcur, "Qcur_normed", il);
  5507. cb(Kcur, "Kcur_normed", il);
  5508. }
  5509. cur = build_attn(inp_attn,
  5510. model.layers[il].wo, model.layers[il].bo,
  5511. Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
  5512. cb(cur, "attn_out", il);
  5513. }
  5514. if (il == n_layer - 1 && inp_out_ids) {
  5515. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  5516. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  5517. }
  5518. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  5519. cb(ffn_inp, "ffn_inp", il);
  5520. // feed-forward network (non-MoE)
  5521. if (model.layers[il].ffn_gate_inp == nullptr) {
  5522. cur = build_norm(ffn_inp,
  5523. model.layers[il].ffn_norm, NULL,
  5524. LLM_NORM_RMS, il);
  5525. cb(cur, "ffn_norm", il);
  5526. cur = build_ffn(cur,
  5527. model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
  5528. model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
  5529. model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
  5530. NULL,
  5531. LLM_FFN_SILU, LLM_FFN_PAR, il);
  5532. cb(cur, "ffn_out", il);
  5533. } else {
  5534. ggml_tensor * ffn_inp_normed = build_norm(ffn_inp,
  5535. model.layers[il].ffn_norm, NULL,
  5536. LLM_NORM_RMS, il);
  5537. cb(cur, "ffn_norm", il);
  5538. ggml_tensor * moe_out = build_moe_ffn(ffn_inp_normed,
  5539. model.layers[il].ffn_gate_inp,
  5540. model.layers[il].ffn_up_exps,
  5541. model.layers[il].ffn_gate_exps,
  5542. model.layers[il].ffn_down_exps,
  5543. nullptr,
  5544. n_expert, n_expert_used,
  5545. LLM_FFN_SILU, false,
  5546. false, 0.0,
  5547. LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID,
  5548. il);
  5549. // Shared experts
  5550. ggml_tensor * shexp_out = build_ffn(ffn_inp_normed,
  5551. model.layers[il].ffn_up_shexp, NULL, NULL,
  5552. model.layers[il].ffn_gate_shexp, NULL, NULL,
  5553. model.layers[il].ffn_down_shexp, NULL, NULL,
  5554. NULL,
  5555. LLM_FFN_SILU, LLM_FFN_PAR, il);
  5556. cb(shexp_out, "ffn_moe_shexp", il);
  5557. cur = ggml_add(ctx0, moe_out, shexp_out);
  5558. cb(cur, "ffn_moe_out_merged", il);
  5559. }
  5560. cur = ggml_add(ctx0, cur, ffn_inp);
  5561. cb(cur, "ffn_out", il);
  5562. cur = build_cvec(cur, il);
  5563. cb(cur, "l_out", il);
  5564. // input for next layer
  5565. inpL = cur;
  5566. }
  5567. cur = inpL;
  5568. cur = build_norm(cur,
  5569. model.output_norm, NULL,
  5570. LLM_NORM_RMS, -1);
  5571. cb(cur, "result_norm", -1);
  5572. res->t_embd = cur;
  5573. // lm_head
  5574. cur = build_lora_mm(model.output, cur);
  5575. cb(cur, "result_output", -1);
  5576. res->t_logits = cur;
  5577. ggml_build_forward_expand(gf, cur);
  5578. }
  5579. };
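// llm_build_deci: Deci/Nemotron decoder with per-layer head and FFN sizes; supports attention-free layers,
// "linear attention" layers (n_head_kv == 0) and FFN-free layers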
  5580. struct llm_build_deci : public llm_graph_context {
  5581. llm_build_deci(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  5582. const int64_t n_embd_head = hparams.n_embd_head_v;
  5583. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  5584. GGML_ASSERT(n_embd_head == hparams.n_rot);
  5585. ggml_tensor * cur;
  5586. ggml_tensor * inpL;
  5587. inpL = build_inp_embd(model.tok_embd);
  5588. // inp_pos - contains the positions
  5589. ggml_tensor * inp_pos = build_inp_pos();
  5590. auto * inp_attn = build_attn_inp_kv();
  5591. const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
  5592. ggml_tensor * inp_out_ids = build_inp_out_ids();
  5593. for (int il = 0; il < n_layer; ++il) {
  5594. ggml_tensor * inpSA = inpL;
  5595. const int64_t n_head_kv = hparams.n_head_kv(il);
  5596. const int64_t n_head = hparams.n_head(il);
  5597. const int64_t n_ff = hparams.n_ff(il);
  5598. if (n_head == 0) {
  5599. // attention-free layer of Llama-3_1-Nemotron-51B
  5600. cur = inpL;
  5601. } else {
  5602. // norm
  5603. cur = build_norm(inpL,
  5604. model.layers[il].attn_norm, NULL,
  5605. LLM_NORM_RMS, il);
  5606. cb(cur, "attn_norm", il);
  5607. }
  5608. if (n_head > 0 && n_head_kv == 0) {
  5609. // "linear attention" of Llama-3_1-Nemotron-51B
  5610. cur = build_lora_mm(model.layers[il].wo, cur);
  5611. cb(cur, "wo", il);
  5612. } else if (n_head > 0) {
  5613. // self-attention
  5614. // rope freq factors for llama3; may return nullptr for llama2 and other models
  5615. ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
  5616. // compute Q and K and RoPE them
  5617. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  5618. cb(Qcur, "Qcur", il);
  5619. if (model.layers[il].bq) {
  5620. Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
  5621. cb(Qcur, "Qcur", il);
  5622. }
  5623. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  5624. cb(Kcur, "Kcur", il);
  5625. if (model.layers[il].bk) {
  5626. Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
  5627. cb(Kcur, "Kcur", il);
  5628. }
  5629. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  5630. cb(Vcur, "Vcur", il);
  5631. if (model.layers[il].bv) {
  5632. Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
  5633. cb(Vcur, "Vcur", il);
  5634. }
  5635. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  5636. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  5637. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  5638. Qcur = ggml_rope_ext(
  5639. ctx0, Qcur, inp_pos, rope_factors,
  5640. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  5641. ext_factor, attn_factor, beta_fast, beta_slow
  5642. );
  5643. Kcur = ggml_rope_ext(
  5644. ctx0, Kcur, inp_pos, rope_factors,
  5645. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  5646. ext_factor, attn_factor, beta_fast, beta_slow
  5647. );
  5648. cb(Qcur, "Qcur", il);
  5649. cb(Kcur, "Kcur", il);
  5650. cb(Vcur, "Vcur", il);
  5651. cur = build_attn(inp_attn,
  5652. model.layers[il].wo, model.layers[il].bo,
  5653. Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
  5654. }
  5655. if (il == n_layer - 1 && inp_out_ids) {
  5656. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  5657. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  5658. }
  5659. // FFN-free layer of Llama-3_1-Nemotron-Ultra-253B
  5660. if (n_ff == 0) {
  5661. continue;
  5662. }
  5663. // modified to support attention-free layer of Llama-3_1-Nemotron-51B
  5664. ggml_tensor * ffn_inp = cur;
  5665. if (n_head > 0) {
  5666. ffn_inp = ggml_add(ctx0, cur, inpSA);
  5667. cb(ffn_inp, "ffn_inp", il);
  5668. }
  5669. // feed-forward network
  5670. if (model.layers[il].ffn_gate_inp == nullptr) {
  5671. cur = build_norm(ffn_inp,
  5672. model.layers[il].ffn_norm, NULL,
  5673. LLM_NORM_RMS, il);
  5674. cb(cur, "ffn_norm", il);
  5675. cur = build_ffn(cur,
  5676. model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
  5677. model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
  5678. model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
  5679. NULL,
  5680. LLM_FFN_SILU, LLM_FFN_PAR, il);
  5681. cb(cur, "ffn_out", il);
  5682. }
  5683. cur = ggml_add(ctx0, cur, ffn_inp);
  5684. cb(cur, "ffn_out", il);
  5685. cur = build_cvec(cur, il);
  5686. cb(cur, "l_out", il);
  5687. // input for next layer
  5688. inpL = cur;
  5689. }
  5690. cur = inpL;
  5691. cur = build_norm(cur,
  5692. model.output_norm, NULL,
  5693. LLM_NORM_RMS, -1);
  5694. cb(cur, "result_norm", -1);
  5695. res->t_embd = cur;
  5696. // lm_head
  5697. cur = build_lora_mm(model.output, cur);
  5698. cb(cur, "result_output", -1);
  5699. res->t_logits = cur;
  5700. ggml_build_forward_expand(gf, cur);
  5701. }
  5702. };
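// llm_build_baichuan: LLaMA-like decoder; the 7B variant applies RoPE to Q/K, the 13B variant leaves them unrotated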
  5703. struct llm_build_baichuan : public llm_graph_context {
  5704. llm_build_baichuan(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  5705. const int64_t n_embd_head = hparams.n_embd_head_v;
  5706. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  5707. GGML_ASSERT(n_embd_head == hparams.n_rot);
  5708. ggml_tensor * cur;
  5709. ggml_tensor * inpL;
  5710. inpL = build_inp_embd(model.tok_embd);
  5711. // inp_pos - contains the positions
  5712. ggml_tensor * inp_pos = model.type == LLM_TYPE_7B ? build_inp_pos() : nullptr;
  5713. auto * inp_attn = build_attn_inp_kv();
  5714. ggml_tensor * inp_out_ids = build_inp_out_ids();
  5715. for (int il = 0; il < n_layer; ++il) {
  5716. ggml_tensor * inpSA = inpL;
  5717. cur = build_norm(inpL,
  5718. model.layers[il].attn_norm, NULL,
  5719. LLM_NORM_RMS, il);
  5720. cb(cur, "attn_norm", il);
  5721. // self-attention
  5722. {
  5723. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  5724. cb(Qcur, "Qcur", il);
  5725. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  5726. cb(Kcur, "Kcur", il);
  5727. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  5728. cb(Vcur, "Vcur", il);
  5729. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  5730. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  5731. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  5732. switch (model.type) {
  5733. case LLM_TYPE_7B:
  5734. Qcur = ggml_rope_ext(
  5735. ctx0, Qcur, inp_pos, nullptr,
  5736. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  5737. ext_factor, attn_factor, beta_fast, beta_slow
  5738. );
  5739. Kcur = ggml_rope_ext(
  5740. ctx0, Kcur, inp_pos, nullptr,
  5741. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  5742. ext_factor, attn_factor, beta_fast, beta_slow
  5743. );
  5744. break;
  5745. case LLM_TYPE_13B:
  5746. break;
  5747. default:
  5748. GGML_ABORT("fatal error");
  5749. }
  5750. cb(Qcur, "Qcur", il);
  5751. cb(Kcur, "Kcur", il);
  5752. cb(Vcur, "Vcur", il);
  5753. cur = build_attn(inp_attn,
  5754. model.layers[il].wo, NULL,
  5755. Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  5756. }
  5757. if (il == n_layer - 1 && inp_out_ids) {
  5758. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  5759. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  5760. }
  5761. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  5762. cb(ffn_inp, "ffn_inp", il);
  5763. // feed-forward network
  5764. {
  5765. cur = build_norm(ffn_inp,
  5766. model.layers[il].ffn_norm, NULL,
  5767. LLM_NORM_RMS, il);
  5768. cb(cur, "ffn_norm", il);
  5769. cur = build_ffn(cur,
  5770. model.layers[il].ffn_up, NULL, NULL,
  5771. model.layers[il].ffn_gate, NULL, NULL,
  5772. model.layers[il].ffn_down, NULL, NULL,
  5773. NULL,
  5774. LLM_FFN_SILU, LLM_FFN_PAR, il);
  5775. cb(cur, "ffn_out", il);
  5776. }
  5777. cur = ggml_add(ctx0, cur, ffn_inp);
  5778. cur = build_cvec(cur, il);
  5779. cb(cur, "l_out", il);
  5780. // input for next layer
  5781. inpL = cur;
  5782. }
  5783. cur = inpL;
  5784. cur = build_norm(cur,
  5785. model.output_norm, NULL,
  5786. LLM_NORM_RMS, -1);
  5787. cb(cur, "result_norm", -1);
  5788. res->t_embd = cur;
  5789. // lm_head
  5790. cur = build_lora_mm(model.output, cur);
  5791. cb(cur, "result_output", -1);
  5792. res->t_logits = cur;
  5793. ggml_build_forward_expand(gf, cur);
  5794. }
  5795. };
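// llm_build_xverse: standard RoPE decoder with RMSNorm and a SwiGLU feed-forward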
  5796. struct llm_build_xverse : public llm_graph_context {
  5797. llm_build_xverse(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  5798. const int64_t n_embd_head = hparams.n_embd_head_v;
  5799. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  5800. GGML_ASSERT(n_embd_head == hparams.n_rot);
  5801. ggml_tensor * cur;
  5802. ggml_tensor * inpL;
  5803. inpL = build_inp_embd(model.tok_embd);
  5804. // inp_pos - contains the positions
  5805. ggml_tensor * inp_pos = build_inp_pos();
  5806. auto * inp_attn = build_attn_inp_kv();
  5807. ggml_tensor * inp_out_ids = build_inp_out_ids();
  5808. for (int il = 0; il < n_layer; ++il) {
  5809. ggml_tensor * inpSA = inpL;
  5810. cur = build_norm(inpL,
  5811. model.layers[il].attn_norm, NULL,
  5812. LLM_NORM_RMS, il);
  5813. cb(cur, "attn_norm", il);
  5814. // self-attention
  5815. {
  5816. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  5817. cb(Qcur, "Qcur", il);
  5818. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  5819. cb(Kcur, "Kcur", il);
  5820. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  5821. cb(Vcur, "Vcur", il);
  5822. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  5823. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  5824. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  5825. Qcur = ggml_rope_ext(
  5826. ctx0, Qcur, inp_pos, nullptr,
  5827. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  5828. ext_factor, attn_factor, beta_fast, beta_slow
  5829. );
  5830. Kcur = ggml_rope_ext(
  5831. ctx0, Kcur, inp_pos, nullptr,
  5832. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  5833. ext_factor, attn_factor, beta_fast, beta_slow
  5834. );
  5835. cb(Qcur, "Qcur", il);
  5836. cb(Kcur, "Kcur", il);
  5837. cb(Vcur, "Vcur", il);
  5838. cur = build_attn(inp_attn,
  5839. model.layers[il].wo, NULL,
  5840. Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  5841. }
  5842. if (il == n_layer - 1 && inp_out_ids) {
  5843. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  5844. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  5845. }
  5846. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  5847. cb(ffn_inp, "ffn_inp", il);
  5848. // feed-forward network
  5849. {
  5850. cur = build_norm(ffn_inp,
  5851. model.layers[il].ffn_norm, NULL,
  5852. LLM_NORM_RMS, il);
  5853. cb(cur, "ffn_norm", il);
  5854. cur = build_ffn(cur,
  5855. model.layers[il].ffn_up, NULL, NULL,
  5856. model.layers[il].ffn_gate, NULL, NULL,
  5857. model.layers[il].ffn_down, NULL, NULL,
  5858. NULL,
  5859. LLM_FFN_SILU, LLM_FFN_PAR, il);
  5860. cb(cur, "ffn_out", il);
  5861. }
  5862. cur = ggml_add(ctx0, cur, ffn_inp);
  5863. cur = build_cvec(cur, il);
  5864. cb(cur, "l_out", il);
  5865. // input for next layer
  5866. inpL = cur;
  5867. }
  5868. cur = inpL;
  5869. cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
  5870. cb(cur, "result_norm", -1);
  5871. res->t_embd = cur;
  5872. // lm_head
  5873. cur = build_lora_mm(model.output, cur);
  5874. cb(cur, "result_output", -1);
  5875. res->t_logits = cur;
  5876. ggml_build_forward_expand(gf, cur);
  5877. }
  5878. };
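// llm_build_falcon: fused QKV projection with NeoX-style RoPE; attention and FFN run in parallel off the
// attention norm (Falcon-40B adds a second norm), and both are added back to the layer input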
  5879. struct llm_build_falcon : public llm_graph_context {
  5880. llm_build_falcon(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  5881. const int64_t n_embd_head = hparams.n_embd_head_v;
  5882. const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
  5883. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  5884. GGML_ASSERT(n_embd_head == hparams.n_rot);
  5885. ggml_tensor * cur;
  5886. ggml_tensor * inpL;
  5887. inpL = build_inp_embd(model.tok_embd);
  5888. // inp_pos - contains the positions
  5889. ggml_tensor * inp_pos = build_inp_pos();
  5890. auto * inp_attn = build_attn_inp_kv();
  5891. ggml_tensor * inp_out_ids = build_inp_out_ids();
  5892. for (int il = 0; il < n_layer; ++il) {
  5893. ggml_tensor * attn_norm;
  5894. attn_norm = build_norm(inpL,
  5895. model.layers[il].attn_norm,
  5896. model.layers[il].attn_norm_b,
  5897. LLM_NORM, il);
  5898. cb(attn_norm, "attn_norm", il);
  5899. // self-attention
  5900. {
  5901. if (model.layers[il].attn_norm_2) {
  5902. // Falcon-40B
  5903. cur = build_norm(inpL,
  5904. model.layers[il].attn_norm_2,
  5905. model.layers[il].attn_norm_2_b,
  5906. LLM_NORM, il);
  5907. cb(cur, "attn_norm_2", il);
  5908. } else {
  5909. cur = attn_norm;
  5910. }
  5911. cur = build_lora_mm(model.layers[il].wqkv, cur);
  5912. cb(cur, "wqkv", il);
  5913. ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
  5914. ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
  5915. ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
  5916. // using mode = 2 for neox mode
  5917. Qcur = ggml_rope_ext(
  5918. ctx0, Qcur, inp_pos, nullptr,
  5919. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  5920. ext_factor, attn_factor, beta_fast, beta_slow
  5921. );
  5922. Kcur = ggml_rope_ext(
  5923. ctx0, Kcur, inp_pos, nullptr,
  5924. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  5925. ext_factor, attn_factor, beta_fast, beta_slow
  5926. );
  5927. cb(Qcur, "Qcur", il);
  5928. cb(Kcur, "Kcur", il);
  5929. cb(Vcur, "Vcur", il);
  5930. cur = build_attn(inp_attn,
  5931. model.layers[il].wo, NULL,
  5932. Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  5933. }
  5934. if (il == n_layer - 1 && inp_out_ids) {
  5935. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  5936. inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
  5937. attn_norm = ggml_get_rows(ctx0, attn_norm, inp_out_ids);
  5938. }
  5939. ggml_tensor * ffn_inp = cur;
  5940. // feed forward
  5941. {
  5942. cur = build_ffn(attn_norm, // !! use the attn norm, not the result
  5943. model.layers[il].ffn_up, NULL, NULL,
  5944. NULL, NULL, NULL,
  5945. model.layers[il].ffn_down, NULL, NULL,
  5946. NULL,
  5947. LLM_FFN_GELU, LLM_FFN_SEQ, il);
  5948. cb(cur, "ffn_out", il);
  5949. }
  5950. cur = ggml_add(ctx0, cur, ffn_inp);
  5951. cur = ggml_add(ctx0, cur, inpL);
  5952. cur = build_cvec(cur, il);
  5953. cb(cur, "l_out", il);
  5954. // input for next layer
  5955. inpL = cur;
  5956. }
  5957. cur = inpL;
  5958. // norm
  5959. cur = build_norm(cur,
  5960. model.output_norm,
  5961. model.output_norm_b,
  5962. LLM_NORM, -1);
  5963. cb(cur, "result_norm", -1);
  5964. res->t_embd = cur;
  5965. cur = build_lora_mm(model.output, cur);
  5966. cb(cur, "result_output", -1);
  5967. res->t_logits = cur;
  5968. ggml_build_forward_expand(gf, cur);
  5969. }
  5970. };
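// llm_build_grok: MoE decoder with post-attention and post-FFN norms, GELU experts (optionally merged with
// a dense FFN), logit scaling and optional final logit soft-capping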
  5971. struct llm_build_grok : public llm_graph_context {
  5972. llm_build_grok(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  5973. const int64_t n_embd_head = hparams.n_embd_head_v;
  5974. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  5975. GGML_ASSERT(n_embd_head == hparams.n_rot);
  5976. ggml_tensor * cur;
  5977. ggml_tensor * inpL;
  5978. inpL = build_inp_embd(model.tok_embd);
  5979. // inp_pos - contains the positions
  5980. ggml_tensor * inp_pos = build_inp_pos();
  5981. auto * inp_attn = build_attn_inp_kv();
  5982. ggml_tensor * inp_out_ids = build_inp_out_ids();
  5983. for (int il = 0; il < n_layer; ++il) {
  5984. ggml_tensor * inpSA = inpL;
  5985. // norm
  5986. cur = build_norm(inpL,
  5987. model.layers[il].attn_norm, NULL,
  5988. LLM_NORM_RMS, il);
  5989. cb(cur, "attn_norm", il);
  5990. // self-attention
  5991. {
  5992. // compute Q and K and RoPE them
  5993. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  5994. cb(Qcur, "Qcur", il);
  5995. if (model.layers[il].bq) {
  5996. Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
  5997. cb(Qcur, "Qcur", il);
  5998. }
  5999. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  6000. cb(Kcur, "Kcur", il);
  6001. if (model.layers[il].bk) {
  6002. Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
  6003. cb(Kcur, "Kcur", il);
  6004. }
  6005. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  6006. cb(Vcur, "Vcur", il);
  6007. if (model.layers[il].bv) {
  6008. Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
  6009. cb(Vcur, "Vcur", il);
  6010. }
  6011. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  6012. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  6013. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  6014. Qcur = ggml_rope_ext(
  6015. ctx0, Qcur, inp_pos, nullptr,
  6016. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  6017. ext_factor, attn_factor, beta_fast, beta_slow
  6018. );
  6019. Kcur = ggml_rope_ext(
  6020. ctx0, Kcur, inp_pos, nullptr,
  6021. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  6022. ext_factor, attn_factor, beta_fast, beta_slow
  6023. );
  6024. cb(Qcur, "Qcur", il);
  6025. cb(Kcur, "Kcur", il);
  6026. cb(Vcur, "Vcur", il);
  6027. cur = build_attn(inp_attn,
  6028. model.layers[il].wo, model.layers[il].bo,
  6029. Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f, il);
  6030. }
  6031. if (il == n_layer - 1 && inp_out_ids) {
  6032. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  6033. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  6034. }
  6035. cur = build_norm(cur,
  6036. model.layers[il].attn_out_norm, NULL,
  6037. LLM_NORM_RMS, il);
  6038. cb(cur, "attn_out_norm", il);
  6039. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  6040. cb(ffn_inp, "ffn_inp", il);
  6041. // feed-forward network
  6042. cur = build_norm(ffn_inp,
  6043. model.layers[il].ffn_norm, NULL,
  6044. LLM_NORM_RMS, il);
  6045. cb(cur, "ffn_norm", il);
  6046. // MoE branch
  6047. ggml_tensor * moe_out = build_moe_ffn(cur,
  6048. model.layers[il].ffn_gate_inp,
  6049. model.layers[il].ffn_up_exps,
  6050. model.layers[il].ffn_gate_exps,
  6051. model.layers[il].ffn_down_exps,
  6052. nullptr,
  6053. n_expert, n_expert_used,
  6054. LLM_FFN_GELU, true,
  6055. false, 0.0,
  6056. LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
  6057. il);
  6058. cb(moe_out, "ffn_moe_out", il);
  6059. if (model.layers[il].ffn_up) {
  6060. ggml_tensor * ffn_out = build_ffn(cur,
  6061. model.layers[il].ffn_up, NULL, NULL,
  6062. model.layers[il].ffn_gate, NULL, NULL,
  6063. model.layers[il].ffn_down, NULL, NULL,
  6064. NULL,
  6065. LLM_FFN_GELU, LLM_FFN_PAR, il);
  6066. cb(ffn_out, "ffn_out", il);
  6067. cur = ggml_scale(ctx0, ggml_add(ctx0, ffn_out, moe_out), std::sqrt(2) / 2);
  6068. cb(cur, "ffn_out", il);
  6069. } else {
  6070. cur = moe_out;
  6071. }
  6072. cur = build_norm(cur,
  6073. model.layers[il].ffn_post_norm, NULL,
  6074. LLM_NORM_RMS, il);
  6075. cb(cur, "ffn_post_norm", il);
  6076. cur = ggml_add(ctx0, cur, ffn_inp);
  6077. cb(cur, "ffn_out", il);
  6078. cur = build_cvec(cur, il);
  6079. cb(cur, "l_out", il);
  6080. // input for next layer
  6081. inpL = cur;
  6082. }
  6083. cur = inpL;
  6084. cur = build_norm(cur,
  6085. model.output_norm, NULL,
  6086. LLM_NORM_RMS, -1);
  6087. cb(cur, "result_norm", -1);
  6088. res->t_embd = cur;
  6089. // lm_head
  6090. cur = build_lora_mm(model.output, cur);
  6091. cur = ggml_scale(ctx0, cur, hparams.f_logit_scale);
  6092. // final logit soft-capping
  6093. if (hparams.f_final_logit_softcapping) {
  6094. cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_final_logit_softcapping);
  6095. cur = ggml_tanh(ctx0, cur);
  6096. cur = ggml_scale(ctx0, cur, hparams.f_final_logit_softcapping);
  6097. }
  6098. cb(cur, "result_output", -1);
  6099. res->t_logits = cur;
  6100. ggml_build_forward_expand(gf, cur);
  6101. }
  6102. };
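// llm_build_dbrx: fused QKV projection clamped to +/- f_clamp_kqv, non-RMS LayerNorm, SiLU MoE feed-forward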
  6103. struct llm_build_dbrx : public llm_graph_context {
  6104. llm_build_dbrx(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  6105. const int64_t n_embd_head = hparams.n_embd_head_v;
  6106. const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
  6107. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  6108. GGML_ASSERT(n_embd_head == hparams.n_rot);
  6109. ggml_tensor * cur;
  6110. ggml_tensor * inpL;
  6111. inpL = build_inp_embd(model.tok_embd);
  6112. // inp_pos - contains the positions
  6113. ggml_tensor * inp_pos = build_inp_pos();
  6114. auto * inp_attn = build_attn_inp_kv();
  6115. ggml_tensor * inp_out_ids = build_inp_out_ids();
  6116. for (int il = 0; il < n_layer; ++il) {
  6117. ggml_tensor * inpSA = inpL;
  6118. // norm
  6119. cur = build_norm(inpL,
  6120. model.layers[il].attn_norm, NULL,
  6121. LLM_NORM, il);
  6122. cb(cur, "attn_norm", il);
  6123. // self-attention
  6124. {
  6125. ggml_tensor * Qcur = nullptr;
  6126. ggml_tensor * Kcur = nullptr;
  6127. ggml_tensor * Vcur = nullptr;
  6128. cur = build_lora_mm(model.layers[il].wqkv, cur);
  6129. cb(cur, "wqkv", il);
  6130. cur = ggml_clamp(ctx0, cur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
  6131. cb(cur, "wqkv_clamped", il);
  6132. Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
  6133. Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
  6134. Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
  6135. Qcur = ggml_rope_ext(
  6136. ctx0, Qcur, inp_pos, nullptr,
  6137. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  6138. ext_factor, attn_factor, beta_fast, beta_slow
  6139. );
  6140. Kcur = ggml_rope_ext(
  6141. ctx0, Kcur, inp_pos, nullptr,
  6142. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  6143. ext_factor, attn_factor, beta_fast, beta_slow
  6144. );
  6145. cb(Qcur, "Qcur", il);
  6146. cb(Kcur, "Kcur", il);
  6147. cb(Vcur, "Vcur", il);
  6148. cur = build_attn(inp_attn,
  6149. model.layers[il].wo, NULL,
  6150. Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  6151. }
  6152. if (il == n_layer - 1 && inp_out_ids) {
  6153. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  6154. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  6155. }
  6156. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  6157. cb(ffn_inp, "ffn_inp", il);
  6158. // feed-forward network
  6159. // MoE branch
  6160. cur = build_norm(ffn_inp,
  6161. model.layers[il].attn_out_norm, NULL,
  6162. LLM_NORM, il);
  6163. cb(cur, "attn_out_norm", il);
  6164. cur = build_moe_ffn(cur,
  6165. model.layers[il].ffn_gate_inp,
  6166. model.layers[il].ffn_up_exps,
  6167. model.layers[il].ffn_gate_exps,
  6168. model.layers[il].ffn_down_exps,
  6169. nullptr,
  6170. n_expert, n_expert_used,
  6171. LLM_FFN_SILU, true,
  6172. false, 0.0,
  6173. LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
  6174. il);
  6175. cb(cur, "ffn_moe_out", il);
  6176. cur = ggml_add(ctx0, cur, ffn_inp);
  6177. cb(cur, "ffn_out", il);
  6178. cur = build_cvec(cur, il);
  6179. cb(cur, "l_out", il);
  6180. // input for next layer
  6181. inpL = cur;
  6182. }
  6183. cur = inpL;
  6184. cur = build_norm(cur,
  6185. model.output_norm, NULL,
  6186. LLM_NORM, -1);
  6187. cb(cur, "result_norm", -1);
  6188. res->t_embd = cur;
  6189. // lm_head
  6190. cur = build_lora_mm(model.output, cur);
  6191. cb(cur, "result_output", -1);
  6192. res->t_logits = cur;
  6193. ggml_build_forward_expand(gf, cur);
  6194. }
  6195. };
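// llm_build_starcoder: learned absolute position embeddings added to the token embeddings,
// fused QKV with biases, GELU feed-forward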
  6196. struct llm_build_starcoder : public llm_graph_context {
  6197. llm_build_starcoder(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  6198. const int64_t n_embd_head = hparams.n_embd_head_v;
  6199. const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
  6200. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  6201. ggml_tensor * cur;
  6202. ggml_tensor * inpL;
  6203. inpL = build_inp_embd(model.tok_embd);
  6204. // inp_pos - contains the positions
  6205. ggml_tensor * inp_pos = build_inp_pos();
  6206. auto * inp_attn = build_attn_inp_kv();
  6207. ggml_tensor * pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos);
  6208. cb(pos, "pos_embd", -1);
  6209. inpL = ggml_add(ctx0, inpL, pos);
  6210. cb(inpL, "inpL", -1);
  6211. ggml_tensor * inp_out_ids = build_inp_out_ids();
  6212. for (int il = 0; il < n_layer; ++il) {
  6213. cur = build_norm(inpL,
  6214. model.layers[il].attn_norm,
  6215. model.layers[il].attn_norm_b,
  6216. LLM_NORM, il);
  6217. cb(cur, "attn_norm", il);
  6218. // self-attention
  6219. {
  6220. cur = build_lora_mm(model.layers[il].wqkv, cur);
  6221. cb(cur, "wqkv", il);
  6222. cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
  6223. cb(cur, "bqkv", il);
  6224. ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
  6225. ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
  6226. ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
  6227. cb(Qcur, "Qcur", il);
  6228. cb(Kcur, "Kcur", il);
  6229. cb(Vcur, "Vcur", il);
  6230. cur = build_attn(inp_attn,
  6231. model.layers[il].wo, model.layers[il].bo,
  6232. Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  6233. }
  6234. if (il == n_layer - 1 && inp_out_ids) {
  6235. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  6236. inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
  6237. }
  6238. // add the input
  6239. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
  6240. cb(ffn_inp, "ffn_inp", il);
  6241. // FF
  6242. {
  6243. cur = build_norm(ffn_inp,
  6244. model.layers[il].ffn_norm,
  6245. model.layers[il].ffn_norm_b,
  6246. LLM_NORM, il);
  6247. cb(cur, "ffn_norm", il);
  6248. cur = build_ffn(cur,
  6249. model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
  6250. NULL, NULL, NULL,
  6251. model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
  6252. NULL,
  6253. LLM_FFN_GELU, LLM_FFN_SEQ, il);
  6254. cb(cur, "ffn_out", il);
  6255. }
  6256. cur = ggml_add(ctx0, cur, ffn_inp);
  6257. cur = build_cvec(cur, il);
  6258. cb(cur, "l_out", il);
  6259. // input for next layer
  6260. inpL = cur;
  6261. }
  6262. cur = build_norm(inpL,
  6263. model.output_norm,
  6264. model.output_norm_b,
  6265. LLM_NORM, -1);
  6266. cb(cur, "result_norm", -1);
  6267. res->t_embd = cur;
  6268. cur = build_lora_mm(model.output, cur);
  6269. cb(cur, "result_output", -1);
  6270. res->t_logits = cur;
  6271. ggml_build_forward_expand(gf, cur);
  6272. }
  6273. };
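// llm_build_refact: RMSNorm + SwiGLU decoder; no position inputs are built and no RoPE is applied to Q/K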
  6274. struct llm_build_refact : public llm_graph_context {
  6275. llm_build_refact(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  6276. const int64_t n_embd_head = hparams.n_embd_head_v;
  6277. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  6278. ggml_tensor * cur;
  6279. ggml_tensor * inpL;
  6280. inpL = build_inp_embd(model.tok_embd);
  6281. auto * inp_attn = build_attn_inp_kv();
  6282. ggml_tensor * inp_out_ids = build_inp_out_ids();
  6283. for (int il = 0; il < n_layer; ++il) {
  6284. ggml_tensor * inpSA = inpL;
  6285. cur = build_norm(inpL,
  6286. model.layers[il].attn_norm, NULL,
  6287. LLM_NORM_RMS, il);
  6288. cb(cur, "attn_norm", il);
  6289. // self-attention
  6290. {
  6291. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  6292. cb(Qcur, "Qcur", il);
  6293. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  6294. cb(Kcur, "Kcur", il);
  6295. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  6296. cb(Vcur, "Vcur", il);
  6297. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  6298. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  6299. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  6300. cb(Qcur, "Qcur", il);
  6301. cb(Kcur, "Kcur", il);
  6302. cb(Vcur, "Vcur", il);
  6303. cur = build_attn(inp_attn,
  6304. model.layers[il].wo, NULL,
  6305. Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  6306. }
  6307. if (il == n_layer - 1 && inp_out_ids) {
  6308. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  6309. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  6310. }
  6311. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  6312. cb(ffn_inp, "ffn_inp", il);
  6313. // feed-forward network
  6314. {
  6315. cur = build_norm(ffn_inp,
  6316. model.layers[il].ffn_norm, NULL,
  6317. LLM_NORM_RMS, il);
  6318. cb(cur, "ffn_norm", il);
  6319. cur = build_ffn(cur,
  6320. model.layers[il].ffn_up, NULL, NULL,
  6321. model.layers[il].ffn_gate, NULL, NULL,
  6322. model.layers[il].ffn_down, NULL, NULL,
  6323. NULL,
  6324. LLM_FFN_SILU, LLM_FFN_PAR, il);
  6325. cb(cur, "ffn_out", il);
  6326. }
  6327. cur = ggml_add(ctx0, cur, ffn_inp);
  6328. cur = build_cvec(cur, il);
  6329. cb(cur, "l_out", il);
  6330. // input for next layer
  6331. inpL = cur;
  6332. }
  6333. cur = inpL;
  6334. cur = build_norm(cur,
  6335. model.output_norm, NULL,
  6336. LLM_NORM_RMS, -1);
  6337. cb(cur, "result_norm", -1);
  6338. res->t_embd = cur;
  6339. // lm_head
  6340. cur = build_lora_mm(model.output, cur);
  6341. cb(cur, "result_output", -1);
  6342. res->t_logits = cur;
  6343. ggml_build_forward_expand(gf, cur);
  6344. }
  6345. };
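// llm_build_bert: encoder-style graph (no KV cache) for the BERT family; combines token/type/position
// embeddings, optional Q/K norms, RoPE for the NOMIC/JINA variants, and per-architecture FFN choices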
  6346. struct llm_build_bert : public llm_graph_context {
  6347. llm_build_bert(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  6348. const int64_t n_embd_head = hparams.n_embd_head_v;
  6349. const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
  6350. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  6351. ggml_tensor * cur;
  6352. ggml_tensor * inpL;
  6353. ggml_tensor * inp_pos = nullptr;
  6354. if (model.arch != LLM_ARCH_JINA_BERT_V2) {
  6355. inp_pos = build_inp_pos();
  6356. }
  6357. // construct input embeddings (token, type, position)
  6358. inpL = build_inp_embd(model.tok_embd);
  6359. // token types are hardcoded to zero ("Sentence A")
  6360. if (model.type_embd) {
  6361. ggml_tensor * type_row0 = ggml_view_1d(ctx0, model.type_embd, n_embd, 0);
  6362. inpL = ggml_add(ctx0, inpL, type_row0);
  6363. }
  6364. if (model.arch == LLM_ARCH_BERT) {
  6365. inpL = ggml_add(ctx0, ggml_get_rows(ctx0, model.pos_embd, inp_pos), inpL);
  6366. }
  6367. cb(inpL, "inp_embd", -1);
  6368. // embed layer norm
  6369. inpL = build_norm(inpL, model.tok_norm, model.tok_norm_b, LLM_NORM, -1);
  6370. cb(inpL, "inp_norm", -1);
  6371. auto * inp_attn = build_attn_inp_no_cache();
  6372. ggml_tensor * inp_out_ids = build_inp_out_ids();
  6373. for (int il = 0; il < n_layer; ++il) {
  6374. ggml_tensor * cur = inpL;
  6375. {
  6376. ggml_tensor * Qcur;
  6377. ggml_tensor * Kcur;
  6378. ggml_tensor * Vcur;
  6379. // self-attention
  6380. if (model.layers[il].wqkv) {
  6381. cur = build_lora_mm(model.layers[il].wqkv, cur);
  6382. cb(cur, "wqkv", il);
  6383. if (model.layers[il].bqkv) {
  6384. cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
  6385. cb(cur, "bqkv", il);
  6386. }
  6387. Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
  6388. Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
  6389. Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
  6390. } else {
  6391. Qcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wq, cur), model.layers[il].bq);
  6392. Kcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wk, cur), model.layers[il].bk);
  6393. Vcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wv, cur), model.layers[il].bv);
  6394. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  6395. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  6396. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  6397. }
  6398. if (model.layers[il].attn_q_norm) {
  6399. Qcur = build_norm(Qcur,
  6400. model.layers[il].attn_q_norm,
  6401. model.layers[il].attn_q_norm_b,
  6402. LLM_NORM, il);
  6403. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  6404. }
  6405. if (model.layers[il].attn_k_norm) {
  6406. Kcur = build_norm(Kcur,
  6407. model.layers[il].attn_k_norm,
  6408. model.layers[il].attn_k_norm_b,
  6409. LLM_NORM, il);
  6410. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  6411. }
  6412. // RoPE
  6413. if (model.arch == LLM_ARCH_NOMIC_BERT || model.arch == LLM_ARCH_NOMIC_BERT_MOE || model.arch == LLM_ARCH_JINA_BERT_V3) {
  6414. Qcur = ggml_rope_ext(
  6415. ctx0, Qcur, inp_pos, nullptr,
  6416. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  6417. ext_factor, attn_factor, beta_fast, beta_slow
  6418. );
  6419. Kcur = ggml_rope_ext(
  6420. ctx0, Kcur, inp_pos, nullptr,
  6421. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  6422. ext_factor, attn_factor, beta_fast, beta_slow
  6423. );
  6424. }
  6425. cb(Qcur, "Qcur", il);
  6426. cb(Kcur, "Kcur", il);
  6427. cb(Vcur, "Vcur", il);
  6428. cur = build_attn(inp_attn,
  6429. model.layers[il].wo, model.layers[il].bo,
  6430. Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  6431. cb(cur, "kqv_out", il);
  6432. }
  6433. if (il == n_layer - 1 && inp_out_ids) {
  6434. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  6435. inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
  6436. }
  6437. // re-add the layer input
  6438. cur = ggml_add(ctx0, cur, inpL);
  6439. // attention layer norm
  6440. cur = build_norm(cur, model.layers[il].attn_out_norm, model.layers[il].attn_out_norm_b, LLM_NORM, il);
  6441. if (model.layers[il].attn_norm_2 != nullptr) {
  6442. cur = ggml_add(ctx0, cur, inpL); // re-add the layer input
  6443. cur = build_norm(cur, model.layers[il].attn_norm_2, model.layers[il].attn_norm_2_b, LLM_NORM, il);
  6444. }
  6445. ggml_tensor * ffn_inp = cur;
  6446. cb(ffn_inp, "ffn_inp", il);
  6447. // feed-forward network
  6448. if (hparams.moe_every_n_layers > 0 && il % hparams.moe_every_n_layers == 1) {
  6449. // MoE branch
  6450. cur = build_moe_ffn(cur,
  6451. model.layers[il].ffn_gate_inp,
  6452. model.layers[il].ffn_up_exps,
  6453. nullptr,
  6454. model.layers[il].ffn_down_exps,
  6455. nullptr,
  6456. hparams.n_expert,
  6457. hparams.n_expert_used,
  6458. LLM_FFN_GELU,
  6459. false, false,
  6460. 0.0f,
  6461. LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, il);
  6462. cb(cur, "ffn_moe_out", il);
  6463. } else if (model.arch == LLM_ARCH_BERT || model.arch == LLM_ARCH_NOMIC_BERT_MOE || model.arch == LLM_ARCH_JINA_BERT_V3) {
  6464. cur = build_ffn(cur,
  6465. model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
  6466. NULL, NULL, NULL,
  6467. model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
  6468. NULL,
  6469. LLM_FFN_GELU, LLM_FFN_SEQ, il);
  6470. cb(cur, "ffn_out", il);
  6471. } else if (model.arch == LLM_ARCH_JINA_BERT_V2) {
  6472. cur = build_ffn(cur,
  6473. model.layers[il].ffn_up, NULL, NULL,
  6474. model.layers[il].ffn_gate, NULL, NULL,
  6475. model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
  6476. NULL,
  6477. model.layers[il].ffn_gate ? LLM_FFN_GELU : LLM_FFN_GEGLU, LLM_FFN_PAR, il);
  6478. cb(cur, "ffn_out", il);
  6479. } else {
  6480. cur = build_ffn(cur,
  6481. model.layers[il].ffn_up, NULL, NULL,
  6482. model.layers[il].ffn_gate, NULL, NULL,
  6483. model.layers[il].ffn_down, NULL, NULL,
  6484. NULL,
  6485. LLM_FFN_SILU, LLM_FFN_PAR, il);
  6486. cb(cur, "ffn_out", il);
  6487. }
  6488. // attentions bypass the intermediate layer
  6489. cur = ggml_add(ctx0, cur, ffn_inp);
  6490. // output layer norm
  6491. cur = build_norm(cur, model.layers[il].layer_out_norm, model.layers[il].layer_out_norm_b, LLM_NORM, il);
  6492. // input for next layer
  6493. inpL = cur;
  6494. }
  6495. cur = inpL;
  6496. cb(cur, "result_embd", -1);
  6497. res->t_embd = cur;
  6498. ggml_build_forward_expand(gf, cur);
  6499. }
  6500. };
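// llm_build_neo_bert: pre-norm encoder (no KV cache) with fused QKV, RoPE and a SwiGLU feed-forward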
  6501. struct llm_build_neo_bert : public llm_graph_context {
  6502. llm_build_neo_bert(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  6503. const int64_t n_embd_head = hparams.n_embd_head_v;
  6504. const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
  6505. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  6506. ggml_tensor * cur;
  6507. ggml_tensor * inpL;
  6508. ggml_tensor * inp_pos = build_inp_pos();
  6509. // construct input embeddings (token, type, position)
  6510. inpL = build_inp_embd(model.tok_embd);
  6511. cb(inpL, "inp_embd", -1);
  6512. auto * inp_attn = build_attn_inp_no_cache();
  6513. ggml_tensor * inp_out_ids = build_inp_out_ids();
  6514. for (int il = 0; il < n_layer; ++il) {
  6515. ggml_tensor * cur = inpL;
  6516. // pre-norm
  6517. cur = build_norm(inpL,
  6518. model.layers[il].attn_norm, NULL,
  6519. LLM_NORM_RMS, il);
  6520. {
  6521. ggml_tensor * Qcur;
  6522. ggml_tensor * Kcur;
  6523. ggml_tensor * Vcur;
  6524. // self-attention
  6525. cur = build_lora_mm(model.layers[il].wqkv, cur);
  6526. cb(cur, "wqkv", il);
  6527. Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
  6528. Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
  6529. Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
  6530. // RoPE
  6531. Qcur = ggml_rope_ext(
  6532. ctx0, Qcur, inp_pos, nullptr,
  6533. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  6534. ext_factor, attn_factor, beta_fast, beta_slow
  6535. );
  6536. Kcur = ggml_rope_ext(
  6537. ctx0, Kcur, inp_pos, nullptr,
  6538. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  6539. ext_factor, attn_factor, beta_fast, beta_slow
  6540. );
  6541. cb(Qcur, "Qcur", il);
  6542. cb(Kcur, "Kcur", il);
  6543. cb(Vcur, "Vcur", il);
  6544. cur = build_attn(inp_attn,
  6545. model.layers[il].wo, nullptr,
  6546. Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  6547. cb(cur, "kqv_out", il);
  6548. }
  6549. if (il == n_layer - 1 && inp_out_ids) {
  6550. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  6551. inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
  6552. }
  6553. // re-add the layer input
  6554. cur = ggml_add(ctx0, cur, inpL);
  6555. ggml_tensor * ffn_inp = cur;
  6556. cb(ffn_inp, "ffn_inp", il);
  6557. // pre-norm
  6558. cur = build_norm(ffn_inp,
  6559. model.layers[il].ffn_norm, NULL,
  6560. LLM_NORM_RMS, il);
  6561. cb(cur, "ffn_norm", il);
  6562. // feed-forward network
  6563. cur = build_ffn(cur,
  6564. model.layers[il].ffn_up,
  6565. NULL, NULL, NULL, NULL, NULL,
  6566. model.layers[il].ffn_down,
  6567. NULL, NULL, NULL,
  6568. LLM_FFN_SWIGLU, LLM_FFN_SEQ, il);
  6569. // attentions bypass the intermediate layer
  6570. cur = ggml_add(ctx0, cur, ffn_inp);
  6571. // input for next layer
  6572. inpL = cur;
  6573. }
  6574. cur = inpL;
  6575. cur = build_norm(cur,
  6576. model.output_norm_enc, NULL,
  6577. LLM_NORM_RMS, -1);
  6578. cb(cur, "result_embd", -1);
  6579. res->t_embd = cur;
  6580. ggml_build_forward_expand(gf, cur);
  6581. }
  6582. };
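// llm_build_bloom: embedding LayerNorm (tok_norm), fused QKV with biases, GELU feed-forward; no RoPE is applied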
  6583. struct llm_build_bloom : public llm_graph_context {
  6584. llm_build_bloom(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  6585. const int64_t n_embd_head = hparams.n_embd_head_v;
  6586. const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
  6587. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  6588. ggml_tensor * cur;
  6589. ggml_tensor * inpL;
  6590. inpL = build_inp_embd(model.tok_embd);
  6591. auto * inp_attn = build_attn_inp_kv();
  6592. inpL = build_norm(inpL,
  6593. model.tok_norm,
  6594. model.tok_norm_b,
  6595. LLM_NORM, -1);
  6596. cb(inpL, "inp_norm", -1);
  6597. ggml_tensor * inp_out_ids = build_inp_out_ids();
  6598. for (int il = 0; il < n_layer; ++il) {
  6599. cur = build_norm(inpL,
  6600. model.layers[il].attn_norm,
  6601. model.layers[il].attn_norm_b,
  6602. LLM_NORM, il);
  6603. cb(cur, "attn_norm", il);
  6604. // self-attention
  6605. {
  6606. cur = build_lora_mm(model.layers[il].wqkv, cur);
  6607. cb(cur, "wqkv", il);
  6608. cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
  6609. cb(cur, "bqkv", il);
  6610. ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
  6611. ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
  6612. ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
  6613. cb(Qcur, "Qcur", il);
  6614. cb(Kcur, "Kcur", il);
  6615. cb(Vcur, "Vcur", il);
  6616. cur = build_attn(inp_attn,
  6617. model.layers[il].wo, model.layers[il].bo,
  6618. Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  6619. }
  6620. if (il == n_layer - 1 && inp_out_ids) {
  6621. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  6622. inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
  6623. }
  6624. // Add the input
  6625. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
  6626. cb(ffn_inp, "ffn_inp", il);
  6627. // FF
  6628. {
  6629. cur = build_norm(ffn_inp,
  6630. model.layers[il].ffn_norm,
  6631. model.layers[il].ffn_norm_b,
  6632. LLM_NORM, il);
  6633. cb(cur, "ffn_norm", il);
  6634. cur = build_ffn(cur,
  6635. model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
  6636. NULL, NULL, NULL,
  6637. model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
  6638. NULL,
  6639. LLM_FFN_GELU, LLM_FFN_SEQ, il);
  6640. cb(cur, "ffn_out", il);
  6641. }
  6642. cur = ggml_add(ctx0, cur, ffn_inp);
  6643. cur = build_cvec(cur, il);
  6644. cb(cur, "l_out", il);
  6645. // input for next layer
  6646. inpL = cur;
  6647. }
  6648. cur = build_norm(inpL,
  6649. model.output_norm,
  6650. model.output_norm_b,
  6651. LLM_NORM, -1);
  6652. cb(cur, "result_norm", -1);
  6653. res->t_embd = cur;
  6654. cur = build_lora_mm(model.output, cur);
  6655. cb(cur, "result_output", -1);
  6656. res->t_logits = cur;
  6657. ggml_build_forward_expand(gf, cur);
  6658. }
  6659. };
  6660. struct llm_build_mpt : public llm_graph_context {
  6661. llm_build_mpt(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  6662. const int64_t n_embd_head = hparams.n_embd_head_v;
  6663. const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
  6664. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  6665. ggml_tensor * cur;
  6666. ggml_tensor * pos;
  6667. ggml_tensor * inpL;
  6668. inpL = build_inp_embd(model.tok_embd);
  6669. auto * inp_attn = build_attn_inp_kv();
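// some MPT variants use learned position embeddings; when present they are added to the token embeddings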
  6670. if (model.pos_embd) {
  6671. // inp_pos - contains the positions
  6672. ggml_tensor * inp_pos = build_inp_pos();
  6673. pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos);
  6674. cb(pos, "pos_embd", -1);
  6675. inpL = ggml_add(ctx0, inpL, pos);
  6676. cb(inpL, "inpL", -1);
  6677. }
  6678. ggml_tensor * inp_out_ids = build_inp_out_ids();
  6679. for (int il = 0; il < n_layer; ++il) {
  6680. ggml_tensor * attn_norm;
  6681. attn_norm = build_norm(inpL,
  6682. model.layers[il].attn_norm,
  6683. model.layers[il].attn_norm_b,
  6684. LLM_NORM, il);
  6685. cb(attn_norm, "attn_norm", il);
  6686. // self-attention
  6687. {
  6688. cur = attn_norm;
  6689. cur = build_lora_mm(model.layers[il].wqkv, cur);
  6690. cb(cur, "wqkv", il);
6691. if (model.layers[il].bqkv) {
  6692. cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
  6693. cb(cur, "bqkv", il);
  6694. }
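// optionally clamp the fused QKV activations to [-f_clamp_kqv, f_clamp_kqv]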
  6695. if (hparams.f_clamp_kqv > 0.0f) {
  6696. cur = ggml_clamp(ctx0, cur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
  6697. cb(cur, "wqkv_clamped", il);
  6698. }
  6699. ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
  6700. ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
  6701. ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
  6702. // Q/K Layernorm
  6703. if (model.layers[il].attn_q_norm) {
  6704. Qcur = build_norm(Qcur,
  6705. model.layers[il].attn_q_norm,
  6706. model.layers[il].attn_q_norm_b,
  6707. LLM_NORM, il);
  6708. Kcur = build_norm(Kcur,
  6709. model.layers[il].attn_k_norm,
  6710. model.layers[il].attn_k_norm_b,
  6711. LLM_NORM, il);
  6712. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  6713. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  6714. }
  6715. cb(Qcur, "Qcur", il);
  6716. cb(Kcur, "Kcur", il);
  6717. cb(Vcur, "Vcur", il);
  6718. cur = build_attn(inp_attn,
  6719. model.layers[il].wo, model.layers[il].bo,
  6720. Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  6721. }
  6722. if (il == n_layer - 1 && inp_out_ids) {
  6723. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  6724. inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
  6725. }
  6726. // Add the input
  6727. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
  6728. cb(ffn_inp, "ffn_inp", il);
6729. // feed-forward network
  6730. {
  6731. cur = build_norm(ffn_inp,
  6732. model.layers[il].ffn_norm,
  6733. model.layers[il].ffn_norm_b,
  6734. LLM_NORM, il);
  6735. cb(cur, "ffn_norm", il);
  6736. cur = build_ffn(cur,
  6737. model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
  6738. NULL, NULL, NULL,
  6739. model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
  6740. model.layers[il].ffn_act,
  6741. LLM_FFN_GELU, LLM_FFN_SEQ, il);
  6742. cb(cur, "ffn_out", il);
  6743. }
  6744. cur = ggml_add(ctx0, cur, ffn_inp);
  6745. cur = build_cvec(cur, il);
  6746. cb(cur, "l_out", il);
  6747. // input for next layer
  6748. inpL = cur;
  6749. }
  6750. cur = inpL;
  6751. cur = build_norm(cur,
  6752. model.output_norm,
  6753. model.output_norm_b,
  6754. LLM_NORM, -1);
  6755. cb(cur, "result_norm", -1);
  6756. res->t_embd = cur;
  6757. cur = build_lora_mm(model.output, cur);
  6758. cb(cur, "result_output", -1);
  6759. res->t_logits = cur;
  6760. ggml_build_forward_expand(gf, cur);
  6761. }
  6762. };
  6763. struct llm_build_stablelm : public llm_graph_context {
  6764. llm_build_stablelm(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  6765. const int64_t n_embd_head = hparams.n_embd_head_v;
  6766. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  6767. ggml_tensor * cur;
  6768. ggml_tensor * inpL;
  6769. inpL = build_inp_embd(model.tok_embd);
  6770. // inp_pos - contains the positions
  6771. ggml_tensor * inp_pos = build_inp_pos();
  6772. auto * inp_attn = build_attn_inp_kv();
  6773. ggml_tensor * inp_out_ids = build_inp_out_ids();
  6774. for (int il = 0; il < n_layer; ++il) {
  6775. // norm
  6776. cur = build_norm(inpL,
  6777. model.layers[il].attn_norm,
  6778. model.layers[il].attn_norm_b,
  6779. LLM_NORM, il);
  6780. cb(cur, "attn_norm", il);
  6781. ggml_tensor * inpSA = cur;
  6782. // self-attention
  6783. {
  6784. // compute Q and K and RoPE them
  6785. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  6786. cb(Qcur, "Qcur", il);
  6787. if (model.layers[il].bq) {
  6788. Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
  6789. cb(Qcur, "Qcur", il);
  6790. }
  6791. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  6792. cb(Kcur, "Kcur", il);
  6793. if (model.layers[il].bk) {
  6794. Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
  6795. cb(Kcur, "Kcur", il);
  6796. }
  6797. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  6798. cb(Vcur, "Vcur", il);
  6799. if (model.layers[il].bv) {
  6800. Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
  6801. cb(Vcur, "Vcur", il);
  6802. }
  6803. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  6804. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  6805. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  6806. if (model.layers[il].attn_q_norm) {
  6807. Qcur = build_norm(Qcur,
  6808. model.layers[il].attn_q_norm,
  6809. NULL,
  6810. LLM_NORM, il);
  6811. cb(Qcur, "Qcur", il);
  6812. }
  6813. if (model.layers[il].attn_k_norm) {
  6814. Kcur = build_norm(Kcur,
  6815. model.layers[il].attn_k_norm,
  6816. NULL,
  6817. LLM_NORM, il);
  6818. cb(Kcur, "Kcur", il);
  6819. }
  6820. Qcur = ggml_rope_ext(
  6821. ctx0, Qcur, inp_pos, nullptr,
  6822. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  6823. ext_factor, attn_factor, beta_fast, beta_slow
  6824. );
  6825. Kcur = ggml_rope_ext(
  6826. ctx0, Kcur, inp_pos, nullptr,
  6827. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  6828. ext_factor, attn_factor, beta_fast, beta_slow
  6829. );
  6830. cb(Qcur, "Qcur", il);
  6831. cb(Kcur, "Kcur", il);
  6832. cb(Vcur, "Vcur", il);
  6833. cur = build_attn(inp_attn,
  6834. model.layers[il].wo, NULL,
  6835. Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  6836. }
  6837. if (il == n_layer - 1 && inp_out_ids) {
  6838. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  6839. inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
  6840. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  6841. }
  6842. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
  6843. cb(ffn_inp, "ffn_inp", il);
  6844. // feed-forward network
  6845. {
  6846. if (model.layers[il].ffn_norm) {
  6847. cur = build_norm(ffn_inp,
  6848. model.layers[il].ffn_norm,
  6849. model.layers[il].ffn_norm_b,
  6850. LLM_NORM, il);
  6851. cb(cur, "ffn_norm", il);
  6852. } else {
6853. // parallel residual: no ffn_norm, so the FFN reuses the attention-normed input
  6854. cur = inpSA;
  6855. }
  6856. cur = build_ffn(cur,
  6857. model.layers[il].ffn_up, NULL, NULL,
  6858. model.layers[il].ffn_gate, NULL, NULL,
  6859. model.layers[il].ffn_down, NULL, NULL,
  6860. NULL,
  6861. LLM_FFN_SILU, LLM_FFN_PAR, il);
  6862. cb(cur, "ffn_out", il);
  6863. }
  6864. cur = ggml_add(ctx0, cur, ffn_inp);
  6865. cur = build_cvec(cur, il);
  6866. cb(cur, "l_out", il);
  6867. // input for next layer
  6868. inpL = cur;
  6869. }
  6870. cur = inpL;
  6871. cur = build_norm(cur,
  6872. model.output_norm,
  6873. model.output_norm_b,
  6874. LLM_NORM, -1);
  6875. cb(cur, "result_norm", -1);
  6876. res->t_embd = cur;
  6877. // lm_head
  6878. cur = build_lora_mm(model.output, cur);
  6879. cb(cur, "result_output", -1);
  6880. res->t_logits = cur;
  6881. ggml_build_forward_expand(gf, cur);
  6882. }
  6883. };
  6884. struct llm_build_qwen : public llm_graph_context {
  6885. llm_build_qwen(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  6886. const int64_t n_embd_head = hparams.n_embd_head_v;
  6887. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  6888. ggml_tensor * cur;
  6889. ggml_tensor * inpL;
  6890. inpL = build_inp_embd(model.tok_embd);
  6891. // inp_pos - contains the positions
  6892. ggml_tensor * inp_pos = build_inp_pos();
  6893. auto * inp_attn = build_attn_inp_kv();
  6894. ggml_tensor * inp_out_ids = build_inp_out_ids();
  6895. for (int il = 0; il < n_layer; ++il) {
  6896. ggml_tensor * inpSA = inpL;
  6897. cur = build_norm(inpL,
  6898. model.layers[il].attn_norm, NULL,
  6899. LLM_NORM_RMS, il);
  6900. cb(cur, "attn_norm", il);
  6901. // self-attention
  6902. {
  6903. cur = build_lora_mm(model.layers[il].wqkv, cur);
  6904. cb(cur, "wqkv", il);
  6905. cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
  6906. cb(cur, "bqkv", il);
  6907. ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
  6908. ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
  6909. ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 2*sizeof(float)*(n_embd));
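// the views above split the fused QKV output at byte offsets 0, n_embd and 2*n_embd (Q, K and V each span n_embd here)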
6910. // NeoX-style RoPE (selected via rope_type)
  6911. Qcur = ggml_rope_ext(
  6912. ctx0, Qcur, inp_pos, nullptr,
  6913. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  6914. ext_factor, attn_factor, beta_fast, beta_slow
  6915. );
  6916. Kcur = ggml_rope_ext(
  6917. ctx0, Kcur, inp_pos, nullptr,
  6918. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  6919. ext_factor, attn_factor, beta_fast, beta_slow
  6920. );
  6921. cb(Qcur, "Qcur", il);
  6922. cb(Kcur, "Kcur", il);
  6923. cb(Vcur, "Vcur", il);
  6924. cur = build_attn(inp_attn,
  6925. model.layers[il].wo, NULL,
  6926. Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  6927. }
  6928. if (il == n_layer - 1 && inp_out_ids) {
  6929. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  6930. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  6931. }
  6932. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  6933. cb(ffn_inp, "ffn_inp", il);
6934. // feed-forward network
  6935. {
  6936. cur = build_norm(ffn_inp,
  6937. model.layers[il].ffn_norm, NULL,
  6938. LLM_NORM_RMS, il);
  6939. cb(cur, "ffn_norm", il);
  6940. cur = build_ffn(cur,
  6941. model.layers[il].ffn_up, NULL, NULL,
  6942. model.layers[il].ffn_gate, NULL, NULL,
  6943. model.layers[il].ffn_down, NULL, NULL,
  6944. NULL,
  6945. LLM_FFN_SILU, LLM_FFN_PAR, il);
  6946. cb(cur, "ffn_out", il);
  6947. }
  6948. cur = ggml_add(ctx0, cur, ffn_inp);
  6949. cur = build_cvec(cur, il);
  6950. cb(cur, "l_out", il);
  6951. // input for next layer
  6952. inpL = cur;
  6953. }
  6954. cur = inpL;
  6955. cur = build_norm(cur,
  6956. model.output_norm, NULL,
  6957. LLM_NORM_RMS, -1);
  6958. cb(cur, "result_norm", -1);
  6959. res->t_embd = cur;
  6960. // lm_head
  6961. cur = build_lora_mm(model.output, cur);
  6962. cb(cur, "result_output", -1);
  6963. res->t_logits = cur;
  6964. ggml_build_forward_expand(gf, cur);
  6965. }
  6966. };
  6967. struct llm_build_qwen2 : public llm_graph_context {
  6968. llm_build_qwen2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  6969. const int64_t n_embd_head = hparams.n_embd_head_v;
  6970. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  6971. GGML_ASSERT(n_embd_head == hparams.n_rot);
  6972. ggml_tensor * cur;
  6973. ggml_tensor * inpL;
  6974. inpL = build_inp_embd(model.tok_embd);
  6975. // inp_pos - contains the positions
  6976. ggml_tensor * inp_pos = build_inp_pos();
  6977. auto * inp_attn = build_attn_inp_kv();
  6978. ggml_tensor * inp_out_ids = build_inp_out_ids();
  6979. for (int il = 0; il < n_layer; ++il) {
  6980. ggml_tensor * inpSA = inpL;
  6981. // norm
  6982. cur = build_norm(inpL,
  6983. model.layers[il].attn_norm, NULL,
  6984. LLM_NORM_RMS, il);
  6985. cb(cur, "attn_norm", il);
  6986. // self-attention
  6987. {
  6988. // compute Q and K and RoPE them
  6989. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  6990. Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
  6991. cb(Qcur, "Qcur", il);
  6992. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  6993. Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
  6994. cb(Kcur, "Kcur", il);
  6995. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  6996. Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
  6997. cb(Vcur, "Vcur", il);
  6998. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  6999. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  7000. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  7001. Qcur = ggml_rope_ext(
  7002. ctx0, Qcur, inp_pos, nullptr,
  7003. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  7004. ext_factor, attn_factor, beta_fast, beta_slow
  7005. );
  7006. Kcur = ggml_rope_ext(
  7007. ctx0, Kcur, inp_pos, nullptr,
  7008. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  7009. ext_factor, attn_factor, beta_fast, beta_slow
  7010. );
  7011. cb(Qcur, "Qcur", il);
  7012. cb(Kcur, "Kcur", il);
  7013. cb(Vcur, "Vcur", il);
  7014. cur = build_attn(inp_attn,
  7015. model.layers[il].wo, model.layers[il].bo,
  7016. Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  7017. }
  7018. if (il == n_layer - 1 && inp_out_ids) {
  7019. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  7020. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  7021. }
  7022. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  7023. cb(ffn_inp, "ffn_inp", il);
  7024. // feed-forward network
  7025. cur = build_norm(ffn_inp,
  7026. model.layers[il].ffn_norm, NULL,
  7027. LLM_NORM_RMS, il);
  7028. cb(cur, "ffn_norm", il);
  7029. cur = build_ffn(cur,
  7030. model.layers[il].ffn_up, NULL, NULL,
  7031. model.layers[il].ffn_gate, NULL, NULL,
  7032. model.layers[il].ffn_down, NULL, NULL,
  7033. NULL,
  7034. LLM_FFN_SILU, LLM_FFN_PAR, il);
  7035. cb(cur, "ffn_out", il);
  7036. cur = ggml_add(ctx0, cur, ffn_inp);
  7037. cur = build_cvec(cur, il);
  7038. cb(cur, "l_out", il);
  7039. // input for next layer
  7040. inpL = cur;
  7041. }
  7042. cur = inpL;
  7043. cur = build_norm(cur,
  7044. model.output_norm, NULL,
  7045. LLM_NORM_RMS, -1);
  7046. cb(cur, "result_norm", -1);
  7047. res->t_embd = cur;
  7048. // lm_head
  7049. cur = build_lora_mm(model.output, cur);
  7050. if (model.output_b != nullptr) {
  7051. cur = ggml_add(ctx0, cur, model.output_b);
  7052. }
  7053. cb(cur, "result_output", -1);
  7054. res->t_logits = cur;
  7055. ggml_build_forward_expand(gf, cur);
  7056. }
  7057. };
  7058. struct llm_build_dream : public llm_graph_context {
  7059. llm_build_dream(const llama_model & model, const llm_graph_params & params) :
  7060. llm_graph_context(params) {
7061. // copied from qwen2
  7062. const int64_t n_embd_head = hparams.n_embd_head_v;
  7063. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  7064. GGML_ASSERT(n_embd_head == hparams.n_rot);
  7065. ggml_tensor * cur;
  7066. ggml_tensor * inpL;
  7067. inpL = build_inp_embd(model.tok_embd);
  7068. // inp_pos - contains the positions
  7069. ggml_tensor * inp_pos = build_inp_pos();
  7070. auto * inp_attn = build_attn_inp_no_cache();
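// like LLaDA below, Dream uses non-causal attention and no KV cache (diffusion-style decoding)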
  7071. ggml_tensor * inp_out_ids = build_inp_out_ids();
  7072. for (int il = 0; il < n_layer; ++il) {
  7073. ggml_tensor * inpSA = inpL;
  7074. // norm
  7075. cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
  7076. cb(cur, "attn_norm", il);
  7077. // self-attention
  7078. {
  7079. // compute Q and K and RoPE them
  7080. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  7081. Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
  7082. cb(Qcur, "Qcur", il);
  7083. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  7084. Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
  7085. cb(Kcur, "Kcur", il);
  7086. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  7087. Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
  7088. cb(Vcur, "Vcur", il);
  7089. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  7090. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  7091. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  7092. Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  7093. ext_factor, attn_factor, beta_fast, beta_slow);
  7094. Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  7095. ext_factor, attn_factor, beta_fast, beta_slow);
  7096. cb(Qcur, "Qcur", il);
  7097. cb(Kcur, "Kcur", il);
  7098. cb(Vcur, "Vcur", il);
  7099. cur = build_attn(inp_attn,
  7100. model.layers[il].wo, model.layers[il].bo,
  7101. Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
  7102. }
  7103. if (il == n_layer - 1 && inp_out_ids) {
  7104. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  7105. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  7106. }
  7107. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  7108. cb(ffn_inp, "ffn_inp", il);
  7109. // feed-forward network
  7110. cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il);
  7111. cb(cur, "ffn_norm", il);
  7112. cur = build_ffn(cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL,
  7113. model.layers[il].ffn_down, NULL, NULL, NULL, LLM_FFN_SILU, LLM_FFN_PAR, il);
  7114. cb(cur, "ffn_out", il);
  7115. cur = ggml_add(ctx0, cur, ffn_inp);
  7116. cur = build_cvec(cur, il);
  7117. cb(cur, "l_out", il);
  7118. // input for next layer
  7119. inpL = cur;
  7120. }
  7121. cur = inpL;
  7122. cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
  7123. cb(cur, "result_norm", -1);
  7124. res->t_embd = cur;
  7125. // lm_head
  7126. cur = build_lora_mm(model.output, cur);
  7127. cb(cur, "result_output", -1);
  7128. res->t_logits = cur;
  7129. ggml_build_forward_expand(gf, cur);
  7130. }
  7131. };
  7132. struct llm_build_llada : public llm_graph_context {
  7133. llm_build_llada(const llama_model & model, const llm_graph_params & params) :
  7134. llm_graph_context(params) {
  7135. // LLaDA is similar to LLaMA but uses non-causal attention for diffusion
  7136. const int64_t n_embd_head = hparams.n_embd_head_v;
  7137. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  7138. GGML_ASSERT(n_embd_head == hparams.n_rot);
  7139. ggml_tensor * cur;
  7140. ggml_tensor * inpL;
  7141. inpL = build_inp_embd(model.tok_embd);
  7142. // inp_pos - contains the positions
  7143. ggml_tensor * inp_pos = build_inp_pos();
  7144. // Non-causal attention for diffusion
  7145. auto * inp_attn = build_attn_inp_no_cache();
  7146. ggml_tensor * inp_out_ids = build_inp_out_ids();
  7147. for (int il = 0; il < n_layer; ++il) {
  7148. ggml_tensor * inpSA = inpL;
  7149. // norm
  7150. cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
  7151. cb(cur, "attn_norm", il);
  7152. // self-attention
  7153. {
  7154. // compute separate Q, K, V projections without bias, matching LLaDALlamaBlock
  7155. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  7156. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  7157. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  7158. cb(Qcur, "Qcur", il);
  7159. cb(Kcur, "Kcur", il);
  7160. cb(Vcur, "Vcur", il);
  7161. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  7162. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  7163. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  7164. Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  7165. ext_factor, attn_factor, beta_fast, beta_slow);
  7166. Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  7167. ext_factor, attn_factor, beta_fast, beta_slow);
  7168. cb(Qcur, "Qcur", il);
  7169. cb(Kcur, "Kcur", il);
  7170. cb(Vcur, "Vcur", il);
  7171. cur = build_attn(inp_attn,
  7172. model.layers[il].wo, NULL,
  7173. Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
  7174. }
  7175. if (il == n_layer - 1 && inp_out_ids) {
  7176. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  7177. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  7178. }
  7179. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  7180. cb(ffn_inp, "ffn_inp", il);
  7181. // feed-forward network
  7182. cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il);
  7183. cb(cur, "ffn_norm", il);
  7184. cur = build_ffn(cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL,
  7185. model.layers[il].ffn_down, NULL, NULL, NULL, LLM_FFN_SILU, LLM_FFN_PAR, il);
  7186. cb(cur, "ffn_out", il);
  7187. cur = ggml_add(ctx0, cur, ffn_inp);
  7188. cur = build_cvec(cur, il);
  7189. cb(cur, "l_out", il);
  7190. // input for next layer
  7191. inpL = cur;
  7192. }
  7193. cur = inpL;
  7194. cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
  7195. cb(cur, "result_norm", -1);
  7196. res->t_embd = cur;
  7197. // lm_head
  7198. cur = build_lora_mm(model.output, cur);
  7199. cb(cur, "result_output", -1);
  7200. res->t_logits = cur;
  7201. ggml_build_forward_expand(gf, cur);
  7202. }
  7203. };
  7204. struct llm_build_qwen2vl : public llm_graph_context {
  7205. llm_build_qwen2vl(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  7206. const int64_t n_embd_head = hparams.n_embd_head_v;
  7207. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  7208. GGML_ASSERT(n_embd_head == hparams.n_rot);
  7209. ggml_tensor * cur;
  7210. ggml_tensor * inpL;
  7211. inpL = build_inp_embd(model.tok_embd);
  7212. // inp_pos - contains the positions
  7213. ggml_tensor * inp_pos = build_inp_pos();
  7214. auto * inp_attn = build_attn_inp_kv();
  7215. int sections[4];
  7216. std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections);
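// multi-section RoPE (M-RoPE): rope_sections assigns the rotary dimensions to the different positional components (e.g. temporal / height / width for vision inputs)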
  7217. ggml_tensor * inp_out_ids = build_inp_out_ids();
  7218. for (int il = 0; il < n_layer; ++il) {
  7219. ggml_tensor * inpSA = inpL;
  7220. // norm
  7221. cur = build_norm(inpL,
  7222. model.layers[il].attn_norm, NULL,
  7223. LLM_NORM_RMS, il);
  7224. cb(cur, "attn_norm", il);
  7225. // self-attention
  7226. {
  7227. // compute Q and K and RoPE them
  7228. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  7229. Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
  7230. cb(Qcur, "Qcur", il);
  7231. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  7232. Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
  7233. cb(Kcur, "Kcur", il);
  7234. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  7235. Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
  7236. cb(Vcur, "Vcur", il);
  7237. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  7238. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  7239. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  7240. Qcur = ggml_rope_multi(
  7241. ctx0, Qcur, inp_pos, nullptr,
  7242. n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
  7243. ext_factor, attn_factor, beta_fast, beta_slow
  7244. );
  7245. Kcur = ggml_rope_multi(
  7246. ctx0, Kcur, inp_pos, nullptr,
  7247. n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
  7248. ext_factor, attn_factor, beta_fast, beta_slow
  7249. );
  7250. cb(Qcur, "Qcur", il);
  7251. cb(Kcur, "Kcur", il);
  7252. cb(Vcur, "Vcur", il);
  7253. cur = build_attn(inp_attn,
  7254. model.layers[il].wo, model.layers[il].bo,
  7255. Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  7256. }
  7257. if (il == n_layer - 1 && inp_out_ids) {
  7258. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  7259. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  7260. }
  7261. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  7262. cb(ffn_inp, "ffn_inp", il);
  7263. // feed-forward network
  7264. cur = build_norm(ffn_inp,
  7265. model.layers[il].ffn_norm, NULL,
  7266. LLM_NORM_RMS, il);
  7267. cb(cur, "ffn_norm", il);
  7268. cur = build_ffn(cur,
  7269. model.layers[il].ffn_up, NULL, NULL,
  7270. model.layers[il].ffn_gate, NULL, NULL,
  7271. model.layers[il].ffn_down, NULL, NULL,
  7272. NULL,
  7273. LLM_FFN_SILU, LLM_FFN_PAR, il);
  7274. cb(cur, "ffn_out", il);
  7275. cur = ggml_add(ctx0, cur, ffn_inp);
  7276. cur = build_cvec(cur, il);
  7277. cb(cur, "l_out", il);
  7278. // input for next layer
  7279. inpL = cur;
  7280. }
  7281. cur = inpL;
  7282. cur = build_norm(cur,
  7283. model.output_norm, NULL,
  7284. LLM_NORM_RMS, -1);
  7285. cb(cur, "result_norm", -1);
  7286. res->t_embd = cur;
  7287. // lm_head
  7288. cur = build_lora_mm(model.output, cur);
  7289. cb(cur, "result_output", -1);
  7290. res->t_logits = cur;
  7291. ggml_build_forward_expand(gf, cur);
  7292. }
  7293. };
  7294. struct llm_build_qwen2moe : public llm_graph_context {
  7295. llm_build_qwen2moe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  7296. const int64_t n_embd_head = hparams.n_embd_head_v;
  7297. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  7298. GGML_ASSERT(n_embd_head == hparams.n_rot);
  7299. ggml_tensor * cur;
  7300. ggml_tensor * inpL;
  7301. inpL = build_inp_embd(model.tok_embd);
  7302. // inp_pos - contains the positions
  7303. ggml_tensor * inp_pos = build_inp_pos();
  7304. auto * inp_attn = build_attn_inp_kv();
  7305. ggml_tensor * inp_out_ids = build_inp_out_ids();
  7306. for (int il = 0; il < n_layer; ++il) {
  7307. ggml_tensor * inpSA = inpL;
  7308. // norm
  7309. cur = build_norm(inpL,
  7310. model.layers[il].attn_norm, NULL,
  7311. LLM_NORM_RMS, il);
  7312. cb(cur, "attn_norm", il);
7313. // self-attention
  7314. {
  7315. // compute Q and K and RoPE them
  7316. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  7317. cb(Qcur, "Qcur", il);
  7318. if (model.layers[il].bq) {
  7319. Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
  7320. cb(Qcur, "Qcur", il);
  7321. }
  7322. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  7323. cb(Kcur, "Kcur", il);
  7324. if (model.layers[il].bk) {
  7325. Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
  7326. cb(Kcur, "Kcur", il);
  7327. }
  7328. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  7329. cb(Vcur, "Vcur", il);
  7330. if (model.layers[il].bv) {
  7331. Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
  7332. cb(Vcur, "Vcur", il);
  7333. }
  7334. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  7335. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  7336. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  7337. Qcur = ggml_rope_ext(
  7338. ctx0, Qcur, inp_pos, nullptr,
  7339. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  7340. ext_factor, attn_factor, beta_fast, beta_slow
  7341. );
  7342. Kcur = ggml_rope_ext(
  7343. ctx0, Kcur, inp_pos, nullptr,
  7344. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  7345. ext_factor, attn_factor, beta_fast, beta_slow
  7346. );
  7347. cb(Qcur, "Qcur", il);
  7348. cb(Kcur, "Kcur", il);
  7349. cb(Vcur, "Vcur", il);
  7350. cur = build_attn(inp_attn,
  7351. model.layers[il].wo, model.layers[il].bo,
  7352. Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  7353. }
  7354. if (il == n_layer - 1 && inp_out_ids) {
  7355. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  7356. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  7357. }
  7358. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  7359. cb(ffn_inp, "ffn_inp", il);
  7360. // MoE branch
  7361. cur = build_norm(ffn_inp,
  7362. model.layers[il].ffn_norm, NULL,
  7363. LLM_NORM_RMS, il);
  7364. cb(cur, "ffn_norm", il);
  7365. ggml_tensor * moe_out =
  7366. build_moe_ffn(cur,
  7367. model.layers[il].ffn_gate_inp,
  7368. model.layers[il].ffn_up_exps,
  7369. model.layers[il].ffn_gate_exps,
  7370. model.layers[il].ffn_down_exps,
  7371. nullptr,
  7372. n_expert, n_expert_used,
  7373. LLM_FFN_SILU, false,
  7374. false, 0.0,
  7375. LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
  7376. il);
  7377. cb(moe_out, "ffn_moe_out", il);
  7378. // FFN shared expert
  7379. {
  7380. ggml_tensor * cur_gate_inp = build_lora_mm(model.layers[il].ffn_gate_inp_shexp, cur);
  7381. cb(cur_gate_inp, "ffn_shexp_gate_inp", il);
7382. // sigmoid(x) = silu(x) / x
  7383. ggml_tensor * cur_gate = ggml_div(ctx0, ggml_silu(ctx0, cur_gate_inp), cur_gate_inp);
  7384. cb(cur_gate, "ffn_shexp_gate", il);
  7385. ggml_tensor * cur_ffn = build_ffn(cur,
  7386. model.layers[il].ffn_up_shexp, NULL, NULL,
  7387. model.layers[il].ffn_gate_shexp, NULL, NULL,
  7388. model.layers[il].ffn_down_shexp, NULL, NULL,
  7389. NULL,
  7390. LLM_FFN_SILU, LLM_FFN_PAR, il);
  7391. cb(cur_ffn, "ffn_shexp", il);
  7392. ggml_tensor * ffn_shexp_out = ggml_mul(ctx0, cur_ffn, cur_gate);
  7393. cb(ffn_shexp_out, "ffn_shexp_out", il);
  7394. moe_out = ggml_add(ctx0, moe_out, ffn_shexp_out);
  7395. cb(moe_out, "ffn_out", il);
  7396. cur = moe_out;
  7397. }
  7398. cur = ggml_add(ctx0, cur, ffn_inp);
  7399. cur = build_cvec(cur, il);
  7400. cb(cur, "l_out", il);
  7401. // input for next layer
  7402. inpL = cur;
  7403. }
  7404. cur = inpL;
  7405. cur = build_norm(cur,
  7406. model.output_norm, NULL,
  7407. LLM_NORM_RMS, -1);
  7408. cb(cur, "result_norm", -1);
  7409. res->t_embd = cur;
  7410. // lm_head
  7411. cur = build_lora_mm(model.output, cur);
  7412. cb(cur, "result_output", -1);
  7413. res->t_logits = cur;
  7414. ggml_build_forward_expand(gf, cur);
  7415. }
  7416. };
  7417. struct llm_build_qwen3 : public llm_graph_context {
  7418. llm_build_qwen3(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  7419. const int64_t n_embd_head = hparams.n_embd_head_v;
  7420. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  7421. GGML_ASSERT(n_embd_head == hparams.n_rot);
  7422. ggml_tensor * cur;
  7423. ggml_tensor * inpL;
  7424. inpL = build_inp_embd(model.tok_embd);
  7425. // inp_pos - contains the positions
  7426. ggml_tensor * inp_pos = build_inp_pos();
  7427. auto * inp_attn = build_attn_inp_kv();
  7428. ggml_tensor * inp_out_ids = build_inp_out_ids();
  7429. for (int il = 0; il < n_layer; ++il) {
  7430. ggml_tensor * inpSA = inpL;
  7431. // norm
  7432. cur = build_norm(inpL,
  7433. model.layers[il].attn_norm, NULL,
  7434. LLM_NORM_RMS, il);
  7435. cb(cur, "attn_norm", il);
  7436. // self-attention
  7437. {
  7438. // compute Q and K and RoPE them
  7439. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  7440. cb(Qcur, "Qcur", il);
  7441. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  7442. cb(Kcur, "Kcur", il);
  7443. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  7444. cb(Vcur, "Vcur", il);
  7445. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  7446. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  7447. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
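// Qwen3 applies RMS norm to the Q and K heads (QK-norm) before RoPE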
  7448. Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
  7449. cb(Qcur, "Qcur_normed", il);
  7450. Qcur = ggml_rope_ext(
  7451. ctx0, Qcur, inp_pos, nullptr,
  7452. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  7453. ext_factor, attn_factor, beta_fast, beta_slow
  7454. );
  7455. Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
  7456. cb(Kcur, "Kcur_normed", il);
  7457. Kcur = ggml_rope_ext(
  7458. ctx0, Kcur, inp_pos, nullptr,
  7459. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  7460. ext_factor, attn_factor, beta_fast, beta_slow
  7461. );
  7462. cb(Qcur, "Qcur", il);
  7463. cb(Kcur, "Kcur", il);
  7464. cb(Vcur, "Vcur", il);
  7465. cur = build_attn(inp_attn,
  7466. model.layers[il].wo, model.layers[il].bo,
  7467. Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  7468. }
  7469. if (il == n_layer - 1 && inp_out_ids) {
  7470. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  7471. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  7472. }
  7473. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  7474. cb(ffn_inp, "ffn_inp", il);
  7475. // feed-forward network
  7476. cur = build_norm(ffn_inp,
  7477. model.layers[il].ffn_norm, NULL,
  7478. LLM_NORM_RMS, il);
  7479. cb(cur, "ffn_norm", il);
  7480. cur = build_ffn(cur,
  7481. model.layers[il].ffn_up, NULL, NULL,
  7482. model.layers[il].ffn_gate, NULL, NULL,
  7483. model.layers[il].ffn_down, NULL, NULL,
  7484. NULL,
  7485. LLM_FFN_SILU, LLM_FFN_PAR, il);
  7486. cb(cur, "ffn_out", il);
  7487. cur = ggml_add(ctx0, cur, ffn_inp);
  7488. cur = build_cvec(cur, il);
  7489. cb(cur, "l_out", il);
  7490. // input for next layer
  7491. inpL = cur;
  7492. }
  7493. cur = inpL;
  7494. cur = build_norm(cur,
  7495. model.output_norm, NULL,
  7496. LLM_NORM_RMS, -1);
  7497. cb(cur, "result_norm", -1);
  7498. res->t_embd = cur;
  7499. // lm_head
  7500. cur = build_lora_mm(model.output, cur);
  7501. cb(cur, "result_output", -1);
  7502. res->t_logits = cur;
  7503. ggml_build_forward_expand(gf, cur);
  7504. }
  7505. };
  7506. struct llm_build_qwen3moe : public llm_graph_context {
  7507. llm_build_qwen3moe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  7508. const int64_t n_embd_head = hparams.n_embd_head_v;
  7509. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  7510. GGML_ASSERT(n_embd_head == hparams.n_rot);
  7511. ggml_tensor * cur;
  7512. ggml_tensor * inpL;
  7513. inpL = build_inp_embd(model.tok_embd);
  7514. // inp_pos - contains the positions
  7515. ggml_tensor * inp_pos = build_inp_pos();
  7516. auto * inp_attn = build_attn_inp_kv();
  7517. ggml_tensor * inp_out_ids = build_inp_out_ids();
  7518. for (int il = 0; il < n_layer; ++il) {
  7519. ggml_tensor * inpSA = inpL;
  7520. // norm
  7521. cur = build_norm(inpL,
  7522. model.layers[il].attn_norm, NULL,
  7523. LLM_NORM_RMS, il);
  7524. cb(cur, "attn_norm", il);
7525. // self-attention
  7526. {
  7527. // compute Q and K and RoPE them
  7528. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  7529. cb(Qcur, "Qcur", il);
  7530. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  7531. cb(Kcur, "Kcur", il);
  7532. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  7533. cb(Vcur, "Vcur", il);
  7534. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  7535. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  7536. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  7537. Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
  7538. cb(Qcur, "Qcur_normed", il);
  7539. Qcur = ggml_rope_ext(
  7540. ctx0, Qcur, inp_pos, nullptr,
  7541. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  7542. ext_factor, attn_factor, beta_fast, beta_slow
  7543. );
  7544. Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
  7545. cb(Kcur, "Kcur_normed", il);
  7546. Kcur = ggml_rope_ext(
  7547. ctx0, Kcur, inp_pos, nullptr,
  7548. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  7549. ext_factor, attn_factor, beta_fast, beta_slow
  7550. );
  7551. cb(Qcur, "Qcur", il);
  7552. cb(Kcur, "Kcur", il);
  7553. cb(Vcur, "Vcur", il);
  7554. cur = build_attn(inp_attn,
  7555. model.layers[il].wo, model.layers[il].bo,
  7556. Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  7557. }
  7558. if (il == n_layer - 1 && inp_out_ids) {
  7559. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  7560. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  7561. }
  7562. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  7563. cb(ffn_inp, "ffn_inp", il);
  7564. // MoE branch
  7565. cur = build_norm(ffn_inp,
  7566. model.layers[il].ffn_norm, NULL,
  7567. LLM_NORM_RMS, il);
  7568. cb(cur, "ffn_norm", il);
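// MoE FFN over the routed experts; unlike qwen2moe there is no shared-expert branch here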
  7569. ggml_tensor * moe_out =
  7570. build_moe_ffn(cur,
  7571. model.layers[il].ffn_gate_inp,
  7572. model.layers[il].ffn_up_exps,
  7573. model.layers[il].ffn_gate_exps,
  7574. model.layers[il].ffn_down_exps,
  7575. nullptr,
  7576. n_expert, n_expert_used,
  7577. LLM_FFN_SILU, true,
  7578. false, 0.0,
  7579. LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
  7580. il);
  7581. cb(moe_out, "ffn_moe_out", il);
  7582. cur = moe_out;
  7583. cur = ggml_add(ctx0, cur, ffn_inp);
  7584. cur = build_cvec(cur, il);
  7585. cb(cur, "l_out", il);
  7586. // input for next layer
  7587. inpL = cur;
  7588. }
  7589. cur = inpL;
  7590. cur = build_norm(cur,
  7591. model.output_norm, NULL,
  7592. LLM_NORM_RMS, -1);
  7593. cb(cur, "result_norm", -1);
  7594. res->t_embd = cur;
  7595. // lm_head
  7596. cur = build_lora_mm(model.output, cur);
  7597. cb(cur, "result_output", -1);
  7598. res->t_logits = cur;
  7599. ggml_build_forward_expand(gf, cur);
  7600. }
  7601. };
  7602. struct llm_build_phi2 : public llm_graph_context {
  7603. llm_build_phi2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  7604. const int64_t n_embd_head = hparams.n_embd_head_v;
  7605. const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
  7606. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  7607. ggml_tensor * cur;
  7608. ggml_tensor * attn_norm_output;
  7609. ggml_tensor * ffn_output;
  7610. ggml_tensor * inpL;
  7611. inpL = build_inp_embd(model.tok_embd);
  7612. // inp_pos - contains the positions
  7613. ggml_tensor * inp_pos = build_inp_pos();
  7614. auto * inp_attn = build_attn_inp_kv();
  7615. ggml_tensor * inp_out_ids = build_inp_out_ids();
  7616. for (int il = 0; il < n_layer; ++il) {
  7617. attn_norm_output = build_norm(inpL,
  7618. model.layers[il].attn_norm,
  7619. model.layers[il].attn_norm_b,
  7620. LLM_NORM, il);
  7621. cb(attn_norm_output, "attn_norm", il);
  7622. // self-attention
  7623. {
  7624. ggml_tensor * Qcur = nullptr;
  7625. ggml_tensor * Kcur = nullptr;
  7626. ggml_tensor * Vcur = nullptr;
  7627. if (model.layers[il].wqkv) {
  7628. cur = build_lora_mm(model.layers[il].wqkv, attn_norm_output);
  7629. cb(cur, "wqkv", il);
  7630. cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
  7631. cb(cur, "bqkv", il);
  7632. Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
  7633. Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
  7634. Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
  7635. } else {
  7636. Qcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wq, attn_norm_output), model.layers[il].bq);
  7637. Kcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wk, attn_norm_output), model.layers[il].bk);
  7638. Vcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wv, attn_norm_output), model.layers[il].bv);
  7639. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  7640. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  7641. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  7642. }
  7643. Qcur = ggml_rope_ext(
  7644. ctx0, Qcur, inp_pos, nullptr,
  7645. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  7646. ext_factor, attn_factor, beta_fast, beta_slow
  7647. );
  7648. Kcur = ggml_rope_ext(
  7649. ctx0, Kcur, inp_pos, nullptr,
  7650. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  7651. ext_factor, attn_factor, beta_fast, beta_slow
  7652. );
  7653. cb(Qcur, "Qcur", il);
  7654. cb(Kcur, "Kcur", il);
  7655. cb(Vcur, "Vcur", il);
  7656. // with phi2, we scale the Q to avoid precision issues
  7657. // ref: https://github.com/ml-explore/mlx-examples/blob/08e862336ade809bc37d1035f94b359e7d1a5152/phi2/phi2.py#L64-L66
  7658. Qcur = ggml_scale(ctx0, Qcur, 1.0f/sqrtf(float(n_embd_head)));
  7659. cur = build_attn(inp_attn,
  7660. model.layers[il].wo, model.layers[il].bo,
  7661. Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f, il);
  7662. }
  7663. if (il == n_layer - 1 && inp_out_ids) {
  7664. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  7665. inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
  7666. attn_norm_output = ggml_get_rows(ctx0, attn_norm_output, inp_out_ids);
  7667. }
  7668. // FF
  7669. {
  7670. ffn_output = build_ffn(attn_norm_output,
  7671. model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
  7672. NULL, NULL, NULL,
  7673. model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
  7674. NULL,
  7675. LLM_FFN_GELU, LLM_FFN_SEQ, il);
  7676. cb(ffn_output, "ffn_out", il);
  7677. }
  7678. cur = ggml_add(ctx0, cur, ffn_output);
  7679. cur = ggml_add(ctx0, cur, inpL);
  7680. cur = build_cvec(cur, il);
  7681. cb(cur, "l_out", il);
  7682. // input for next layer
  7683. inpL = cur;
  7684. }
  7685. cur = build_norm(inpL,
  7686. model.output_norm,
  7687. model.output_norm_b,
  7688. LLM_NORM, -1);
  7689. cb(cur, "result_norm", -1);
  7690. res->t_embd = cur;
  7691. cur = build_lora_mm(model.output, cur);
  7692. cb(cur, "result_output_no_bias", -1);
  7693. cur = ggml_add(ctx0, cur, model.output_b);
  7694. cb(cur, "result_output", -1);
  7695. res->t_logits = cur;
  7696. ggml_build_forward_expand(gf, cur);
  7697. }
  7698. };
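// phi3 is templated on iswa: when true, the graph uses the sliding-window attention (SWA) KV-cache inputs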
  7699. template<bool iswa>
  7700. struct llm_build_phi3 : public llm_graph_context {
  7701. llm_build_phi3(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  7702. const int64_t n_embd_head = hparams.n_embd_head_v;
  7703. const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
  7704. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  7705. ggml_tensor * cur;
  7706. ggml_tensor * inpL;
  7707. inpL = build_inp_embd(model.tok_embd);
  7708. // inp_pos - contains the positions
  7709. ggml_tensor * inp_pos = build_inp_pos();
  7710. using inp_attn_type = std::conditional_t<iswa, llm_graph_input_attn_kv_iswa, llm_graph_input_attn_kv>;
  7711. inp_attn_type * inp_attn = nullptr;
  7712. if constexpr (iswa) {
  7713. inp_attn = build_attn_inp_kv_iswa();
  7714. } else {
  7715. inp_attn = build_attn_inp_kv();
  7716. }
  7717. ggml_tensor * inp_out_ids = build_inp_out_ids();
  7718. for (int il = 0; il < n_layer; ++il) {
  7719. auto * residual = inpL;
  7720. // self-attention
  7721. {
  7722. // rope freq factors for 128k context
  7723. ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
7724. ggml_tensor * attn_norm_output = build_norm(inpL,
  7725. model.layers[il].attn_norm,
  7726. model.layers[il].attn_norm_b,
  7727. LLM_NORM_RMS, il);
  7728. cb(attn_norm_output, "attn_norm", il);
  7729. ggml_tensor * Qcur = nullptr;
  7730. ggml_tensor * Kcur = nullptr;
  7731. ggml_tensor * Vcur = nullptr;
  7732. if (model.layers[il].wqkv) {
  7733. cur = build_lora_mm(model.layers[il].wqkv, attn_norm_output);
  7734. cb(cur, "wqkv", il);
  7735. Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head * sizeof(float), cur->nb[1], 0 * sizeof(float) * (n_embd));
  7736. Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head * sizeof(float), cur->nb[1], 1 * sizeof(float) * (n_embd));
  7737. Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head * sizeof(float), cur->nb[1], 1 * sizeof(float) * (n_embd + n_embd_gqa));
  7738. } else {
  7739. Qcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wq, attn_norm_output), model.layers[il].bq);
  7740. Kcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wk, attn_norm_output), model.layers[il].bk);
  7741. Vcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wv, attn_norm_output), model.layers[il].bv);
  7742. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  7743. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  7744. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  7745. }
  7746. Qcur = ggml_rope_ext(
  7747. ctx0, Qcur, inp_pos, rope_factors,
  7748. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  7749. ext_factor, attn_factor, beta_fast, beta_slow
  7750. );
  7751. Kcur = ggml_rope_ext(
  7752. ctx0, Kcur, inp_pos, rope_factors,
  7753. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  7754. ext_factor, attn_factor, beta_fast, beta_slow
  7755. );
  7756. cb(Qcur, "Qcur", il);
  7757. cb(Kcur, "Kcur", il);
  7758. cb(Vcur, "Vcur", il);
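// pre-scale Q (as in phi2) so the attention call below uses a scale of 1.0f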
  7759. Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head)));
  7760. cb(Qcur, "Qcur", il);
  7761. cur = build_attn(inp_attn,
  7762. model.layers[il].wo, model.layers[il].bo,
  7763. Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f, il);
  7764. }
  7765. if (il == n_layer - 1 && inp_out_ids) {
  7766. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  7767. residual = ggml_get_rows(ctx0, residual, inp_out_ids);
  7768. }
  7769. cur = ggml_add(ctx0, cur, residual);
  7770. residual = cur;
  7771. cur = build_norm(cur,
  7772. model.layers[il].ffn_norm, model.layers[il].ffn_norm_b,
  7773. LLM_NORM_RMS, il);
  7774. cb(cur, "ffn_norm", il);
  7775. // feed-forward network
  7776. if (model.layers[il].ffn_gate_inp == nullptr) {
  7777. cur = build_ffn(cur,
  7778. model.layers[il].ffn_up, NULL, NULL,
  7779. NULL, NULL, NULL,
  7780. model.layers[il].ffn_down, NULL, NULL,
  7781. NULL,
  7782. LLM_FFN_SWIGLU, LLM_FFN_SEQ, il);
  7783. cb(cur, "ffn_out", il);
  7784. } else {
  7785. // MoE branch
  7786. cur = build_moe_ffn(cur,
  7787. model.layers[il].ffn_gate_inp,
  7788. model.layers[il].ffn_up_exps,
  7789. model.layers[il].ffn_gate_exps,
  7790. model.layers[il].ffn_down_exps,
  7791. nullptr,
  7792. n_expert, n_expert_used,
  7793. LLM_FFN_SILU, true,
  7794. false, 0.0,
  7795. LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
  7796. il);
  7797. cb(cur, "ffn_moe_out", il);
  7798. }
  7799. cur = ggml_add(ctx0, residual, cur);
  7800. cur = build_cvec(cur, il);
  7801. cb(cur, "l_out", il);
  7802. // input for next layer
  7803. inpL = cur;
  7804. }
  7805. cur = build_norm(inpL,
  7806. model.output_norm,
  7807. model.output_norm_b,
  7808. LLM_NORM_RMS, -1);
  7809. cb(cur, "result_norm", -1);
  7810. res->t_embd = cur;
  7811. cur = build_lora_mm(model.output, cur);
  7812. if (model.output_b != nullptr) {
  7813. cb(cur, "result_output_no_bias", -1);
  7814. cur = ggml_add(ctx0, cur, model.output_b);
  7815. }
  7816. cb(cur, "result_output", -1);
  7817. res->t_logits = cur;
  7818. ggml_build_forward_expand(gf, cur);
  7819. }
  7820. };
  7821. struct llm_build_plamo : public llm_graph_context {
  7822. llm_build_plamo(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  7823. const int64_t n_embd_head = hparams.n_embd_head_v;
  7824. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  7825. GGML_ASSERT(n_embd_head == hparams.n_rot);
  7826. ggml_tensor * cur;
  7827. ggml_tensor * inpL;
  7828. inpL = build_inp_embd(model.tok_embd);
  7829. // inp_pos - contains the positions
  7830. ggml_tensor * inp_pos = build_inp_pos();
  7831. auto * inp_attn = build_attn_inp_kv();
  7832. ggml_tensor * inp_out_ids = build_inp_out_ids();
  7833. for (int il = 0; il < n_layer; ++il) {
  7834. // norm
  7835. cur = build_norm(inpL,
  7836. model.layers[il].attn_norm, NULL,
  7837. LLM_NORM_RMS, il);
  7838. cb(cur, "attn_norm", il);
  7839. ggml_tensor * sa_inp = cur;
  7840. // self-attention
  7841. {
  7842. // compute Q and K and RoPE them
  7843. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  7844. cb(Qcur, "Qcur", il);
  7845. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  7846. cb(Kcur, "Kcur", il);
  7847. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  7848. cb(Vcur, "Vcur", il);
  7849. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  7850. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  7851. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  7852. Qcur = ggml_rope_ext(
  7853. ctx0, Qcur, inp_pos, nullptr,
  7854. n_embd_head, rope_type, n_ctx_orig, freq_base, freq_scale,
  7855. ext_factor, attn_factor, beta_fast, beta_slow
  7856. );
  7857. Kcur = ggml_rope_ext(
  7858. ctx0, Kcur, inp_pos, nullptr,
  7859. n_embd_head, rope_type, n_ctx_orig, freq_base, freq_scale,
  7860. ext_factor, attn_factor, beta_fast, beta_slow
  7861. );
  7862. cb(Qcur, "Qcur", il);
  7863. cb(Kcur, "Kcur", il);
  7864. cb(Vcur, "Vcur", il);
  7865. cur = build_attn(inp_attn,
  7866. model.layers[il].wo, NULL,
  7867. Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  7868. }
  7869. if (il == n_layer - 1 && inp_out_ids) {
  7870. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  7871. sa_inp = ggml_get_rows(ctx0, sa_inp, inp_out_ids);
  7872. inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
  7873. }
  7874. ggml_tensor * sa_out = cur;
  7875. cur = sa_inp;
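// the FFN reads the same normed input as attention; its output, the attention output and the layer input are summed below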
  7876. // feed-forward network
  7877. {
  7878. cur = build_ffn(cur,
  7879. model.layers[il].ffn_up, NULL, NULL,
  7880. model.layers[il].ffn_gate, NULL, NULL,
  7881. model.layers[il].ffn_down, NULL, NULL,
  7882. NULL,
  7883. LLM_FFN_SILU, LLM_FFN_PAR, il);
  7884. cb(cur, "ffn_out", il);
  7885. }
  7886. cur = ggml_add(ctx0, cur, sa_out);
  7887. cur = ggml_add(ctx0, cur, inpL);
  7888. cur = build_cvec(cur, il);
  7889. cb(cur, "l_out", il);
  7890. // input for next layer
  7891. inpL = cur;
  7892. }
  7893. cur = inpL;
  7894. cur = build_norm(cur,
  7895. model.output_norm, NULL,
  7896. LLM_NORM_RMS, -1);
  7897. cb(cur, "result_norm", -1);
  7898. res->t_embd = cur;
  7899. // lm_head
  7900. cur = build_lora_mm(model.output, cur);
  7901. cb(cur, "result_output", -1);
  7902. res->t_logits = cur;
  7903. ggml_build_forward_expand(gf, cur);
  7904. }
  7905. };
  7906. struct llm_build_gpt2 : public llm_graph_context {
  7907. llm_build_gpt2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  7908. const int64_t n_embd_head = hparams.n_embd_head_v;
  7909. const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
  7910. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  7911. ggml_tensor * cur;
  7912. ggml_tensor * pos;
  7913. ggml_tensor * inpL;
  7914. inpL = build_inp_embd(model.tok_embd);
  7915. // inp_pos - contains the positions
  7916. ggml_tensor * inp_pos = build_inp_pos();
  7917. auto * inp_attn = build_attn_inp_kv();
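// GPT-2 adds learned absolute position embeddings to the token embeddings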
  7918. pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos);
  7919. cb(pos, "pos_embd", -1);
  7920. inpL = ggml_add(ctx0, inpL, pos);
  7921. cb(inpL, "inpL", -1);
  7922. ggml_tensor * inp_out_ids = build_inp_out_ids();
  7923. for (int il = 0; il < n_layer; ++il) {
  7924. cur = build_norm(inpL,
  7925. model.layers[il].attn_norm,
  7926. model.layers[il].attn_norm_b,
  7927. LLM_NORM, il);
  7928. cb(cur, "attn_norm", il);
  7929. // self-attention
  7930. {
  7931. cur = build_lora_mm(model.layers[il].wqkv, cur);
  7932. cb(cur, "wqkv", il);
  7933. cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
  7934. cb(cur, "bqkv", il);
  7935. ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
  7936. ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
  7937. ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
  7938. cb(Qcur, "Qcur", il);
  7939. cb(Kcur, "Kcur", il);
  7940. cb(Vcur, "Vcur", il);
  7941. cur = build_attn(inp_attn,
  7942. model.layers[il].wo, model.layers[il].bo,
  7943. Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  7944. }
  7945. if (il == n_layer - 1 && inp_out_ids) {
  7946. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  7947. inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
  7948. }
  7949. // add the input
  7950. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
  7951. cb(ffn_inp, "ffn_inp", il);
  7952. // FF
  7953. {
  7954. cur = build_norm(ffn_inp,
  7955. model.layers[il].ffn_norm,
  7956. model.layers[il].ffn_norm_b,
  7957. LLM_NORM, il);
  7958. cb(cur, "ffn_norm", il);
  7959. cur = build_ffn(cur,
  7960. model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
  7961. NULL, NULL, NULL,
  7962. model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
  7963. NULL,
  7964. LLM_FFN_GELU, LLM_FFN_SEQ, il);
  7965. cb(cur, "ffn_out", il);
  7966. }
  7967. cur = ggml_add(ctx0, cur, ffn_inp);
  7968. cur = build_cvec(cur, il);
  7969. cb(cur, "l_out", il);
  7970. // input for next layer
  7971. inpL = cur;
  7972. }
  7973. cur = build_norm(inpL,
  7974. model.output_norm,
  7975. model.output_norm_b,
  7976. LLM_NORM, -1);
  7977. cb(cur, "result_norm", -1);
  7978. res->t_embd = cur;
  7979. cur = build_lora_mm(model.output, cur);
  7980. cb(cur, "result_output", -1);
  7981. res->t_logits = cur;
  7982. ggml_build_forward_expand(gf, cur);
  7983. }
  7984. };
  7985. struct llm_build_codeshell : public llm_graph_context {
  7986. llm_build_codeshell(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  7987. const int64_t n_embd_head = hparams.n_embd_head_v;
  7988. const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
  7989. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  7990. GGML_ASSERT(n_embd_head == hparams.n_rot);
  7991. ggml_tensor * cur;
  7992. ggml_tensor * inpL;
  7993. inpL = build_inp_embd(model.tok_embd);
  7994. // inp_pos - contains the positions
  7995. ggml_tensor * inp_pos = build_inp_pos();
  7996. auto * inp_attn = build_attn_inp_kv();
  7997. ggml_tensor * inp_out_ids = build_inp_out_ids();
  7998. for (int il = 0; il < n_layer; ++il) {
  7999. cur = build_norm(inpL,
  8000. model.layers[il].attn_norm,
  8001. model.layers[il].attn_norm_b,
  8002. LLM_NORM, il);
  8003. cb(cur, "attn_norm", il);
  8004. // self-attention
  8005. {
  8006. cur = build_lora_mm(model.layers[il].wqkv, cur);
  8007. cb(cur, "wqkv", il);
  8008. cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
  8009. cb(cur, "bqkv", il);
  8010. ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
  8011. ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
  8012. ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
  8013. Qcur = ggml_rope_ext(
  8014. ctx0, Qcur, inp_pos, nullptr,
  8015. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  8016. ext_factor, attn_factor, beta_fast, beta_slow
  8017. );
  8018. Kcur = ggml_rope_ext(
  8019. ctx0, Kcur, inp_pos, nullptr,
  8020. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  8021. ext_factor, attn_factor, beta_fast, beta_slow
  8022. );
  8023. cb(Qcur, "Qcur", il);
  8024. cb(Kcur, "Kcur", il);
  8025. cb(Vcur, "Vcur", il);
  8026. cur = build_attn(inp_attn,
  8027. model.layers[il].wo, model.layers[il].bo,
  8028. Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  8029. }
  8030. if (il == n_layer - 1 && inp_out_ids) {
  8031. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  8032. inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
  8033. }
  8034. // add the input
  8035. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
  8036. cb(ffn_inp, "ffn_inp", il);
  8037. // FF
  8038. {
  8039. cur = build_norm(ffn_inp,
  8040. model.layers[il].ffn_norm,
  8041. model.layers[il].ffn_norm_b,
  8042. LLM_NORM, il);
  8043. cb(cur, "ffn_norm", il);
  8044. cur = build_ffn(cur,
  8045. model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
  8046. NULL, NULL, NULL,
  8047. model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
  8048. NULL,
  8049. LLM_FFN_GELU, LLM_FFN_SEQ, il);
  8050. cb(cur, "ffn_out", il);
  8051. }
  8052. cur = ggml_add(ctx0, cur, ffn_inp);
  8053. cur = build_cvec(cur, il);
  8054. cb(cur, "l_out", il);
  8055. // input for next layer
  8056. inpL = cur;
  8057. }
  8058. cur = build_norm(inpL,
  8059. model.output_norm,
  8060. model.output_norm_b,
  8061. LLM_NORM, -1);
  8062. cb(cur, "result_norm", -1);
  8063. res->t_embd = cur;
  8064. cur = build_lora_mm(model.output, cur);
  8065. cb(cur, "result_output", -1);
  8066. res->t_logits = cur;
  8067. ggml_build_forward_expand(gf, cur);
  8068. }
  8069. };
  8070. struct llm_build_orion : public llm_graph_context {
  8071. llm_build_orion(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  8072. const int64_t n_embd_head = hparams.n_embd_head_v;
  8073. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  8074. GGML_ASSERT(n_embd_head == hparams.n_rot);
  8075. ggml_tensor * cur;
  8076. ggml_tensor * inpL;
  8077. inpL = build_inp_embd(model.tok_embd);
  8078. // inp_pos - contains the positions
  8079. ggml_tensor * inp_pos = build_inp_pos();
  8080. auto * inp_attn = build_attn_inp_kv();
  8081. ggml_tensor * inp_out_ids = build_inp_out_ids();
  8082. for (int il = 0; il < n_layer; ++il) {
  8083. ggml_tensor * inpSA = inpL;
  8084. // norm
  8085. cur = build_norm(inpL,
  8086. model.layers[il].attn_norm, model.layers[il].attn_norm_b,
  8087. LLM_NORM, il);
  8088. cb(cur, "attn_norm", il);
  8089. // self-attention
  8090. {
  8091. // compute Q and K and RoPE them
  8092. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  8093. cb(Qcur, "Qcur", il);
  8094. // if (model.layers[il].bq) {
  8095. // Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
  8096. // cb(Qcur, "Qcur", il);
  8097. // }
  8098. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  8099. cb(Kcur, "Kcur", il);
  8100. // if (model.layers[il].bk) {
  8101. // Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
  8102. // cb(Kcur, "Kcur", il);
  8103. // }
  8104. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  8105. cb(Vcur, "Vcur", il);
  8106. // if (model.layers[il].bv) {
  8107. // Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
  8108. // cb(Vcur, "Vcur", il);
  8109. // }
  8110. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  8111. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  8112. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  8113. Qcur = ggml_rope_ext(
  8114. ctx0, Qcur, inp_pos, nullptr,
  8115. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  8116. ext_factor, attn_factor, beta_fast, beta_slow
  8117. );
  8118. Kcur = ggml_rope_ext(
  8119. ctx0, Kcur, inp_pos, nullptr,
  8120. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  8121. ext_factor, attn_factor, beta_fast, beta_slow
  8122. );
  8123. cb(Qcur, "Qcur", il);
  8124. cb(Kcur, "Kcur", il);
  8125. cb(Vcur, "Vcur", il);
  8126. cur = build_attn(inp_attn,
  8127. model.layers[il].wo, NULL,
  8128. Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  8129. }
  8130. if (il == n_layer - 1 && inp_out_ids) {
  8131. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  8132. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  8133. }
  8134. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  8135. cb(ffn_inp, "ffn_inp", il);
  8136. // feed-forward network
  8137. cur = build_norm(ffn_inp,
  8138. model.layers[il].ffn_norm, model.layers[il].ffn_norm_b,
  8139. LLM_NORM, il);
  8140. cb(cur, "ffn_norm", il);
  8141. cur = build_ffn(cur,
  8142. model.layers[il].ffn_up, NULL, NULL,
  8143. model.layers[il].ffn_gate, NULL, NULL,
  8144. model.layers[il].ffn_down, NULL, NULL,
  8145. NULL,
  8146. LLM_FFN_SILU, LLM_FFN_PAR, il);
  8147. cb(cur, "ffn_out", il);
  8148. cur = ggml_add(ctx0, cur, ffn_inp);
  8149. cur = build_cvec(cur, il);
  8150. cb(cur, "l_out", il);
  8151. // input for next layer
  8152. inpL = cur;
  8153. }
  8154. cur = inpL;
  8155. cur = build_norm(cur,
  8156. model.output_norm, model.output_norm_b,
  8157. LLM_NORM, -1);
  8158. cb(cur, "result_norm", -1);
  8159. res->t_embd = cur;
  8160. // lm_head
  8161. cur = build_lora_mm(model.output, cur);
  8162. cb(cur, "result_output", -1);
  8163. res->t_logits = cur;
  8164. ggml_build_forward_expand(gf, cur);
  8165. }
  8166. };
  8167. struct llm_build_internlm2 : public llm_graph_context {
  8168. llm_build_internlm2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  8169. const int64_t n_embd_head = hparams.n_embd_head_v;
  8170. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  8171. GGML_ASSERT(n_embd_head == hparams.n_rot);
  8172. ggml_tensor * cur;
  8173. ggml_tensor * inpL;
  8174. inpL = build_inp_embd(model.tok_embd);
  8175. // inp_pos - contains the positions
  8176. ggml_tensor * inp_pos = build_inp_pos();
  8177. auto * inp_attn = build_attn_inp_kv();
  8178. ggml_tensor * inp_out_ids = build_inp_out_ids();
  8179. for (int il = 0; il < n_layer; ++il) {
  8180. ggml_tensor * inpSA = inpL;
  8181. // norm
  8182. cur = build_norm(inpL,
  8183. model.layers[il].attn_norm, NULL,
  8184. LLM_NORM_RMS, il);
  8185. cb(cur, "attn_norm", il);
  8186. // self-attention
  8187. {
  8188. // compute Q and K and RoPE them
  8189. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  8190. cb(Qcur, "Qcur", il);
  8191. if (model.layers[il].bq) {
  8192. Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
  8193. cb(Qcur, "Qcur", il);
  8194. }
  8195. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  8196. cb(Kcur, "Kcur", il);
  8197. if (model.layers[il].bk) {
  8198. Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
  8199. cb(Kcur, "Kcur", il);
  8200. }
  8201. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  8202. cb(Vcur, "Vcur", il);
  8203. if (model.layers[il].bv) {
  8204. Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
  8205. cb(Vcur, "Vcur", il);
  8206. }
  8207. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  8208. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  8209. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  8210. Qcur = ggml_rope_ext(
  8211. ctx0, Qcur, inp_pos, nullptr,
  8212. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  8213. ext_factor, attn_factor, beta_fast, beta_slow
  8214. );
  8215. Kcur = ggml_rope_ext(
  8216. ctx0, Kcur, inp_pos, nullptr,
  8217. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  8218. ext_factor, attn_factor, beta_fast, beta_slow
  8219. );
  8220. cb(Qcur, "Qcur", il);
  8221. cb(Kcur, "Kcur", il);
  8222. cb(Vcur, "Vcur", il);
  8223. cur = build_attn(inp_attn,
  8224. model.layers[il].wo, model.layers[il].bo,
  8225. Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  8226. }
  8227. if (il == n_layer - 1 && inp_out_ids) {
  8228. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  8229. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  8230. }
  8231. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  8232. cb(ffn_inp, "ffn_inp", il);
  8233. // feed-forward network
  8234. cur = build_norm(ffn_inp,
  8235. model.layers[il].ffn_norm, NULL,
  8236. LLM_NORM_RMS, il);
  8237. cb(cur, "ffn_norm", il);
  8238. cur = build_ffn(cur,
  8239. model.layers[il].ffn_up, NULL, NULL,
  8240. model.layers[il].ffn_gate, NULL, NULL,
  8241. model.layers[il].ffn_down, NULL, NULL,
  8242. NULL,
  8243. LLM_FFN_SILU, LLM_FFN_PAR, il);
  8244. cb(cur, "ffn_out", il);
  8245. cur = ggml_add(ctx0, cur, ffn_inp);
  8246. cur = build_cvec(cur, il);
  8247. cb(cur, "l_out", il);
  8248. // input for next layer
  8249. inpL = cur;
  8250. }
  8251. cur = inpL;
  8252. cur = build_norm(cur,
  8253. model.output_norm, NULL,
  8254. LLM_NORM_RMS, -1);
  8255. cb(cur, "result_norm", -1);
  8256. res->t_embd = cur;
  8257. // lm_head
  8258. cur = build_lora_mm(model.output, cur);
  8259. cb(cur, "result_output", -1);
  8260. res->t_logits = cur;
  8261. ggml_build_forward_expand(gf, cur);
  8262. }
  8263. };
  8264. struct llm_build_minicpm3 : public llm_graph_context {
  8265. llm_build_minicpm3(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
8266. // TODO: if the model varies, these parameters need to be read from the model
  8267. const int64_t n_embd_base = 256;
  8268. const float scale_embd = 12.0f;
  8269. const float scale_depth = 1.4f;
  8270. const float kq_scale = 1.0f / sqrtf(float(hparams.n_embd_head_k));
  8271. const uint32_t n_embd_head_qk_rope = hparams.n_rot;
  8272. const uint32_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
  8273. const uint32_t kv_lora_rank = hparams.n_lora_kv;
  8274. ggml_tensor * cur;
  8275. ggml_tensor * inpL;
  8276. inpL = build_inp_embd(model.tok_embd);
  8277. // scale the input embeddings
  8278. inpL = ggml_scale(ctx0, inpL, scale_embd);
  8279. cb(inpL, "inp_scaled", -1);
  8280. // inp_pos - contains the positions
  8281. ggml_tensor * inp_pos = build_inp_pos();
  8282. auto * inp_attn = build_attn_inp_kv();
  8283. ggml_tensor * inp_out_ids = build_inp_out_ids();
  8284. for (int il = 0; il < n_layer; ++il) {
  8285. ggml_tensor * inpSA = inpL;
  8286. ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
  8287. // norm
  8288. cur = build_norm(inpL,
  8289. model.layers[il].attn_norm, NULL,
  8290. LLM_NORM_RMS, il);
  8291. cb(cur, "attn_norm", il);
8292. // self-attention
  8293. {
  8294. ggml_tensor * q = NULL;
  8295. // {n_embd, q_lora_rank} * {n_embd, n_tokens} -> {q_lora_rank, n_tokens}
  8296. q = ggml_mul_mat(ctx0, model.layers[il].wq_a, cur);
  8297. cb(q, "q", il);
  8298. q = build_norm(q,
  8299. model.layers[il].attn_q_a_norm, NULL,
  8300. LLM_NORM_RMS, il);
  8301. cb(q, "q", il);
  8302. // {q_lora_rank, n_head * hparams.n_embd_head_k} * {q_lora_rank, n_tokens} -> {n_head * hparams.n_embd_head_k, n_tokens}
  8303. q = ggml_mul_mat(ctx0, model.layers[il].wq_b, q);
  8304. cb(q, "q", il);
  8305. // split into {n_head * n_embd_head_qk_nope, n_tokens}
  8306. ggml_tensor * q_nope = ggml_view_3d(ctx0, q, n_embd_head_qk_nope, n_head, n_tokens,
  8307. ggml_row_size(q->type, hparams.n_embd_head_k),
  8308. ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
  8309. 0);
  8310. cb(q_nope, "q_nope", il);
  8311. // and {n_head * n_embd_head_qk_rope, n_tokens}
  8312. ggml_tensor * q_pe = ggml_view_3d(ctx0, q, n_embd_head_qk_rope, n_head, n_tokens,
  8313. ggml_row_size(q->type, hparams.n_embd_head_k),
  8314. ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
  8315. ggml_row_size(q->type, n_embd_head_qk_nope));
  8316. cb(q_pe, "q_pe", il);
  8317. // {n_embd, kv_lora_rank + n_embd_head_qk_rope} * {n_embd, n_tokens} -> {kv_lora_rank + n_embd_head_qk_rope, n_tokens}
8318. ggml_tensor * kv_pe_compressed = ggml_mul_mat(ctx0, model.layers[il].wkv_a_mqa, cur);
8319. cb(kv_pe_compressed, "kv_pe_compressed", il);
8320. // split into {kv_lora_rank, n_tokens}
8321. ggml_tensor * kv_compressed = ggml_view_2d(ctx0, kv_pe_compressed, kv_lora_rank, n_tokens,
8322. kv_pe_compressed->nb[1],
8323. 0);
8324. cb(kv_compressed, "kv_compressed", il);
8325. // and {n_embd_head_qk_rope, n_tokens}
8326. ggml_tensor * k_pe = ggml_view_3d(ctx0, kv_pe_compressed, n_embd_head_qk_rope, 1, n_tokens,
8327. kv_pe_compressed->nb[1],
8328. kv_pe_compressed->nb[1],
8329. ggml_row_size(kv_pe_compressed->type, kv_lora_rank));
  8330. cb(k_pe, "k_pe", il);
  8331. kv_compressed = build_norm(kv_compressed,
  8332. model.layers[il].attn_kv_a_norm, NULL,
  8333. LLM_NORM_RMS, il);
  8334. cb(kv_compressed, "kv_compressed", il);
  8335. // {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)} * {kv_lora_rank, n_tokens} -> {n_head * (n_embd_head_qk_nope + n_embd_head_v), n_tokens}
  8336. ggml_tensor * kv = ggml_mul_mat(ctx0, model.layers[il].wkv_b, kv_compressed);
  8337. cb(kv, "kv", il);
  8338. // split into {n_head * n_embd_head_qk_nope, n_tokens}
  8339. ggml_tensor * k_nope = ggml_view_3d(ctx0, kv, n_embd_head_qk_nope, n_head, n_tokens,
  8340. ggml_row_size(kv->type, n_embd_head_qk_nope + hparams.n_embd_head_v),
  8341. ggml_row_size(kv->type, n_head * (n_embd_head_qk_nope + hparams.n_embd_head_v)),
  8342. 0);
  8343. cb(k_nope, "k_nope", il);
  8344. // and {n_head * n_embd_head_v, n_tokens}
  8345. ggml_tensor * v_states = ggml_view_3d(ctx0, kv, hparams.n_embd_head_v, n_head, n_tokens,
  8346. ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)),
  8347. ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)*n_head),
  8348. ggml_row_size(kv->type, (n_embd_head_qk_nope)));
  8349. cb(v_states, "v_states", il);
  8350. v_states = ggml_cont(ctx0, v_states);
  8351. cb(v_states, "v_states", il);
  8352. q_pe = ggml_rope_ext(
  8353. ctx0, q_pe, inp_pos, rope_factors,
  8354. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  8355. ext_factor, attn_factor, beta_fast, beta_slow
  8356. );
  8357. cb(q_pe, "q_pe", il);
  8358. // shared RoPE key
  8359. k_pe = ggml_rope_ext(
  8360. ctx0, k_pe, inp_pos, rope_factors,
  8361. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  8362. ext_factor, attn_factor, beta_fast, beta_slow
  8363. );
  8364. cb(k_pe, "k_pe", il);
  8365. ggml_tensor * q_states = ggml_concat(ctx0, q_nope, q_pe, 0);
  8366. cb(q_states, "q_states", il);
  8367. ggml_tensor * k_states = ggml_concat(ctx0, k_nope, ggml_repeat(ctx0, k_pe, q_pe), 0);
  8368. cb(k_states, "k_states", il);
  8369. cur = build_attn(inp_attn,
  8370. model.layers[il].wo, NULL,
  8371. q_states, k_states, v_states, nullptr, nullptr, nullptr, kq_scale, il);
  8372. }
  8373. if (il == n_layer - 1 && inp_out_ids) {
  8374. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  8375. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  8376. }
  8377. // scale_res - scale the hidden states for residual connection
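// both the attention and FFN branch outputs are scaled by scale_depth/sqrt(n_layer) before being added back to the residual stream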
  8378. const float scale_res = scale_depth/sqrtf(float(n_layer)); // TODO: is this correct?
  8379. cur = ggml_scale(ctx0, cur, scale_res);
  8380. cb(cur, "hidden_scaled", il);
  8381. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  8382. cb(ffn_inp, "ffn_inp", il);
  8383. // feed-forward network
  8384. {
  8385. cur = build_norm(ffn_inp,
  8386. model.layers[il].ffn_norm, NULL,
  8387. LLM_NORM_RMS, il);
  8388. cb(cur, "ffn_norm", il);
  8389. cur = build_ffn(cur,
  8390. model.layers[il].ffn_up, NULL, NULL,
  8391. model.layers[il].ffn_gate, NULL, NULL,
  8392. model.layers[il].ffn_down, NULL, NULL,
  8393. NULL,
  8394. LLM_FFN_SILU, LLM_FFN_PAR, il);
  8395. cb(cur, "ffn_out", il);
  8396. }
  8397. // scale the hidden states for residual connection
  8398. cur = ggml_scale(ctx0, cur, scale_res);
  8399. cb(cur, "hidden_scaled_ffn", il);
  8400. cur = ggml_add(ctx0, cur, ffn_inp);
  8401. cur = build_cvec(cur, il);
  8402. cb(cur, "l_out", il);
  8403. // input for next layer
  8404. inpL = cur;
  8405. }
  8406. cur = inpL;
  8407. cur = build_norm(cur,
  8408. model.output_norm, NULL,
  8409. LLM_NORM_RMS, -1);
  8410. cb(cur, "result_norm", -1);
  8411. res->t_embd = cur;
  8412. // lm_head scaling
  8413. const float scale_lmhead = float(n_embd_base)/float(n_embd);
  8414. cur = ggml_scale(ctx0, cur, scale_lmhead);
  8415. cb(cur, "lmhead_scaling", -1);
  8416. // lm_head
  8417. cur = build_lora_mm(model.output, cur);
  8418. cb(cur, "result_output", -1);
  8419. res->t_logits = cur;
  8420. ggml_build_forward_expand(gf, cur);
  8421. }
  8422. };
  8423. struct llm_build_gemma : public llm_graph_context {
  8424. llm_build_gemma(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  8425. const int64_t n_embd_head = hparams.n_embd_head_v;
  8426. ggml_tensor * cur;
  8427. ggml_tensor * inpL;
  8428. inpL = build_inp_embd(model.tok_embd);
  8429. inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd));
  8430. cb(inpL, "inp_scaled", -1);
  8431. // inp_pos - contains the positions
  8432. ggml_tensor * inp_pos = build_inp_pos();
  8433. auto * inp_attn = build_attn_inp_kv();
  8434. ggml_tensor * inp_out_ids = build_inp_out_ids();
  8435. for (int il = 0; il < n_layer; ++il) {
  8436. // norm
  8437. cur = build_norm(inpL,
  8438. model.layers[il].attn_norm, NULL,
  8439. LLM_NORM_RMS, il);
  8440. cb(cur, "attn_norm", il);
  8441. // self-attention
  8442. {
  8443. // compute Q and K and RoPE them
  8444. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  8445. cb(Qcur, "Qcur", il);
  8446. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  8447. cb(Kcur, "Kcur", il);
  8448. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  8449. cb(Vcur, "Vcur", il);
  8450. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  8451. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  8452. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  8453. Qcur = ggml_rope_ext(
  8454. ctx0, Qcur, inp_pos, nullptr,
  8455. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  8456. ext_factor, attn_factor, beta_fast, beta_slow);
  8457. Kcur = ggml_rope_ext(
  8458. ctx0, Kcur, inp_pos, nullptr,
  8459. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  8460. ext_factor, attn_factor, beta_fast, beta_slow);
  8461. cb(Qcur, "Qcur", il);
  8462. cb(Kcur, "Kcur", il);
  8463. cb(Vcur, "Vcur", il);
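// Q is pre-scaled by 1/sqrt(n_embd_head) here, so build_attn below is called with a kq_scale of 1.0f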
  8464. Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head)));
  8465. cb(Qcur, "Qcur_scaled", il);
  8466. cur = build_attn(inp_attn,
  8467. model.layers[il].wo, NULL,
  8468. Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f, il);
  8469. }
  8470. if (il == n_layer - 1 && inp_out_ids) {
  8471. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  8472. inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
  8473. }
  8474. ggml_tensor * sa_out = ggml_add(ctx0, cur, inpL);
  8475. cb(sa_out, "sa_out", il);
  8476. cur = build_norm(sa_out,
  8477. model.layers[il].ffn_norm, NULL,
  8478. LLM_NORM_RMS, il);
  8479. cb(cur, "ffn_norm", il);
  8480. // feed-forward network
  8481. {
  8482. cur = build_ffn(cur,
  8483. model.layers[il].ffn_up, NULL, NULL,
  8484. model.layers[il].ffn_gate, NULL, NULL,
  8485. model.layers[il].ffn_down, NULL, NULL,
  8486. NULL,
  8487. LLM_FFN_GELU, LLM_FFN_PAR, il);
  8488. cb(cur, "ffn_out", il);
  8489. }
  8490. cur = ggml_add(ctx0, cur, sa_out);
  8491. cur = build_cvec(cur, il);
  8492. cb(cur, "l_out", il);
  8493. // input for next layer
  8494. inpL = cur;
  8495. }
  8496. cur = inpL;
  8497. cur = build_norm(cur,
  8498. model.output_norm, NULL,
  8499. LLM_NORM_RMS, -1);
  8500. cb(cur, "result_norm", -1);
  8501. res->t_embd = cur;
  8502. // lm_head
  8503. cur = build_lora_mm(model.output, cur);
  8504. cb(cur, "result_output", -1);
  8505. res->t_logits = cur;
  8506. ggml_build_forward_expand(gf, cur);
  8507. }
  8508. };
  8509. struct llm_build_gemma2_iswa : public llm_graph_context {
  8510. llm_build_gemma2_iswa(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  8511. const int64_t n_embd_head = hparams.n_embd_head_k;
  8512. ggml_tensor * cur;
  8513. ggml_tensor * inpL;
  8514. inpL = build_inp_embd(model.tok_embd);
  8515. inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd));
  8516. cb(inpL, "inp_scaled", -1);
  8517. // inp_pos - contains the positions
  8518. ggml_tensor * inp_pos = build_inp_pos();
  8519. auto * inp_attn = build_attn_inp_kv_iswa();
  8520. ggml_tensor * inp_out_ids = build_inp_out_ids();
  8521. for (int il = 0; il < n_layer; ++il) {
  8522. // norm
  8523. cur = build_norm(inpL,
  8524. model.layers[il].attn_norm, NULL,
  8525. LLM_NORM_RMS, il);
  8526. cb(cur, "attn_norm", il);
  8527. // self-attention
  8528. {
  8529. // compute Q and K and RoPE them
  8530. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  8531. cb(Qcur, "Qcur", il);
  8532. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  8533. cb(Kcur, "Kcur", il);
  8534. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  8535. cb(Vcur, "Vcur", il);
  8536. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  8537. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  8538. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  8539. Qcur = ggml_rope_ext(
  8540. ctx0, Qcur, inp_pos, nullptr,
  8541. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  8542. ext_factor, attn_factor, beta_fast, beta_slow);
  8543. Kcur = ggml_rope_ext(
  8544. ctx0, Kcur, inp_pos, nullptr,
  8545. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  8546. ext_factor, attn_factor, beta_fast, beta_slow);
  8547. cb(Qcur, "Qcur", il);
  8548. cb(Kcur, "Kcur", il);
  8549. cb(Vcur, "Vcur", il);
  8550. Qcur = ggml_scale(ctx0, Qcur, hparams.f_attention_scale);
  8551. cur = build_attn(inp_attn,
  8552. model.layers[il].wo, NULL,
  8553. Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f, il);
  8554. }
  8555. if (il == n_layer - 1 && inp_out_ids) {
  8556. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  8557. inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
  8558. }
  8559. cur = build_norm(cur,
  8560. model.layers[il].attn_post_norm, NULL,
  8561. LLM_NORM_RMS, il);
  8562. cb(cur, "attn_post_norm", il);
  8563. ggml_tensor * sa_out = ggml_add(ctx0, cur, inpL);
  8564. cb(sa_out, "sa_out", il);
  8565. cur = build_norm(sa_out,
  8566. model.layers[il].ffn_norm, NULL,
  8567. LLM_NORM_RMS, il);
  8568. cb(cur, "ffn_norm", il);
  8569. // feed-forward network
  8570. {
  8571. cur = build_ffn(cur,
  8572. model.layers[il].ffn_up, NULL, NULL,
  8573. model.layers[il].ffn_gate, NULL, NULL,
  8574. model.layers[il].ffn_down, NULL, NULL,
  8575. NULL,
  8576. LLM_FFN_GELU, LLM_FFN_PAR, il);
  8577. cb(cur, "ffn_out", il);
  8578. }
  8579. cur = build_norm(cur,
  8580. model.layers[il].ffn_post_norm, NULL,
  8581. LLM_NORM_RMS, -1);
  8582. cb(cur, "ffn_post_norm", -1);
  8583. cur = ggml_add(ctx0, cur, sa_out);
  8584. cur = build_cvec(cur, il);
  8585. cb(cur, "l_out", il);
  8586. // input for next layer
  8587. inpL = cur;
  8588. }
  8589. cur = inpL;
  8590. cur = build_norm(cur,
  8591. model.output_norm, NULL,
  8592. LLM_NORM_RMS, -1);
  8593. cb(cur, "result_norm", -1);
  8594. res->t_embd = cur;
  8595. // lm_head
  8596. cur = build_lora_mm(model.output, cur);
  8597. // final logit soft-capping
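// softcap(x) = C * tanh(x / C) with C = f_final_logit_softcapping, keeping the logits within (-C, C)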
  8598. cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_final_logit_softcapping);
  8599. cur = ggml_tanh(ctx0, cur);
  8600. cur = ggml_scale(ctx0, cur, hparams.f_final_logit_softcapping);
  8601. cb(cur, "result_output", -1);
  8602. res->t_logits = cur;
  8603. ggml_build_forward_expand(gf, cur);
  8604. }
  8605. };
  8606. struct llm_build_gemma3_iswa : public llm_graph_context {
  8607. llm_build_gemma3_iswa(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  8608. const int64_t n_embd_head = hparams.n_embd_head_k;
  8609. ggml_tensor * cur;
  8610. ggml_tensor * inpL;
  8611. inpL = build_inp_embd(model.tok_embd);
8612. // important: do not apply the sqrt(n_embd) scaling to raw embedding inputs (i.e. encoded image embeddings)
  8613. if (ubatch.token) {
  8614. inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd));
  8615. cb(inpL, "inp_scaled", -1);
  8616. }
  8617. // inp_pos - contains the positions
  8618. ggml_tensor * inp_pos = build_inp_pos();
  8619. // TODO: is causal == true correct? might need some changes
  8620. auto * inp_attn = build_attn_inp_kv_iswa();
  8621. ggml_tensor * inp_out_ids = build_inp_out_ids();
  8622. for (int il = 0; il < n_layer; ++il) {
  8623. const float freq_base_l = model.get_rope_freq_base (cparams, il);
  8624. const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
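// note: layers can use different RoPE parameters (e.g. sliding-window vs global-attention layers), hence the per-layer lookup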
  8625. // norm
  8626. cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
  8627. cb(cur, "attn_norm", il);
  8628. // self-attention
  8629. {
  8630. // compute Q and K and RoPE them
  8631. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  8632. cb(Qcur, "Qcur", il);
  8633. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  8634. cb(Kcur, "Kcur", il);
  8635. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  8636. cb(Vcur, "Vcur", il);
  8637. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  8638. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  8639. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  8640. Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
  8641. cb(Qcur, "Qcur_normed", il);
  8642. Qcur = ggml_rope_ext(
  8643. ctx0, Qcur, inp_pos, nullptr,
  8644. n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
  8645. ext_factor, attn_factor, beta_fast, beta_slow);
  8646. Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
  8647. cb(Kcur, "Kcur_normed", il);
  8648. Kcur = ggml_rope_ext(
  8649. ctx0, Kcur, inp_pos, nullptr,
  8650. n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
  8651. ext_factor, attn_factor, beta_fast, beta_slow);
  8652. cb(Qcur, "Qcur", il);
  8653. cb(Kcur, "Kcur", il);
  8654. cb(Vcur, "Vcur", il);
  8655. // ref: https://github.com/google/gemma_pytorch/blob/014acb7ac4563a5f77c76d7ff98f31b568c16508/gemma/model.py#L315
  8656. Qcur = ggml_scale(ctx0, Qcur, hparams.f_attention_scale);
  8657. cur = build_attn(inp_attn,
  8658. model.layers[il].wo, NULL,
  8659. Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f, il);
  8660. }
  8661. if (il == n_layer - 1 && inp_out_ids) {
  8662. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  8663. inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
  8664. }
  8665. cur = build_norm(cur,
  8666. model.layers[il].attn_post_norm, NULL,
  8667. LLM_NORM_RMS, il);
  8668. cb(cur, "attn_post_norm", il);
  8669. ggml_tensor * sa_out = ggml_add(ctx0, cur, inpL);
  8670. cb(sa_out, "sa_out", il);
  8671. cur = build_norm(sa_out,
  8672. model.layers[il].ffn_norm, NULL,
  8673. LLM_NORM_RMS, il);
  8674. cb(cur, "ffn_norm", il);
  8675. // feed-forward network
  8676. {
  8677. cur = build_ffn(cur,
  8678. model.layers[il].ffn_up, NULL, NULL,
  8679. model.layers[il].ffn_gate, NULL, NULL,
  8680. model.layers[il].ffn_down, NULL, NULL,
  8681. NULL,
  8682. LLM_FFN_GELU, LLM_FFN_PAR, il);
  8683. cb(cur, "ffn_out", il);
  8684. }
  8685. cur = build_norm(cur,
  8686. model.layers[il].ffn_post_norm, NULL,
  8687. LLM_NORM_RMS, -1);
  8688. cb(cur, "ffn_post_norm", -1);
  8689. cur = ggml_add(ctx0, cur, sa_out);
  8690. cur = build_cvec(cur, il);
  8691. cb(cur, "l_out", il);
  8692. // input for next layer
  8693. inpL = cur;
  8694. }
  8695. cur = inpL;
  8696. cur = build_norm(cur,
  8697. model.output_norm, NULL,
  8698. LLM_NORM_RMS, -1);
  8699. cb(cur, "result_norm", -1);
  8700. res->t_embd = cur;
  8701. // lm_head
  8702. cur = build_lora_mm(model.output, cur);
  8703. cb(cur, "result_output", -1);
  8704. res->t_logits = cur;
  8705. ggml_build_forward_expand(gf, cur);
  8706. }
  8707. };
  8708. struct llm_build_gemma3n_iswa : public llm_graph_context {
  8709. const llama_model & model;
  8710. const int64_t n_embd_head;
  8711. const int64_t n_embd_altup;
  8712. const int64_t n_altup;
  8713. const int i_altup_act;
  8714. const int n_layer_sparsity = 10; // number of layers using activation sparsity
  8715. const float f_sparsity_std_mul = 1.6448533535003662f; // std_multiplier = normal_dist.icdf(0.95)
  8716. llm_build_gemma3n_iswa(const llama_model & model, const llm_graph_params & params)
  8717. : llm_graph_context(params),
  8718. model(model),
  8719. n_embd_head(model.hparams.n_embd_head_k),
  8720. n_embd_altup(model.hparams.n_embd_altup),
  8721. n_altup(model.hparams.n_altup),
  8722. i_altup_act(model.hparams.i_altup_act) {
  8723. ggml_tensor * cur;
  8724. ggml_tensor * inpL;
  8725. inpL = build_inp_embd(model.tok_embd);
8726. // important: do not apply the sqrt(n_embd) scaling to raw embedding inputs (i.e. encoded image embeddings)
  8727. if (ubatch.token) {
  8728. inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd));
  8729. cb(inpL, "inp_scaled", -1);
  8730. }
  8731. // inp_pos - contains the positions
  8732. ggml_tensor * inp_pos = build_inp_pos();
  8733. // TODO: is causal == true correct? might need some changes
  8734. auto * inp_attn = build_attn_inp_kv_iswa();
  8735. // inp_per_layer shape: [n_embd_altup, n_tokens, n_layer]
  8736. ggml_tensor * inp_per_layer = project_per_layer_inputs(inpL, get_per_layer_inputs());
  8737. // inpL now has only 1 altup, project it to the rest of the altups
8738. // these "added" altups will be concatenated to the last dim of inpL
  8739. {
  8740. ggml_tensor * target_magnitude = calc_magnitude(inpL);
  8741. ggml_tensor * inp_repeated = ggml_repeat_4d(ctx0, inpL, n_embd, n_tokens, n_altup - 1, 1);
  8742. ggml_tensor * altup_added = ggml_mul_mat(ctx0, model.altup_proj, inp_repeated); // shape: [n_embd, n_tokens, n_altup - 1]
  8743. ggml_tensor * new_magnitude = calc_magnitude(altup_added);
  8744. altup_added = ggml_div(ctx0,
  8745. ggml_mul(ctx0, altup_added, target_magnitude),
  8746. new_magnitude);
  8747. inpL = ggml_concat(ctx0, inpL, altup_added, 2); // shape: [n_embd, n_tokens, n_altup]
  8748. cb(inpL, "inp_stacked", -1);
  8749. }
  8750. // inpL now has shape: [n_embd, n_tokens, n_altup]
  8751. // inp_per_layer now has shape: [n_embd_altup, n_tokens, n_layer]
  8752. for (int il = 0; il < n_layer; ++il) {
8753. // this block is written to closely resemble Gemma3p5DecoderLayer in the python code
  8754. const float freq_base_l = model.get_rope_freq_base (cparams, il);
  8755. const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
  8756. ggml_tensor * cur = inpL; // [n_embd, n_tokens, n_altup]
  8757. ggml_tensor * predictions = altup_predict(cur, il); // [n_embd, n_tokens, n_altup]
  8758. // predicted value will go through self-attention and laurel
  8759. ggml_tensor * active_prediction = view_2d_slice(predictions, i_altup_act); // [n_embd, n_tokens]
  8760. cur = active_prediction;
  8761. cb(cur, "active_prediction", il);
  8762. // norm
  8763. cur = build_norm(cur, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
  8764. cb(cur, "attn_norm", il);
  8765. // laurel
  8766. ggml_tensor * laurel_out = laurel(cur, il); // [n_embd, n_tokens]
  8767. // self-attention
  8768. if (hparams.has_kv(il)) {
  8769. // compute Q and K and RoPE them
  8770. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  8771. cb(Qcur, "Qcur", il);
  8772. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  8773. cb(Kcur, "Kcur", il);
  8774. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  8775. cb(Vcur, "Vcur", il);
  8776. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  8777. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  8778. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  8779. Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
  8780. Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
  8781. Vcur = ggml_rms_norm(ctx0, Vcur, hparams.f_norm_rms_eps);
  8782. cb(Qcur, "Qcur_normed", il);
  8783. cb(Kcur, "Kcur_normed", il);
  8784. cb(Vcur, "Vcur_normed", il);
  8785. Qcur = ggml_rope_ext(
  8786. ctx0, Qcur, inp_pos, nullptr,
  8787. n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
  8788. ext_factor, attn_factor, beta_fast, beta_slow);
  8789. Kcur = ggml_rope_ext(
  8790. ctx0, Kcur, inp_pos, nullptr,
  8791. n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
  8792. ext_factor, attn_factor, beta_fast, beta_slow);
  8793. cb(Qcur, "Qcur_pos", il);
  8794. cb(Kcur, "Kcur_pos", il);
  8795. cur = build_attn(inp_attn,
  8796. model.layers[il].wo, NULL,
  8797. Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, hparams.f_attention_scale, il);
  8798. } else {
  8799. // reuse KV cache of earlier layers
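// (assumption) passing no K/V tensors makes build_attn read K/V from the cache entries written by an earlier layer that shares its KV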
  8800. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  8801. cb(Qcur, "Qcur", il);
  8802. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  8803. Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
  8804. cb(Qcur, "Qcur_normed", il);
  8805. Qcur = ggml_rope_ext(
  8806. ctx0, Qcur, inp_pos, nullptr,
  8807. n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
  8808. ext_factor, attn_factor, beta_fast, beta_slow);
  8809. cb(Qcur, "Qcur_pos", il);
  8810. cur = build_attn(inp_attn,
  8811. model.layers[il].wo, NULL,
  8812. Qcur, nullptr, nullptr, nullptr, nullptr, nullptr, hparams.f_attention_scale, il);
  8813. }
  8814. cur = build_norm(cur,
  8815. model.layers[il].attn_post_norm, NULL,
  8816. LLM_NORM_RMS, il);
  8817. cb(cur, "attn_post_norm", il);
  8818. cur = ggml_add(ctx0, cur, active_prediction); // [n_embd, n_tokens]
  8819. cb(cur, "attn_gated", il);
  8820. ggml_tensor * attn_laurel = ggml_scale(ctx0,
  8821. ggml_add(ctx0, cur, laurel_out),
  8822. 1.0f / sqrtf(2.0f)); // [n_embd, n_tokens]
  8823. cb(attn_laurel, "attn_laurel", il);
  8824. cur = build_norm(attn_laurel,
  8825. model.layers[il].ffn_norm, NULL,
  8826. LLM_NORM_RMS, il);
  8827. cb(cur, "ffn_norm", il);
  8828. // feed-forward network
  8829. {
  8830. ggml_tensor * up_proj = build_lora_mm(model.layers[il].ffn_up, cur);
  8831. ggml_tensor * gate_proj = build_lora_mm(model.layers[il].ffn_gate, cur);
  8832. if (il < n_layer_sparsity) {
  8833. // apply activation sparsity
  8834. gate_proj = gaussian_topk(gate_proj);
  8835. }
  8836. gate_proj = ggml_gelu(ctx0, gate_proj);
  8837. cur = ggml_mul(ctx0, up_proj, gate_proj);
  8838. cur = build_lora_mm(model.layers[il].ffn_down, cur);
  8839. cb(cur, "ffn_out", il);
  8840. }
  8841. cur = build_norm(cur,
  8842. model.layers[il].ffn_post_norm, NULL,
  8843. LLM_NORM_RMS, -1);
  8844. cb(cur, "ffn_post_norm", il);
  8845. ggml_tensor * attn_ffw_laurel_gated = ggml_add(ctx0, cur, attn_laurel); // [n_embd, n_tokens]
  8846. cb(attn_ffw_laurel_gated, "attn_ffw_laurel_gated", il);
  8847. ggml_tensor * corrected = altup_correct(predictions, attn_ffw_laurel_gated, il); // [n_embd, n_tokens, n_altup]
  8848. ggml_tensor * first_prediction; // [n_embd, n_tokens]
  8849. {
  8850. first_prediction = view_2d_slice(corrected, i_altup_act); // [n_embd, n_tokens]
  8851. first_prediction = ggml_mul(ctx0, first_prediction, model.layers[il].altup_correct_scale);
  8852. first_prediction = build_lora_mm(model.layers[il].per_layer_inp_gate, first_prediction);
  8853. first_prediction = ggml_gelu(ctx0, first_prediction); // [n_embd_altup, n_tokens]
  8854. cb(first_prediction, "first_prediction_gated", il);
  8855. ggml_tensor * inp_this_layer = view_2d_slice(inp_per_layer, il); // [n_embd_altup, n_tokens]
  8856. first_prediction = ggml_mul(ctx0, first_prediction, inp_this_layer); // [n_embd_altup, n_tokens]
  8857. cb(first_prediction, "first_prediction_scaled", il);
  8858. first_prediction = build_lora_mm(model.layers[il].per_layer_proj, first_prediction); // [n_embd, n_tokens]
  8859. first_prediction = build_norm(first_prediction,
  8860. model.layers[il].per_layer_post_norm, NULL,
  8861. LLM_NORM_RMS, il);
  8862. cb(first_prediction, "first_prediction_out", il);
  8863. }
  8864. // equivalent to python code: corrected_predictions[1:] += first_prediction
  8865. {
  8866. ggml_tensor * slice_first = view_2d_slice(corrected, 0);
  8867. ggml_tensor * slice_rest = ggml_view_3d(ctx0, corrected, n_embd, n_tokens, n_altup - 1,
  8868. ggml_row_size(corrected->type, n_embd),
  8869. ggml_row_size(corrected->type, n_embd*n_tokens),
  8870. n_embd*n_tokens*ggml_element_size(corrected));
  8871. ggml_tensor * tmp = ggml_add(ctx0, slice_rest, first_prediction); // [n_embd, n_tokens, n_altup - 1]
  8872. corrected = ggml_concat(ctx0, slice_first, tmp, 2); // [n_embd, n_tokens, n_altup]
  8873. }
  8874. cur = corrected; // [n_embd, n_tokens, n_altup]
  8875. cur = build_cvec(cur, il);
  8876. cb(cur, "l_out", il);
  8877. // input for next layer
  8878. inpL = cur;
  8879. }
  8880. cur = inpL; // [n_embd, n_tokens, n_altup]
  8881. // cur now has multiple altup(s), we want to merge them back to 1 altup
  8882. {
  8883. ggml_tensor * target_magnitude = calc_magnitude(view_2d_slice(cur, i_altup_act)); // [n_embd, n_tokens]
  8884. // do a view to skip the first slice (active altup)
  8885. ggml_tensor * alt_slice = ggml_view_3d(ctx0, cur, n_embd, n_tokens, n_altup - 1,
  8886. ggml_row_size(cur->type, n_embd),
  8887. ggml_row_size(cur->type, n_embd*n_tokens),
  8888. n_embd*n_tokens*ggml_element_size(cur));
  8889. ggml_tensor * altup_unembd = ggml_mul_mat(ctx0, model.altup_unembd_proj, alt_slice); // shape: [n_embd, n_tokens, n_altup - 1]
  8890. ggml_tensor * new_magnitude = calc_magnitude(altup_unembd);
  8891. altup_unembd = ggml_div(ctx0,
  8892. ggml_mul(ctx0, altup_unembd, target_magnitude),
  8893. new_magnitude);
  8894. cb(altup_unembd, "altup_unembd", -1);
  8895. // equivalent to torch.mean(hidden_states, dim=0)
  8896. cur = view_2d_slice(cur, 0); // [n_embd, n_tokens]
  8897. for (int i = 0; i < n_altup - 1; ++i) {
  8898. cur = ggml_add(ctx0, cur, view_2d_slice(altup_unembd, i));
  8899. }
  8900. cur = ggml_scale(ctx0, cur, 1.0f / float(n_altup)); // [n_embd, n_tokens]
  8901. cb(cur, "unembd_merged", -1);
  8902. }
  8903. // cur now has shape: [n_embd, n_tokens]
  8904. // TODO: move this to right after the last KV layer
  8905. {
  8906. // skip computing output for unused tokens
  8907. ggml_tensor * inp_out_ids = build_inp_out_ids();
  8908. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  8909. }
  8910. cur = build_norm(cur,
  8911. model.output_norm, NULL,
  8912. LLM_NORM_RMS, -1);
  8913. cb(cur, "result_norm", -1);
  8914. res->t_embd = cur;
  8915. cur = build_lora_mm(model.output, cur);
  8916. {
  8917. // final logit soft-capping
  8918. cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_final_logit_softcapping);
  8919. cur = ggml_tanh(ctx0, cur);
  8920. cur = ggml_scale(ctx0, cur, hparams.f_final_logit_softcapping);
  8921. }
  8922. cb(cur, "result_output", -1);
  8923. res->t_logits = cur;
  8924. ggml_build_forward_expand(gf, cur);
  8925. }
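// L2 norm along the first dim: sqrt(sum_i x_i^2), reducing [n_embd, ...] to [1, ...]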
  8926. ggml_tensor * calc_magnitude(ggml_tensor * x) {
  8927. return ggml_sqrt(ctx0, ggml_sum_rows(ctx0, ggml_sqr(ctx0, x)));
  8928. }
  8929. // get 2D slice view from a 3D tensor, the idx corresponds to the 3rd dim
  8930. ggml_tensor * view_2d_slice(ggml_tensor * x, int idx) {
  8931. GGML_ASSERT(idx < (int)x->ne[2]);
  8932. return ggml_view_2d(ctx0, x, x->ne[0], x->ne[1],
  8933. ggml_row_size(x->type, x->ne[0]),
  8934. idx * x->ne[0] * x->ne[1] * ggml_element_size(x));
  8935. }
  8936. // equivalent to get_per_layer_inputs() in python code
  8937. // output shape: [n_embd_altup, n_layer, n_tokens]
  8938. ggml_tensor * get_per_layer_inputs() {
  8939. auto inp = std::make_unique<llm_graph_input_embd>();
  8940. ggml_tensor * inp_per_layer;
  8941. if (ubatch.token) {
  8942. inp->tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ubatch.n_tokens);
  8943. ggml_set_input(inp->tokens);
  8944. res->t_tokens = inp->tokens;
  8945. inp_per_layer = ggml_get_rows(ctx0, model.tok_embd_per_layer, inp->tokens);
  8946. inp_per_layer = ggml_reshape_3d(ctx0, inp_per_layer, n_embd_altup, n_layer, n_tokens);
  8947. inp_per_layer = ggml_scale(ctx0, inp_per_layer, sqrtf((float)n_embd_altup));
  8948. cb(inp_per_layer, "inp_per_layer_selected", -1);
  8949. } else {
  8950. GGML_ABORT("TODO: support embd input");
  8951. }
  8952. res->add_input(std::move(inp));
  8953. return inp_per_layer;
  8954. }
  8955. // equivalent to project_per_layer_inputs() in python code
  8956. // this calculates the per-layer inputs, so the final tensor shape will have n_layer as the last dim
  8957. // output shape: [n_embd_altup, n_tokens, n_layer]
  8958. ggml_tensor * project_per_layer_inputs(ggml_tensor * inputs_embeds, ggml_tensor * inp_per_layer) {
  8959. const float per_layer_projection_scale = 1.0f / sqrtf((float)n_embd);
  8960. const float per_layer_input_scale = 1.0f / sqrtf(2.0f);
  8961. ggml_tensor * per_layer_proj = ggml_mul_mat(ctx0, model.per_layer_model_proj, inputs_embeds);
  8962. per_layer_proj = ggml_scale(ctx0, per_layer_proj, per_layer_projection_scale);
  8963. per_layer_proj = ggml_reshape_3d(ctx0, per_layer_proj, n_embd_altup, n_layer, n_tokens);
  8964. per_layer_proj = build_norm(per_layer_proj,
  8965. model.per_layer_proj_norm, NULL,
  8966. LLM_NORM_RMS, -1); // [n_embd_altup, n_layer, n_tokens]
  8967. cb(per_layer_proj, "per_layer_proj", -1);
  8968. inp_per_layer = ggml_add(ctx0, inp_per_layer, per_layer_proj);
  8969. inp_per_layer = ggml_scale(ctx0, inp_per_layer, per_layer_input_scale);
  8970. cb(inp_per_layer, "inp_per_layer", -1);
  8971. // permute to shape: [n_embd_altup, n_tokens, n_layer]
  8972. inp_per_layer = ggml_cont(ctx0, ggml_permute(ctx0, inp_per_layer, 0, 2, 1, 3));
  8973. return inp_per_layer;
  8974. }
8975. // input cur shape: [n_embd, n_tokens]
8976. // output shape: [n_embd, n_tokens]
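// low-rank residual branch: returns cur + rms_norm(laurel_r(laurel_l(cur)))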
  8977. ggml_tensor * laurel(ggml_tensor * cur, int il) {
  8978. ggml_tensor * tmp = cur;
  8979. tmp = build_lora_mm(model.layers[il].laurel_l, tmp);
  8980. tmp = build_lora_mm(model.layers[il].laurel_r, tmp);
  8981. tmp = build_norm(tmp, model.layers[il].laurel_post_norm, NULL, LLM_NORM_RMS, il);
  8982. tmp = ggml_add(ctx0, tmp, cur);
  8983. cb(tmp, "laurel_out", il);
  8984. return tmp;
  8985. }
  8986. // input x shape: [n_embd, n_tokens]
  8987. // output shape: [n_embd, n_tokens]
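// keeps only activations above cutoff = mean + f_sparsity_std_mul * std (computed per token);
// with std_mul = icdf(0.95) this keeps roughly the top 5% under a Gaussian assumption, via relu(x - cutoff)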
  8988. ggml_tensor * gaussian_topk(ggml_tensor * x) {
  8989. ggml_tensor * mean = ggml_mean(ctx0, x);
  8990. ggml_tensor * std = ggml_sqrt(ctx0, ggml_scale(ctx0,
  8991. ggml_sum_rows(ctx0, ggml_sqr(ctx0, ggml_sub(ctx0, x, mean))),
  8992. 1.0f / (float)(x->ne[0] - 1)
  8993. ));
  8994. ggml_tensor * cutoff_x = ggml_add(ctx0, mean, ggml_scale(ctx0, std, f_sparsity_std_mul));
  8995. return ggml_relu(ctx0, ggml_sub(ctx0, x, cutoff_x));
  8996. }
  8997. //
  8998. // altup functions
  8999. //
  9000. // equivalent to compute_router_modalities() in python code
  9001. // input x shape: [n_embd, n_tokens]
  9002. // output shape: [n_altup, n_tokens]
  9003. ggml_tensor * altup_compute_router_modalities(ggml_tensor * x, int il) {
  9004. ggml_tensor * router_inputs = build_norm(x,
  9005. model.layers[il].altup_router_norm, NULL,
  9006. LLM_NORM_RMS, il);
  9007. // router_input_scale
  9008. router_inputs = ggml_scale(ctx0, router_inputs, 1.0f / (float)n_embd);
  9009. ggml_tensor * output = ggml_mul_mat(ctx0, model.layers[il].altup_router, router_inputs);
  9010. return ggml_tanh(ctx0, output); // [n_altup, n_tokens]
  9011. }
  9012. // input cur shape: [n_embd, n_tokens, n_altup]
  9013. // output shape: [n_embd, n_tokens, n_altup]
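// each predicted stream is a learned mixture of all n_altup input streams, with per-token mixing
// coefficients produced by the router from the active stream, plus a residual add of cur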
  9014. ggml_tensor * altup_predict(ggml_tensor * cur, int il) {
  9015. ggml_tensor * activated = view_2d_slice(cur, i_altup_act); // [n_embd, n_tokens]
  9016. ggml_tensor * modalities = altup_compute_router_modalities(activated, il); // [n_altup, n_tokens]
  9017. cb(modalities, "modalities", il);
  9018. ggml_tensor * all_coefs = build_lora_mm(model.layers[il].altup_predict_coef, modalities);
  9019. cb(all_coefs, "all_coefs", il);
9020. // the first dim now has n_altup^2 elements, so reshape it to 2D (giving a 3D tensor overall)
  9021. all_coefs = ggml_reshape_3d(ctx0, all_coefs, n_altup, n_altup, n_tokens);
  9022. // permute to [n_altup, n_embd, n_tokens]
  9023. ggml_tensor * cur_permuted = ggml_cont(ctx0, ggml_permute(ctx0, cur, 1, 2, 0, 3));
  9024. ggml_tensor * predictions = ggml_mul_mat(ctx0, cur_permuted, all_coefs); // [n_altup, n_embd, n_tokens]
  9025. // final shape must be the same as cur: [n_embd, n_tokens, n_altup]
  9026. predictions = ggml_cont(ctx0, ggml_permute(ctx0, predictions, 0, 2, 1, 3));
  9027. predictions = ggml_add(ctx0, predictions, cur);
  9028. cb(predictions, "predictions", il);
  9029. return predictions;
  9030. }
  9031. // input predictions shape: [n_embd, n_tokens, n_altup]
  9032. // input activated shape: [n_embd, n_tokens]
  9033. // output shape: [n_embd, n_tokens, n_altup]
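// corrected_j = predictions_j + (coef_j + 1) * (activated - active_prediction),
// i.e. every stream is nudged by the shared innovation, scaled per stream by a router coefficient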
  9034. ggml_tensor * altup_correct(ggml_tensor * predictions, ggml_tensor * activated, int il) {
  9035. ggml_tensor * modalities = altup_compute_router_modalities(activated, il); // [n_altup, n_tokens]
  9036. cb(modalities, "modalities", il);
  9037. ggml_tensor * active_prediction = view_2d_slice(predictions, i_altup_act);
  9038. ggml_tensor * innovation = ggml_sub(ctx0, activated, active_prediction); // [n_embd, n_tokens]
  9039. cb(innovation, "innovation", il);
  9040. ggml_tensor * all_coefs = build_lora_mm(model.layers[il].altup_correct_coef, modalities); // [n_altup, n_tokens]
  9041. all_coefs = ggml_scale_bias(ctx0, all_coefs, 1.0f, 1.0f); // + 1.0
  9042. cb(all_coefs, "all_coefs", il);
  9043. all_coefs = ggml_transpose(ctx0, all_coefs); // [n_tokens, n_altup]
  9044. all_coefs = ggml_cont_3d(ctx0, all_coefs, 1, n_tokens, n_altup); // [1, n_tokens, n_altup]
  9045. innovation = ggml_repeat_4d(ctx0, innovation, n_embd, n_tokens, n_altup, 1);
  9046. ggml_tensor * corrected = ggml_mul(ctx0, innovation, all_coefs); // [n_embd, n_tokens, n_altup]
  9047. corrected = ggml_add(ctx0, corrected, predictions); // [n_embd, n_tokens, n_altup]
  9048. cb(corrected, "corrected", il);
  9049. return corrected;
  9050. }
  9051. };
  9052. struct llm_build_gemma_embedding_iswa : public llm_graph_context {
  9053. llm_build_gemma_embedding_iswa(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  9054. const int64_t n_embd_head = hparams.n_embd_head_k;
  9055. ggml_tensor * cur;
  9056. ggml_tensor * inpL;
  9057. inpL = build_inp_embd(model.tok_embd);
9058. // important: do not apply the sqrt(n_embd) scaling to raw embedding inputs (i.e. encoded image embeddings)
  9059. if (ubatch.token) {
  9060. inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd));
  9061. cb(inpL, "inp_scaled", -1);
  9062. }
  9063. // inp_pos - contains the positions
  9064. ggml_tensor * inp_pos = build_inp_pos();
  9065. // TODO: support cacheless iSWA embeddings [TAG_NO_CACHE_ISWA]
  9066. auto * inp_attn = build_attn_inp_kv_iswa();
  9067. ggml_tensor * inp_out_ids = build_inp_out_ids();
  9068. for (int il = 0; il < n_layer; ++il) {
  9069. const float freq_base_l = model.get_rope_freq_base (cparams, il);
  9070. const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
  9071. // norm
  9072. cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
  9073. cb(cur, "attn_norm", il);
  9074. // self-attention
  9075. {
  9076. // compute Q and K and RoPE them
  9077. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  9078. cb(Qcur, "Qcur", il);
  9079. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  9080. cb(Kcur, "Kcur", il);
  9081. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  9082. cb(Vcur, "Vcur", il);
  9083. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  9084. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  9085. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  9086. Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
  9087. cb(Qcur, "Qcur_normed", il);
  9088. Qcur = ggml_rope_ext(
  9089. ctx0, Qcur, inp_pos, nullptr,
  9090. n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
  9091. ext_factor, attn_factor, beta_fast, beta_slow);
  9092. Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
  9093. cb(Kcur, "Kcur_normed", il);
  9094. Kcur = ggml_rope_ext(
  9095. ctx0, Kcur, inp_pos, nullptr,
  9096. n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
  9097. ext_factor, attn_factor, beta_fast, beta_slow);
  9098. cb(Qcur, "Qcur", il);
  9099. cb(Kcur, "Kcur", il);
  9100. cb(Vcur, "Vcur", il);
  9101. // ref: https://github.com/google/gemma_pytorch/blob/014acb7ac4563a5f77c76d7ff98f31b568c16508/gemma/model.py#L315
  9102. Qcur = ggml_scale(ctx0, Qcur, hparams.f_attention_scale);
  9103. cur = build_attn(inp_attn,
  9104. model.layers[il].wo, NULL,
  9105. Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f, il);
  9106. }
  9107. if (il == n_layer - 1 && inp_out_ids) {
  9108. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  9109. inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
  9110. }
  9111. cur = build_norm(cur,
  9112. model.layers[il].attn_post_norm, NULL,
  9113. LLM_NORM_RMS, il);
  9114. cb(cur, "attn_post_norm", il);
  9115. ggml_tensor * sa_out = ggml_add(ctx0, cur, inpL);
  9116. cb(sa_out, "sa_out", il);
  9117. cur = build_norm(sa_out,
  9118. model.layers[il].ffn_norm, NULL,
  9119. LLM_NORM_RMS, il);
  9120. cb(cur, "ffn_norm", il);
  9121. // feed-forward network
  9122. {
  9123. cur = build_ffn(cur,
  9124. model.layers[il].ffn_up, NULL, NULL,
  9125. model.layers[il].ffn_gate, NULL, NULL,
  9126. model.layers[il].ffn_down, NULL, NULL,
  9127. NULL,
  9128. LLM_FFN_GELU, LLM_FFN_PAR, il);
  9129. cb(cur, "ffn_out", il);
  9130. }
  9131. cur = build_norm(cur,
  9132. model.layers[il].ffn_post_norm, NULL,
  9133. LLM_NORM_RMS, -1);
  9134. cb(cur, "ffn_post_norm", -1);
  9135. cur = ggml_add(ctx0, cur, sa_out);
  9136. cur = build_cvec(cur, il);
  9137. cb(cur, "l_out", il);
  9138. // input for next layer
  9139. inpL = cur;
  9140. }
  9141. cur = inpL;
  9142. cur = build_norm(cur,
  9143. model.output_norm, NULL,
  9144. LLM_NORM_RMS, -1);
  9145. cb(cur, "result_norm", -1);
  9146. res->t_embd = cur;
  9147. ggml_build_forward_expand(gf, cur);
  9148. }
  9149. };
  9150. // TODO: move up next to build_starcoder
  9151. struct llm_build_starcoder2 : public llm_graph_context {
  9152. llm_build_starcoder2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  9153. const int64_t n_embd_head = hparams.n_embd_head_v;
  9154. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  9155. GGML_ASSERT(n_embd_head == hparams.n_rot);
  9156. ggml_tensor * cur;
  9157. ggml_tensor * inpL;
  9158. inpL = build_inp_embd(model.tok_embd);
  9159. // inp_pos - contains the positions
  9160. ggml_tensor * inp_pos = build_inp_pos();
  9161. auto * inp_attn = build_attn_inp_kv();
  9162. ggml_tensor * inp_out_ids = build_inp_out_ids();
  9163. for (int il = 0; il < n_layer; ++il) {
  9164. ggml_tensor * inpSA = inpL;
  9165. // norm
  9166. cur = build_norm(inpL,
  9167. model.layers[il].attn_norm, model.layers[il].attn_norm_b,
  9168. LLM_NORM, il);
  9169. cb(cur, "attn_norm", il);
  9170. // self-attention
  9171. {
  9172. // compute Q and K and RoPE them
  9173. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  9174. cb(Qcur, "Qcur", il);
  9175. if (model.layers[il].bq) {
  9176. Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
  9177. cb(Qcur, "Qcur", il);
  9178. }
  9179. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  9180. cb(Kcur, "Kcur", il);
  9181. if (model.layers[il].bk) {
  9182. Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
  9183. cb(Kcur, "Kcur", il);
  9184. }
  9185. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  9186. cb(Vcur, "Vcur", il);
  9187. if (model.layers[il].bv) {
  9188. Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
  9189. cb(Vcur, "Vcur", il);
  9190. }
  9191. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  9192. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  9193. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  9194. Qcur = ggml_rope_ext(
  9195. ctx0, Qcur, inp_pos, nullptr,
  9196. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  9197. ext_factor, attn_factor, beta_fast, beta_slow
  9198. );
  9199. Kcur = ggml_rope_ext(
  9200. ctx0, Kcur, inp_pos, nullptr,
  9201. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  9202. ext_factor, attn_factor, beta_fast, beta_slow
  9203. );
  9204. cb(Qcur, "Qcur", il);
  9205. cb(Kcur, "Kcur", il);
  9206. cb(Vcur, "Vcur", il);
  9207. cur = build_attn(inp_attn,
  9208. model.layers[il].wo, model.layers[il].bo,
  9209. Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  9210. }
  9211. if (il == n_layer - 1 && inp_out_ids) {
  9212. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  9213. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  9214. }
  9215. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  9216. cb(ffn_inp, "ffn_inp", il);
  9217. // feed-forward network
  9218. cur = build_norm(ffn_inp,
  9219. model.layers[il].ffn_norm, model.layers[il].ffn_norm_b,
  9220. LLM_NORM, il);
  9221. cb(cur, "ffn_norm", il);
  9222. cur = build_ffn(cur,
  9223. model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
  9224. NULL, NULL, NULL,
  9225. model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
  9226. NULL,
  9227. LLM_FFN_GELU, LLM_FFN_SEQ, il);
  9228. cb(cur, "ffn_out", il);
  9229. cur = ggml_add(ctx0, cur, ffn_inp);
  9230. cur = build_cvec(cur, il);
  9231. cb(cur, "l_out", il);
  9232. // input for next layer
  9233. inpL = cur;
  9234. }
  9235. cur = inpL;
  9236. cur = build_norm(cur,
  9237. model.output_norm, model.output_norm_b,
  9238. LLM_NORM, -1);
  9239. cb(cur, "result_norm", -1);
  9240. res->t_embd = cur;
  9241. // lm_head
  9242. cur = build_lora_mm(model.output, cur);
  9243. cb(cur, "result_output", -1);
  9244. res->t_logits = cur;
  9245. ggml_build_forward_expand(gf, cur);
  9246. }
  9247. };
  9248. struct llm_graph_context_mamba : public llm_graph_context {
  9249. llm_graph_context_mamba(const llm_graph_params & params) : llm_graph_context(params) {}
  9250. ggml_tensor * build_mamba_layer(
  9251. llm_graph_input_rs * inp,
  9252. ggml_tensor * cur,
  9253. const llama_model & model,
  9254. const llama_ubatch & ubatch,
  9255. int il) {
  9256. const auto * mctx_cur = inp->mctx;
  9257. const auto kv_head = mctx_cur->get_head();
  9258. const auto & layer = model.layers[il];
  9259. const int64_t d_conv = hparams.ssm_d_conv;
  9260. const int64_t d_inner = hparams.ssm_d_inner;
  9261. const int64_t d_state = hparams.ssm_d_state;
  9262. const int64_t dt_rank = hparams.ssm_dt_rank;
  9263. const int64_t n_head = d_inner;
  9264. const int64_t head_dim = 1;
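// note: Mamba-1 is expressed here as d_inner heads of size 1, so the reshape and
// ggml_ssm_scan path below can be shared with the Mamba-2 implementation.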
  9265. const int64_t n_seqs = ubatch.n_seqs;
9266. // Some variants of the Mamba arch (e.g. FalconMamba) apply RMS norm on the B, C and Dt layers
  9267. const bool ssm_dt_b_c_rms = hparams.ssm_dt_b_c_rms;
  9268. const int64_t n_seq_tokens = ubatch.n_seq_tokens;
  9269. GGML_ASSERT(n_seqs != 0);
  9270. GGML_ASSERT(ubatch.equal_seqs());
  9271. GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs);
  9272. ggml_tensor * conv_states_all = mctx_cur->get_r_l(il);
  9273. ggml_tensor * ssm_states_all = mctx_cur->get_s_l(il);
  9274. ggml_tensor * conv = build_rs(inp, conv_states_all, hparams.n_embd_r(), n_seqs);
  9275. conv = ggml_reshape_3d(ctx0, conv, d_conv - 1, d_inner, n_seqs);
  9276. // {n_embd, n_tokens} => {n_embd, n_seq_tokens, n_seqs}
  9277. cur = ggml_reshape_3d(ctx0, cur, cur->ne[0], n_seq_tokens, n_seqs);
  9278. // {n_embd, 2*d_inner} @ {n_embd, n_seq_tokens, n_seqs} => {2*d_inner, n_seq_tokens, n_seqs}
  9279. ggml_tensor * xz = build_lora_mm(layer.ssm_in, cur);
  9280. // split the above in two
  9281. // => {d_inner, n_seq_tokens, n_seqs}
  9282. ggml_tensor * x = ggml_view_3d(ctx0, xz, d_inner, xz->ne[1], xz->ne[2], xz->nb[1], xz->nb[2], 0);
  9283. ggml_tensor * z = ggml_view_3d(ctx0, xz, d_inner, xz->ne[1], xz->ne[2], xz->nb[1], xz->nb[2], d_inner*ggml_element_size(xz));
  9284. // conv
  9285. {
  9286. // => {d_conv - 1 + n_seq_tokens, d_inner, n_seqs}
  9287. ggml_tensor * conv_x = ggml_concat(ctx0, conv, ggml_transpose(ctx0, x), 0);
  9288. // copy last (d_conv - 1) columns back into the state cache
  9289. ggml_tensor * last_conv = ggml_view_3d(ctx0, conv_x, d_conv - 1, d_inner, n_seqs, conv_x->nb[1], conv_x->nb[2], n_seq_tokens*(conv_x->nb[0]));
  9290. ggml_build_forward_expand(gf,
  9291. ggml_cpy(ctx0, last_conv,
  9292. ggml_view_1d(ctx0, conv_states_all,
  9293. (d_conv - 1)*(d_inner)*(n_seqs),
  9294. kv_head*(d_conv - 1)*(d_inner)*ggml_element_size(conv_states_all))));
  9295. // 1D convolution
9296. // This is equivalent to making a self-overlapping view of conv_x
9297. // over d_conv columns at each stride in the 3rd dimension,
9298. // multiplying it element-wise with the conv1d weight,
9299. // then summing the elements of each row
9300. // (the last two steps are a dot product over rows, also doable with mul_mat),
9301. // and finally permuting away the ne[0] dimension,
9302. // which leaves the resulting x tensor.
  9303. // For simultaneous sequences, all sequences need to have the same length.
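// schematically (a sketch of what ggml_ssm_conv computes), per channel d and step t:
//   x[d, t] = sum_{k=0..d_conv-1} conv_x[t + k, d] * ssm_conv1d[k, d]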
  9304. x = ggml_ssm_conv(ctx0, conv_x, layer.ssm_conv1d);
  9305. // bias
  9306. x = ggml_add(ctx0, x, layer.ssm_conv1d_b);
  9307. x = ggml_silu(ctx0, x);
  9308. }
  9309. // ssm
  9310. {
  9311. // {d_inner, dt_rank + 2*d_state} @ {d_inner, n_seq_tokens, n_seqs} => {dt_rank + 2*d_state, n_seq_tokens, n_seqs}
  9312. ggml_tensor * x_db = build_lora_mm(layer.ssm_x, x);
  9313. // split
  9314. ggml_tensor * dt = ggml_view_3d(ctx0, x_db, dt_rank, n_seq_tokens, n_seqs, x_db->nb[1], x_db->nb[2], 0);
  9315. ggml_tensor * B = ggml_view_4d(ctx0, x_db, d_state, /* n_group */ 1, n_seq_tokens, n_seqs, d_state*x_db->nb[0], x_db->nb[1], x_db->nb[2], ggml_element_size(x_db)*dt_rank);
  9316. ggml_tensor * C = ggml_view_4d(ctx0, x_db, d_state, /* n_group */ 1, n_seq_tokens, n_seqs, d_state*x_db->nb[0], x_db->nb[1], x_db->nb[2], ggml_element_size(x_db)*(dt_rank+d_state));
  9317. // Some Mamba variants (e.g. FalconMamba, Jamba) apply RMS norm in B, C & Dt layers
  9318. if (ssm_dt_b_c_rms || (layer.ssm_dt_norm && layer.ssm_b_norm && layer.ssm_c_norm)) {
  9319. dt = build_norm(dt, layer.ssm_dt_norm, NULL, LLM_NORM_RMS, il);
  9320. B = build_norm(B, layer.ssm_b_norm, NULL, LLM_NORM_RMS, il);
  9321. C = build_norm(C, layer.ssm_c_norm, NULL, LLM_NORM_RMS, il);
  9322. }
  9323. // {dt_rank, d_inner} @ {dt_rank, n_seq_tokens, n_seqs} => {d_inner, n_seq_tokens, n_seqs}
  9324. dt = build_lora_mm(layer.ssm_dt, dt);
  9325. dt = ggml_add(ctx0, dt, layer.ssm_dt_b);
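// stash x ({d_inner, n_seq_tokens, n_seqs}) in cur for the D skip connection applied after the scan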
  9326. cur = x;
  9327. x = ggml_reshape_4d(ctx0, x, head_dim, n_head, n_seq_tokens, n_seqs);
  9328. ggml_tensor * A = layer.ssm_a;
9329. // use the states and the indices provided by build_rs
9330. // (this is necessary in order to use the states before they are overwritten,
9331. // while avoiding unnecessary copies of the states)
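// schematically, ggml_ssm_scan implements the selective-scan recurrence per head/channel
// (a sketch of the math, not of the exact kernel):
//   h_t = exp(dt_t * A) * h_{t-1} + dt_t * B_t * x_t
//   y_t = C_t . h_t
// the D skip connection and the SiLU(z) gate are applied after the scan, below.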
  9332. auto get_ssm_rows = [&](ggml_context * ctx, ggml_tensor * states, ggml_tensor * ids) {
  9333. ggml_tensor * ssm = ggml_reshape_4d(ctx, states, d_state, head_dim, n_head, mctx_cur->get_size());
  9334. // Custom operator to optimize the parallel associative scan
  9335. // as described in the Annex D of the Mamba paper.
  9336. // => {d_inner, n_seq_tokens, n_seqs} and {d_state, d_inner, n_seqs}
  9337. return ggml_ssm_scan(ctx, ssm, x, dt, A, B, C, ids);
  9338. };
  9339. ggml_tensor * y_ssm = build_rs(inp, ssm_states_all, hparams.n_embd_s(), ubatch.n_seqs, get_ssm_rows);
  9340. // store last states
  9341. ggml_build_forward_expand(gf,
  9342. ggml_cpy(ctx0,
  9343. ggml_view_1d(ctx0, y_ssm, d_state*d_inner*n_seqs, x->nb[3]*x->ne[3]),
  9344. ggml_view_1d(ctx0, ssm_states_all, d_state*d_inner*n_seqs, kv_head*d_state*d_inner*ggml_element_size(ssm_states_all))));
  9345. ggml_tensor * y = ggml_view_3d(ctx0, y_ssm, d_inner, n_seq_tokens, n_seqs, x->nb[2], x->nb[3], 0);
  9346. // TODO: skip computing output earlier for unused tokens
  9347. y = ggml_add(ctx0, y, ggml_mul(ctx0, cur, layer.ssm_d));
  9348. y = ggml_swiglu_split(ctx0, ggml_cont(ctx0, z), y);
  9349. // {d_inner, n_embd} @ {d_inner, n_seq_tokens, n_seqs} => {n_embd, n_seq_tokens, n_seqs}
  9350. cur = build_lora_mm(layer.ssm_out, y);
  9351. }
  9352. // {n_embd, n_seq_tokens, n_seqs} => {n_embd, n_tokens}
  9353. cur = ggml_reshape_2d(ctx0, cur, cur->ne[0], n_seq_tokens * n_seqs);
  9354. return cur;
  9355. }
  9356. ggml_tensor * build_mamba2_layer(
  9357. llm_graph_input_rs * inp,
  9358. ggml_tensor * cur,
  9359. const llama_model & model,
  9360. const llama_ubatch & ubatch,
  9361. int il) const {
  9362. const auto * mctx_cur = inp->mctx;
  9363. const auto kv_head = mctx_cur->get_head();
  9364. const int64_t d_conv = hparams.ssm_d_conv;
  9365. const int64_t d_inner = hparams.ssm_d_inner;
  9366. const int64_t d_state = hparams.ssm_d_state;
  9367. const int64_t n_head = hparams.ssm_dt_rank;
  9368. const int64_t head_dim = d_inner / n_head;
  9369. const int64_t n_group = hparams.ssm_n_group;
  9370. const int64_t n_seqs = ubatch.n_seqs;
  9371. const int64_t n_seq_tokens = ubatch.n_seq_tokens;
  9372. GGML_ASSERT(n_seqs != 0);
  9373. GGML_ASSERT(ubatch.equal_seqs());
  9374. GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs);
  9375. ggml_tensor * conv_states_all = mctx_cur->get_r_l(il);
  9376. ggml_tensor * ssm_states_all = mctx_cur->get_s_l(il);
  9377. ggml_tensor * conv = build_rs(inp, conv_states_all, hparams.n_embd_r(), n_seqs);
  9378. conv = ggml_reshape_3d(ctx0, conv, d_conv - 1, d_inner + 2*n_group*d_state, n_seqs);
  9379. // {n_embd, n_tokens} => {n_embd, n_seq_tokens, n_seqs}
  9380. cur = ggml_reshape_3d(ctx0, cur, cur->ne[0], n_seq_tokens, n_seqs);
  9381. // d_in_proj = 2 * self.d_inner + 2 * self.ngroups * self.d_state + self.nheads
  9382. // {n_embd, d_in_proj} @ {n_embd, n_seq_tokens, n_seqs} => {d_in_proj, n_seq_tokens, n_seqs}
  9383. ggml_tensor * zxBCdt = build_lora_mm(model.layers[il].ssm_in, cur);
  9384. // split the above in three
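// layout of zxBCdt along ne[0] (a sketch based on the view offsets below):
//   [ z : d_inner | xBC : d_inner + 2*n_group*d_state | dt : n_head ]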
  9385. ggml_tensor * z = ggml_view_4d(ctx0, zxBCdt, head_dim, n_head, n_seq_tokens, n_seqs, head_dim*zxBCdt->nb[0], zxBCdt->nb[1], zxBCdt->nb[2], 0);
  9386. ggml_tensor * xBC = ggml_view_3d(ctx0, zxBCdt, d_inner + 2*n_group*d_state, n_seq_tokens, n_seqs, zxBCdt->nb[1], zxBCdt->nb[2], d_inner*ggml_element_size(zxBCdt));
  9387. ggml_tensor * dt = ggml_view_3d(ctx0, zxBCdt, n_head, n_seq_tokens, n_seqs, zxBCdt->nb[1], zxBCdt->nb[2], (2*d_inner + 2*n_group*d_state)*ggml_element_size(zxBCdt));
  9388. // conv
  9389. {
  9390. // => {d_conv - 1 + n_seq_tokens, d_inner + 2*n_group*d_state, n_seqs}
  9391. ggml_tensor * conv_x = ggml_concat(ctx0, conv, ggml_transpose(ctx0, xBC), 0);
  9392. // copy last (d_conv - 1) columns back into the state cache
  9393. ggml_tensor * last_conv = ggml_view_3d(ctx0, conv_x, d_conv - 1, d_inner + 2*n_group*d_state, n_seqs, conv_x->nb[1], conv_x->nb[2], n_seq_tokens*(conv_x->nb[0]));
  9394. ggml_build_forward_expand(gf,
  9395. ggml_cpy(ctx0, last_conv,
  9396. ggml_view_1d(ctx0, conv_states_all,
  9397. (d_conv - 1)*(d_inner + 2*n_group*d_state)*(n_seqs),
  9398. kv_head*(d_conv - 1)*(d_inner + 2*n_group*d_state)*ggml_element_size(conv_states_all))));
  9399. // 1D convolution
9400. // This is equivalent to making a self-overlapping view of conv_x
9401. // over d_conv columns at each stride in the 3rd dimension,
9402. // multiplying it element-wise with the conv1d weight,
9403. // then summing the elements of each row
9404. // (the last two steps are a dot product over rows, also doable with mul_mat),
9405. // and finally permuting away the ne[0] dimension,
9406. // which leaves the resulting xBC tensor.
  9407. // For simultaneous sequences, all sequences need to have the same length.
  9408. xBC = ggml_ssm_conv(ctx0, conv_x, model.layers[il].ssm_conv1d);
  9409. // bias
  9410. xBC = ggml_add(ctx0, xBC, model.layers[il].ssm_conv1d_b);
  9411. xBC = ggml_silu(ctx0, xBC);
  9412. }
  9413. // ssm
  9414. {
  9415. // These correspond to V K Q in SSM/attention duality
  9416. ggml_tensor * x = ggml_view_4d(ctx0, xBC, head_dim, n_head, n_seq_tokens, n_seqs, head_dim*xBC->nb[0], xBC->nb[1], xBC->nb[2], 0);
  9417. ggml_tensor * B = ggml_view_4d(ctx0, xBC, d_state, n_group, n_seq_tokens, n_seqs, d_state*xBC->nb[0], xBC->nb[1], xBC->nb[2], d_inner*ggml_element_size(xBC));
  9418. ggml_tensor * C = ggml_view_4d(ctx0, xBC, d_state, n_group, n_seq_tokens, n_seqs, d_state*xBC->nb[0], xBC->nb[1], xBC->nb[2], (d_inner + n_group*d_state)*ggml_element_size(xBC));
  9419. // {n_head, n_seq_tokens, n_seqs}
  9420. dt = ggml_add(ctx0, ggml_cont(ctx0, dt), model.layers[il].ssm_dt_b);
  9421. ggml_tensor * A = model.layers[il].ssm_a;
9422. // use the states and the indices provided by build_rs
9423. // (this is necessary in order to use the states before they are overwritten,
9424. // while avoiding unnecessary copies of the states)
  9425. auto get_ssm_rows = [&](ggml_context * ctx, ggml_tensor * states, ggml_tensor * ids) {
  9426. ggml_tensor * ssm = ggml_reshape_4d(ctx, states, d_state, head_dim, n_head, mctx_cur->get_size());
  9427. // TODO: use semistructured matrices to implement state-space duality
  9428. // => {d_inner, n_seq_tokens, n_seqs} and {d_state, d_inner, n_seqs}
  9429. return ggml_ssm_scan(ctx, ssm, x, dt, A, B, C, ids);
  9430. };
  9431. ggml_tensor * y_ssm = build_rs(inp, ssm_states_all, hparams.n_embd_s(), ubatch.n_seqs, get_ssm_rows);
  9432. // store last states
  9433. ggml_build_forward_expand(gf,
  9434. ggml_cpy(ctx0,
  9435. ggml_view_1d(ctx0, y_ssm, d_state*d_inner*n_seqs, ggml_nelements(x)*x->nb[0]),
  9436. ggml_view_1d(ctx0, ssm_states_all, d_state*d_inner*n_seqs, kv_head*d_state*d_inner*ggml_element_size(ssm_states_all))));
  9437. ggml_tensor * y = ggml_view_4d(ctx0, y_ssm, head_dim, n_head, n_seq_tokens, n_seqs, x->nb[1], n_head*x->nb[1], n_seq_tokens*n_head*x->nb[1], 0);
  9438. // TODO: skip computing output earlier for unused tokens
  9439. y = ggml_add(ctx0, y, ggml_mul(ctx0, x, model.layers[il].ssm_d));
  9440. y = ggml_swiglu_split(ctx0, ggml_cont(ctx0, z), y);
  9441. // grouped RMS norm
  9442. if (model.layers[il].ssm_norm) {
  9443. y = ggml_reshape_4d(ctx0, y, d_inner / n_group, n_group, n_seq_tokens, n_seqs);
  9444. y = build_norm(y, model.layers[il].ssm_norm, NULL, LLM_NORM_RMS, il);
  9445. }
  9446. y = ggml_reshape_3d(ctx0, y, d_inner, n_seq_tokens, n_seqs);
  9447. // {d_inner, n_embd} @ {d_inner, n_seq_tokens, n_seqs} => {n_embd, n_seq_tokens, n_seqs}
  9448. cur = build_lora_mm(model.layers[il].ssm_out, y);
  9449. }
  9450. // {n_embd, n_seq_tokens, n_seqs} => {n_embd, n_tokens}
  9451. cur = ggml_reshape_2d(ctx0, cur, cur->ne[0], n_seq_tokens * n_seqs);
  9452. cb(cur, "mamba_out", il);
  9453. return cur;
  9454. }
  9455. };
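// Graph builder for pure Mamba models: picks the Mamba-1 or Mamba-2 layer
// implementation from llm_graph_context_mamba based on model.arch.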
  9456. struct llm_build_mamba : public llm_graph_context_mamba {
  9457. llm_build_mamba(const llama_model & model, const llm_graph_params & params) : llm_graph_context_mamba(params) {
  9458. ggml_tensor * cur;
  9459. ggml_tensor * inpL;
  9460. // {n_embd, n_tokens}
  9461. inpL = build_inp_embd(model.tok_embd);
  9462. auto * rs_inp = build_rs_inp();
  9463. ggml_tensor * inp_out_ids = build_inp_out_ids();
  9464. for (int il = 0; il < n_layer; ++il) {
  9465. // norm
  9466. cur = build_norm(inpL,
  9467. model.layers[il].attn_norm, NULL,
  9468. LLM_NORM_RMS, il);
  9469. cb(cur, "attn_norm", il);
  9470. if (model.arch == LLM_ARCH_MAMBA2) {
  9471. cur = build_mamba2_layer(rs_inp, cur, model, ubatch, il);
  9472. } else {
  9473. cur = build_mamba_layer(rs_inp, cur, model, ubatch, il);
  9474. }
  9475. if (il == n_layer - 1 && inp_out_ids) {
  9476. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  9477. inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
  9478. }
  9479. // residual
  9480. cur = ggml_add(ctx0, cur, inpL);
  9481. cur = build_cvec(cur, il);
  9482. cb(cur, "l_out", il);
  9483. // input for next layer
  9484. inpL = cur;
  9485. }
  9486. // final rmsnorm
  9487. cur = build_norm(inpL, model.output_norm, NULL, LLM_NORM_RMS, -1);
  9488. cb(cur, "result_norm", -1);
  9489. res->t_embd = cur;
  9490. // lm_head
  9491. cur = build_lora_mm(model.output, cur);
  9492. cb(cur, "result_output", -1);
  9493. res->t_logits = cur;
  9494. ggml_build_forward_expand(gf, cur);
  9495. }
  9496. };
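// Jamba is a hybrid: layers with n_head_kv(il) == 0 are Mamba layers, the rest are
// RoPE-free attention layers; the FFN is dense or MoE depending on whether the layer
// has ffn_gate_inp.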
  9497. struct llm_build_jamba : public llm_graph_context_mamba {
  9498. llm_build_jamba(const llama_model & model, const llm_graph_params & params) : llm_graph_context_mamba(params) {
  9499. const int64_t n_embd_head = hparams.n_embd_head_v;
  9500. ggml_tensor * cur;
  9501. ggml_tensor * inpL;
  9502. // {n_embd, n_tokens}
  9503. inpL = build_inp_embd(model.tok_embd);
  9504. auto * inp_hybrid = build_inp_mem_hybrid();
  9505. ggml_tensor * inp_out_ids = build_inp_out_ids();
  9506. for (int il = 0; il < n_layer; ++il) {
  9507. const int64_t n_head_kv = hparams.n_head_kv(il);
  9508. cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
  9509. cb(cur, "attn_norm", il);
  9510. if (n_head_kv == 0) {
  9511. cur = build_mamba_layer(inp_hybrid->get_recr(), cur, model, ubatch, il);
  9512. } else {
  9513. // Attention
  9514. struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  9515. struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  9516. struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  9517. cb(Qcur, "Qcur", il);
  9518. cb(Kcur, "Kcur", il);
  9519. cb(Vcur, "Vcur", il);
  9520. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  9521. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  9522. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  9523. cb(Qcur, "Qcur", il);
  9524. cb(Kcur, "Kcur", il);
  9525. cb(Vcur, "Vcur", il);
  9526. // No RoPE :)
  9527. cur = build_attn(inp_hybrid->get_attn(),
  9528. model.layers[il].wo, NULL,
  9529. Qcur, Kcur, Vcur, NULL, NULL, NULL, 1.0f/sqrtf(float(n_embd_head)), il);
  9530. }
  9531. if (il == n_layer - 1 && inp_out_ids) {
  9532. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  9533. inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
  9534. }
  9535. // residual
  9536. struct ggml_tensor * ffn_inp = ggml_add(ctx0, inpL, cur);
9537. cb(ffn_inp, "ffn_inp", il);
  9538. cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il);
  9539. cb(cur, "ffn_norm", il);
  9540. // feed-forward network
  9541. if (model.layers[il].ffn_gate_inp == nullptr) {
  9542. // FFN
  9543. cur = build_ffn(cur,
  9544. model.layers[il].ffn_up, NULL, NULL,
  9545. model.layers[il].ffn_gate, NULL, NULL,
  9546. model.layers[il].ffn_down, NULL, NULL,
  9547. NULL,
  9548. LLM_FFN_SILU, LLM_FFN_PAR, il);
  9549. cb(cur, "ffn_out", il);
  9550. } else {
  9551. // MoE branch
  9552. cur = build_moe_ffn(cur,
  9553. model.layers[il].ffn_gate_inp,
  9554. model.layers[il].ffn_up_exps,
  9555. model.layers[il].ffn_gate_exps,
  9556. model.layers[il].ffn_down_exps,
  9557. nullptr,
  9558. n_expert, n_expert_used,
  9559. LLM_FFN_SILU, false,
  9560. false, 0.0,
  9561. LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
  9562. il);
  9563. cb(cur, "ffn_moe_out", il);
  9564. }
  9565. // residual
  9566. cur = ggml_add(ctx0, ffn_inp, cur);
  9567. cur = build_cvec(cur, il);
  9568. cb(cur, "l_out", il);
  9569. // input for next layer
  9570. inpL = cur;
  9571. }
  9572. // final rmsnorm
  9573. cur = build_norm(inpL, model.output_norm, NULL, LLM_NORM_RMS, -1);
  9574. cb(cur, "result_norm", -1);
  9575. res->t_embd = cur;
  9576. // lm_head
  9577. cur = build_lora_mm(model.output, cur);
  9578. cb(cur, "result_output", -1);
  9579. res->t_logits = cur;
  9580. ggml_build_forward_expand(gf, cur);
  9581. }
  9582. };
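// Command-R uses a parallel residual: attention and FFN are both computed from the
// same pre-attention norm output and added to the residual stream together; the
// logits are optionally scaled by f_logit_scale.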
  9583. struct llm_build_command_r : public llm_graph_context {
  9584. llm_build_command_r(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  9585. const int64_t n_embd_head = hparams.n_embd_head_v;
  9586. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  9587. const float f_logit_scale = hparams.f_logit_scale;
  9588. ggml_tensor * cur;
  9589. ggml_tensor * inpL;
  9590. inpL = build_inp_embd(model.tok_embd);
  9591. // inp_pos - contains the positions
  9592. ggml_tensor * inp_pos = build_inp_pos();
  9593. auto * inp_attn = build_attn_inp_kv();
  9594. ggml_tensor * inp_out_ids = build_inp_out_ids();
  9595. for (int il = 0; il < n_layer; ++il) {
  9596. // norm
  9597. cur = build_norm(inpL,
  9598. model.layers[il].attn_norm, NULL,
  9599. LLM_NORM, il);
  9600. cb(cur, "attn_norm", il);
  9601. ggml_tensor * ffn_inp = cur;
  9602. // self-attention
  9603. {
  9604. // compute Q and K and RoPE them
  9605. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  9606. cb(Qcur, "Qcur", il);
  9607. if (model.layers[il].bq) {
  9608. Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
  9609. cb(Qcur, "Qcur", il);
  9610. }
  9611. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  9612. cb(Kcur, "Kcur", il);
  9613. if (model.layers[il].bk) {
  9614. Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
  9615. cb(Kcur, "Kcur", il);
  9616. }
  9617. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  9618. cb(Vcur, "Vcur", il);
  9619. if (model.layers[il].bv) {
  9620. Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
  9621. cb(Vcur, "Vcur", il);
  9622. }
  9623. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  9624. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  9625. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  9626. if (model.layers[il].attn_q_norm) {
  9627. Qcur = build_norm(Qcur,
  9628. model.layers[il].attn_q_norm,
  9629. NULL,
  9630. LLM_NORM, il);
  9631. cb(Qcur, "Qcur", il);
  9632. }
  9633. Qcur = ggml_rope_ext(
  9634. ctx0, Qcur, inp_pos, nullptr,
  9635. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  9636. ext_factor, attn_factor, beta_fast, beta_slow
  9637. );
  9638. if (model.layers[il].attn_k_norm) {
  9639. Kcur = build_norm(Kcur,
  9640. model.layers[il].attn_k_norm,
  9641. NULL,
  9642. LLM_NORM, il);
  9643. cb(Kcur, "Kcur", il);
  9644. }
  9645. Kcur = ggml_rope_ext(
  9646. ctx0, Kcur, inp_pos, nullptr,
  9647. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  9648. ext_factor, attn_factor, beta_fast, beta_slow
  9649. );
  9650. cb(Qcur, "Qcur", il);
  9651. cb(Kcur, "Kcur", il);
  9652. cb(Vcur, "Vcur", il);
  9653. cur = build_attn(inp_attn,
  9654. model.layers[il].wo, model.layers[il].bo,
  9655. Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  9656. }
  9657. if (il == n_layer - 1 && inp_out_ids) {
  9658. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  9659. inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
  9660. ffn_inp = ggml_get_rows(ctx0, ffn_inp, inp_out_ids);
  9661. }
  9662. ggml_tensor * attn_out = cur;
  9663. // feed-forward network
  9664. {
  9665. cur = build_ffn(ffn_inp,
  9666. model.layers[il].ffn_up, NULL, NULL,
  9667. model.layers[il].ffn_gate, NULL, NULL,
  9668. model.layers[il].ffn_down, NULL, NULL,
  9669. NULL,
  9670. LLM_FFN_SILU, LLM_FFN_PAR, il);
  9671. cb(cur, "ffn_out", il);
  9672. }
  9673. // add together residual + FFN + self-attention
  9674. cur = ggml_add(ctx0, cur, inpL);
  9675. cur = ggml_add(ctx0, cur, attn_out);
  9676. cur = build_cvec(cur, il);
  9677. cb(cur, "l_out", il);
  9678. // input for next layer
  9679. inpL = cur;
  9680. }
  9681. cur = inpL;
  9682. cur = build_norm(cur,
  9683. model.output_norm, NULL,
  9684. LLM_NORM, -1);
  9685. cb(cur, "result_norm", -1);
  9686. res->t_embd = cur;
  9687. // lm_head
  9688. cur = build_lora_mm(model.output, cur);
  9689. if (f_logit_scale) {
  9690. cur = ggml_scale(ctx0, cur, f_logit_scale);
  9691. }
  9692. cb(cur, "result_output", -1);
  9693. res->t_logits = cur;
  9694. ggml_build_forward_expand(gf, cur);
  9695. }
  9696. };
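// Cohere2 with interleaved sliding-window attention (iSWA): only the sliding-window
// layers apply RoPE (with per-layer rope factors), the full-attention layers use no
// positional encoding; otherwise the parallel attention/FFN layout matches Command-R.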
  9697. struct llm_build_cohere2_iswa : public llm_graph_context {
  9698. llm_build_cohere2_iswa(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  9699. const int64_t n_embd_head = hparams.n_embd_head_v;
  9700. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  9701. const float f_logit_scale = hparams.f_logit_scale;
  9702. ggml_tensor * cur;
  9703. ggml_tensor * inpL;
  9704. inpL = build_inp_embd(model.tok_embd);
  9705. // inp_pos - contains the positions
  9706. ggml_tensor * inp_pos = build_inp_pos();
  9707. auto * inp_attn = build_attn_inp_kv_iswa();
  9708. ggml_tensor * inp_out_ids = build_inp_out_ids();
  9709. for (int il = 0; il < n_layer; ++il) {
  9710. const bool is_swa = hparams.is_swa(il);
  9711. // norm
  9712. cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM, il);
  9713. cb(cur, "attn_norm", il);
  9714. ggml_tensor * ffn_inp = cur;
  9715. // self-attention
  9716. {
  9717. // rope freq factors for 128k context
  9718. ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
  9719. // compute Q and K and RoPE them
  9720. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  9721. cb(Qcur, "Qcur", il);
  9722. if (model.layers[il].bq) {
  9723. Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
  9724. cb(Qcur, "Qcur", il);
  9725. }
  9726. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  9727. cb(Kcur, "Kcur", il);
  9728. if (model.layers[il].bk) {
  9729. Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
  9730. cb(Kcur, "Kcur", il);
  9731. }
  9732. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  9733. cb(Vcur, "Vcur", il);
  9734. if (model.layers[il].bv) {
  9735. Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
  9736. cb(Vcur, "Vcur", il);
  9737. }
  9738. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  9739. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  9740. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  9741. if (is_swa) {
  9742. Qcur = ggml_rope_ext(
  9743. ctx0, Qcur, inp_pos, rope_factors,
  9744. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  9745. ext_factor, attn_factor, beta_fast, beta_slow
  9746. );
  9747. Kcur = ggml_rope_ext(
  9748. ctx0, Kcur, inp_pos, rope_factors,
  9749. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  9750. ext_factor, attn_factor, beta_fast, beta_slow
  9751. );
  9752. }
  9753. cb(Qcur, "Qcur", il);
  9754. cb(Kcur, "Kcur", il);
  9755. cb(Vcur, "Vcur", il);
  9756. cur = build_attn(inp_attn,
  9757. model.layers[il].wo, model.layers[il].bo,
  9758. Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  9759. }
  9760. if (il == n_layer - 1 && inp_out_ids) {
  9761. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  9762. inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
  9763. ffn_inp = ggml_get_rows(ctx0, ffn_inp, inp_out_ids);
  9764. }
  9765. ggml_tensor * attn_out = cur;
  9766. // feed-forward network
  9767. {
  9768. cur = build_ffn(ffn_inp, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate,
  9769. NULL, NULL, model.layers[il].ffn_down, NULL, NULL, NULL, LLM_FFN_SILU, LLM_FFN_PAR,
  9770. il);
  9771. cb(cur, "ffn_out", il);
  9772. }
  9773. // add together residual + FFN + self-attention
  9774. cur = ggml_add(ctx0, cur, inpL);
  9775. cur = ggml_add(ctx0, cur, attn_out);
  9776. cur = build_cvec(cur, il);
  9777. cb(cur, "l_out", il);
  9778. // input for next layer
  9779. inpL = cur;
  9780. }
  9781. cur = inpL;
  9782. cur = build_norm(cur, model.output_norm, NULL, LLM_NORM, -1);
  9783. cb(cur, "result_norm", -1);
  9784. res->t_embd = cur;
  9785. // lm_head
  9786. cur = build_lora_mm(model.output, cur);
  9787. if (f_logit_scale) {
  9788. cur = ggml_scale(ctx0, cur, f_logit_scale);
  9789. }
  9790. cb(cur, "result_output", -1);
  9791. res->t_logits = cur;
  9792. ggml_build_forward_expand(gf, cur);
  9793. }
  9794. };
  9795. // ref: https://allenai.org/olmo
  9796. // based on the original build_llama() function, changes:
  9797. // * non-parametric layer norm
  9798. // * clamp qkv
  9799. // * removed bias
  9800. // * removed MoE
  9801. struct llm_build_olmo : public llm_graph_context {
  9802. llm_build_olmo(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  9803. const int64_t n_embd_head = hparams.n_embd_head_v;
  9804. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  9805. GGML_ASSERT(n_embd_head == hparams.n_rot);
  9806. ggml_tensor * cur;
  9807. ggml_tensor * inpL;
  9808. inpL = build_inp_embd(model.tok_embd);
  9809. // inp_pos - contains the positions
  9810. ggml_tensor * inp_pos = build_inp_pos();
  9811. auto * inp_attn = build_attn_inp_kv();
  9812. ggml_tensor * inp_out_ids = build_inp_out_ids();
  9813. for (int il = 0; il < n_layer; ++il) {
  9814. ggml_tensor * inpSA = inpL;
  9815. // norm
  9816. cur = build_norm(inpL,
  9817. NULL, NULL,
  9818. LLM_NORM, il);
  9819. cb(cur, "attn_norm", il);
  9820. // self-attention
  9821. {
  9822. // compute Q and K and RoPE them
  9823. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  9824. cb(Qcur, "Qcur", il);
  9825. if (hparams.f_clamp_kqv > 0.0f) {
  9826. Qcur = ggml_clamp(ctx0, Qcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
  9827. cb(Qcur, "Qcur", il);
  9828. }
  9829. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  9830. cb(Kcur, "Kcur", il);
  9831. if (hparams.f_clamp_kqv > 0.0f) {
  9832. Kcur = ggml_clamp(ctx0, Kcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
  9833. cb(Kcur, "Kcur", il);
  9834. }
  9835. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  9836. cb(Vcur, "Vcur", il);
  9837. if (hparams.f_clamp_kqv > 0.0f) {
  9838. Vcur = ggml_clamp(ctx0, Vcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
  9839. cb(Vcur, "Vcur", il);
  9840. }
  9841. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  9842. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  9843. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  9844. Qcur = ggml_rope_ext(
  9845. ctx0, Qcur, inp_pos, nullptr,
  9846. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  9847. ext_factor, attn_factor, beta_fast, beta_slow
  9848. );
  9849. Kcur = ggml_rope_ext(
  9850. ctx0, Kcur, inp_pos, nullptr,
  9851. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  9852. ext_factor, attn_factor, beta_fast, beta_slow
  9853. );
  9854. cb(Qcur, "Qcur", il);
  9855. cb(Kcur, "Kcur", il);
  9856. cb(Vcur, "Vcur", il);
  9857. cur = build_attn(inp_attn,
  9858. model.layers[il].wo, nullptr,
  9859. Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  9860. }
  9861. if (il == n_layer - 1 && inp_out_ids) {
  9862. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  9863. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  9864. }
  9865. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  9866. cb(ffn_inp, "ffn_inp", il);
  9867. // feed-forward network
  9868. cur = build_norm(ffn_inp,
  9869. NULL, NULL,
  9870. LLM_NORM, il);
  9871. cb(cur, "ffn_norm", il);
  9872. cur = build_ffn(cur,
  9873. model.layers[il].ffn_up, NULL, NULL,
  9874. model.layers[il].ffn_gate, NULL, NULL,
  9875. model.layers[il].ffn_down, NULL, NULL,
  9876. NULL,
  9877. LLM_FFN_SILU, LLM_FFN_PAR, il);
  9878. cb(cur, "ffn_out", il);
  9879. cur = ggml_add(ctx0, cur, ffn_inp);
  9880. cb(cur, "ffn_out", il);
  9881. cur = build_cvec(cur, il);
  9882. cb(cur, "l_out", il);
  9883. // input for next layer
  9884. inpL = cur;
  9885. }
  9886. cur = inpL;
  9887. cur = build_norm(cur,
  9888. NULL, NULL,
  9889. LLM_NORM, -1);
  9890. cb(cur, "result_norm", -1);
  9891. res->t_embd = cur;
  9892. // lm_head
  9893. cur = build_lora_mm(model.output, cur);
  9894. cb(cur, "result_output", -1);
  9895. res->t_logits = cur;
  9896. ggml_build_forward_expand(gf, cur);
  9897. }
  9898. };
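// OLMo-2 family: the layer input is not normalized; instead RMS norm is applied to
// the attention and FFN outputs (post-norm), and Q/K get RMS norm before RoPE.
// The iswa template parameter selects the interleaved sliding-window attention
// variant (sliding-window layers use plain RoPE without YaRN scaling, see below).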
  9899. template <bool iswa>
  9900. struct llm_build_olmo2 : public llm_graph_context {
  9901. llm_build_olmo2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  9902. const int64_t n_embd_head = hparams.n_embd_head_v;
  9903. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  9904. GGML_ASSERT(n_embd_head == hparams.n_rot);
  9905. ggml_tensor * cur;
  9906. ggml_tensor * inpL;
  9907. inpL = build_inp_embd(model.tok_embd);
  9908. // inp_pos - contains the positions
  9909. ggml_tensor * inp_pos = build_inp_pos();
  9910. using inp_attn_type = std::conditional_t<iswa, llm_graph_input_attn_kv_iswa, llm_graph_input_attn_kv>;
  9911. inp_attn_type * inp_attn = nullptr;
  9912. if constexpr (iswa) {
  9913. inp_attn = build_attn_inp_kv_iswa();
  9914. } else {
  9915. inp_attn = build_attn_inp_kv();
  9916. }
  9917. ggml_tensor * inp_out_ids = build_inp_out_ids();
  9918. for (int il = 0; il < n_layer; ++il) {
  9919. ggml_tensor * inpSA = inpL;
  9920. cur = inpL;
  9921. // self_attention
  9922. {
  9923. // compute Q and K and RoPE them
  9924. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  9925. cb(Qcur, "Qcur", il);
  9926. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  9927. cb(Kcur, "Kcur", il);
  9928. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  9929. cb(Vcur, "Vcur", il);
  9930. Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL,
  9931. LLM_NORM_RMS, il);
  9932. cb(Qcur, "Qcur_normed", il);
  9933. Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL,
  9934. LLM_NORM_RMS, il);
  9935. cb(Kcur, "Kcur_normed", il);
  9936. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  9937. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  9938. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  9939. const bool is_swa = hparams.is_swa(il);
  9940. if (is_swa) {
9941. // For sliding-window layers, Olmo3 uses regular RoPE with no YaRN rope scaling.
  9942. // This is achieved here by setting freq_scale and attn_factor to 1.
  9943. // We also set ext_factor to 0 to avoid a few unnecessary computations.
  9944. Qcur = ggml_rope_ext(
  9945. ctx0, Qcur, inp_pos, nullptr,
  9946. n_rot, rope_type, n_ctx_orig, freq_base, 1.0,
  9947. 0.0, 1.0, beta_fast, beta_slow
  9948. );
  9949. Kcur = ggml_rope_ext(
  9950. ctx0, Kcur, inp_pos, nullptr,
  9951. n_rot, rope_type, n_ctx_orig, freq_base, 1.0,
  9952. 0.0, 1.0, beta_fast, beta_slow
  9953. );
  9954. } else {
  9955. Qcur = ggml_rope_ext(
  9956. ctx0, Qcur, inp_pos, nullptr,
  9957. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  9958. ext_factor, attn_factor, beta_fast, beta_slow
  9959. );
  9960. Kcur = ggml_rope_ext(
  9961. ctx0, Kcur, inp_pos, nullptr,
  9962. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  9963. ext_factor, attn_factor, beta_fast, beta_slow
  9964. );
  9965. }
  9966. cb(Qcur, "Qcur", il);
  9967. cb(Kcur, "Kcur", il);
  9968. cb(Vcur, "Vcur", il);
  9969. cur = build_attn(inp_attn,
  9970. model.layers[il].wo, NULL,
  9971. Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  9972. }
  9973. if (il == n_layer - 1 && inp_out_ids) {
  9974. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  9975. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  9976. }
  9977. cur = build_norm(cur,
  9978. model.layers[il].attn_post_norm, NULL,
  9979. LLM_NORM_RMS, il);
  9980. cb(cur, "attn_post_norm", il);
  9981. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  9982. cb(ffn_inp, "ffn_inp", il);
  9983. // feed-forward network
  9984. cur = build_ffn(ffn_inp,
  9985. model.layers[il].ffn_up, NULL, NULL,
  9986. model.layers[il].ffn_gate, NULL, NULL,
  9987. model.layers[il].ffn_down, NULL, NULL,
  9988. NULL,
  9989. LLM_FFN_SILU, LLM_FFN_PAR, il);
  9990. cb(cur, "ffn_out", il);
  9991. cur = build_norm(cur,
  9992. model.layers[il].ffn_post_norm, NULL,
  9993. LLM_NORM_RMS, -1);
  9994. cb(cur, "ffn_post_norm", -1);
  9995. cur = ggml_add(ctx0, cur, ffn_inp);
  9996. cb(cur, "ffn_out", il);
  9997. cur = build_cvec(cur, il);
  9998. cb(cur, "l_out", il);
  9999. // input for next layer
  10000. inpL = cur;
  10001. }
  10002. cur = inpL;
  10003. cur = build_norm(cur,
  10004. model.output_norm, NULL,
  10005. LLM_NORM_RMS, -1);
  10006. cb(cur, "result_norm", -1);
  10007. res->t_embd = cur;
  10008. // lm_head
  10009. cur = build_lora_mm(model.output, cur);
  10010. cb(cur, "result_output", -1);
  10011. res->t_logits = cur;
  10012. ggml_build_forward_expand(gf, cur);
  10013. }
  10014. };
  10015. // based on the build_qwen2moe() function, changes:
  10016. // * removed shared experts
  10017. // * removed bias
  10018. // * added q, k norm
  10019. struct llm_build_olmoe : public llm_graph_context {
  10020. llm_build_olmoe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  10021. const int64_t n_embd_head = hparams.n_embd_head_v;
  10022. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  10023. GGML_ASSERT(n_embd_head == hparams.n_rot);
  10024. ggml_tensor * cur;
  10025. ggml_tensor * inpL;
  10026. inpL = build_inp_embd(model.tok_embd);
  10027. // inp_pos - contains the positions
  10028. ggml_tensor * inp_pos = build_inp_pos();
  10029. auto * inp_attn = build_attn_inp_kv();
  10030. ggml_tensor * inp_out_ids = build_inp_out_ids();
  10031. for (int il = 0; il < n_layer; ++il) {
  10032. ggml_tensor * inpSA = inpL;
  10033. // norm
  10034. cur = build_norm(inpL,
  10035. model.layers[il].attn_norm, NULL,
  10036. LLM_NORM_RMS, il);
  10037. cb(cur, "attn_norm", il);
  10038. // self_attention
  10039. {
  10040. // compute Q and K and RoPE them
  10041. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  10042. cb(Qcur, "Qcur", il);
  10043. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  10044. cb(Kcur, "Kcur", il);
  10045. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  10046. cb(Vcur, "Vcur", il);
  10047. Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL,
  10048. LLM_NORM_RMS, il);
  10049. cb(Qcur, "Qcur_normed", il);
  10050. Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL,
  10051. LLM_NORM_RMS, il);
  10052. cb(Kcur, "Kcur_normed", il);
  10053. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  10054. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  10055. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  10056. Qcur = ggml_rope_ext(
  10057. ctx0, Qcur, inp_pos, nullptr,
  10058. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  10059. ext_factor, attn_factor, beta_fast, beta_slow
  10060. );
  10061. Kcur = ggml_rope_ext(
  10062. ctx0, Kcur, inp_pos, nullptr,
  10063. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  10064. ext_factor, attn_factor, beta_fast, beta_slow
  10065. );
  10066. cb(Qcur, "Qcur", il);
  10067. cb(Kcur, "Kcur", il);
  10068. cb(Vcur, "Vcur", il);
  10069. cur = build_attn(inp_attn,
  10070. model.layers[il].wo, NULL,
  10071. Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  10072. }
  10073. if (il == n_layer - 1 && inp_out_ids) {
  10074. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  10075. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  10076. }
  10077. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  10078. cb(ffn_inp, "ffn_inp", il);
  10079. // MoE branch
  10080. cur = build_norm(ffn_inp,
  10081. model.layers[il].ffn_norm, NULL,
  10082. LLM_NORM_RMS, il);
  10083. cb(cur, "ffn_norm", il);
  10084. cur = build_moe_ffn(cur,
  10085. model.layers[il].ffn_gate_inp,
  10086. model.layers[il].ffn_up_exps,
  10087. model.layers[il].ffn_gate_exps,
  10088. model.layers[il].ffn_down_exps,
  10089. nullptr,
  10090. n_expert, n_expert_used,
  10091. LLM_FFN_SILU, false,
  10092. false, 0.0,
  10093. LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
  10094. il);
  10095. cb(cur, "ffn_moe_out", il);
  10096. cur = ggml_add(ctx0, cur, ffn_inp);
  10097. cur = build_cvec(cur, il);
  10098. cb(cur, "l_out", il);
  10099. // input for next layer
  10100. inpL = cur;
  10101. }
  10102. cur = inpL;
  10103. cur = build_norm(cur,
  10104. model.output_norm, NULL,
  10105. LLM_NORM_RMS, -1);
  10106. cb(cur, "result_norm", -1);
  10107. res->t_embd = cur;
  10108. // lm_head
  10109. cur = build_lora_mm(model.output, cur);
  10110. cb(cur, "result_output", -1);
  10111. res->t_logits = cur;
  10112. ggml_build_forward_expand(gf, cur);
  10113. }
  10114. };
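// LLaDA MoE: attention inputs are built without a KV cache (build_attn_inp_no_cache),
// i.e. attention runs over the whole input rather than a causally cached context, as
// used for diffusion-style LMs; Q/K get RMS norm before RoPE and every layer uses a MoE FFN.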
  10115. struct llm_build_llada_moe : public llm_graph_context {
  10116. llm_build_llada_moe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  10117. const int64_t n_embd_head = hparams.n_embd_head_v;
  10118. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  10119. GGML_ASSERT(n_embd_head == hparams.n_rot);
  10120. ggml_tensor * cur;
  10121. ggml_tensor * inpL;
  10122. inpL = build_inp_embd(model.tok_embd);
  10123. // inp_pos - contains the positions
  10124. ggml_tensor * inp_pos = build_inp_pos();
  10125. auto * inp_attn = build_attn_inp_no_cache();
  10126. ggml_tensor * inp_out_ids = build_inp_out_ids();
  10127. for (int il = 0; il < n_layer; ++il) {
  10128. ggml_tensor * inpSA = inpL;
  10129. // norm
  10130. cur = build_norm(inpL,
  10131. model.layers[il].attn_norm, NULL,
  10132. LLM_NORM_RMS, il);
  10133. cb(cur, "attn_norm", il);
  10134. // self_attention
  10135. {
  10136. // compute Q and K and RoPE them
  10137. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  10138. cb(Qcur, "Qcur", il);
  10139. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  10140. cb(Kcur, "Kcur", il);
  10141. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  10142. cb(Vcur, "Vcur", il);
  10143. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  10144. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  10145. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  10146. Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
  10147. cb(Qcur, "Qcur_normed", il);
  10148. Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
  10149. cb(Kcur, "Kcur_normed", il);
  10150. Qcur = ggml_rope_ext(
  10151. ctx0, Qcur, inp_pos, nullptr,
  10152. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  10153. ext_factor, attn_factor, beta_fast, beta_slow
  10154. );
  10155. Kcur = ggml_rope_ext(
  10156. ctx0, Kcur, inp_pos, nullptr,
  10157. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  10158. ext_factor, attn_factor, beta_fast, beta_slow
  10159. );
  10160. cb(Qcur, "Qcur", il);
  10161. cb(Kcur, "Kcur", il);
  10162. cb(Vcur, "Vcur", il);
  10163. cur = build_attn(inp_attn,
  10164. model.layers[il].wo, NULL,
  10165. Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  10166. }
  10167. if (il == n_layer - 1 && inp_out_ids) {
  10168. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  10169. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  10170. }
  10171. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  10172. cb(ffn_inp, "ffn_inp", il);
  10173. // MoE branch
  10174. cur = build_norm(ffn_inp,
  10175. model.layers[il].ffn_norm, NULL,
  10176. LLM_NORM_RMS, il);
  10177. cb(cur, "ffn_norm", il);
  10178. cur = build_moe_ffn(cur,
  10179. model.layers[il].ffn_gate_inp,
  10180. model.layers[il].ffn_up_exps,
  10181. model.layers[il].ffn_gate_exps,
  10182. model.layers[il].ffn_down_exps,
  10183. nullptr,
  10184. n_expert, n_expert_used,
  10185. LLM_FFN_SILU, false,
  10186. false, 0.0,
  10187. LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
  10188. il);
  10189. cb(cur, "ffn_moe_out", il);
  10190. cur = ggml_add(ctx0, cur, ffn_inp);
  10191. cur = build_cvec(cur, il);
  10192. cb(cur, "l_out", il);
  10193. // input for next layer
  10194. inpL = cur;
  10195. }
  10196. cur = inpL;
  10197. cur = build_norm(cur,
  10198. model.output_norm, NULL,
  10199. LLM_NORM_RMS, -1);
  10200. cb(cur, "result_norm", -1);
  10201. res->t_embd = cur;
  10202. // lm_head
  10203. cur = build_lora_mm(model.output, cur);
  10204. cb(cur, "result_output", -1);
  10205. res->t_logits = cur;
  10206. ggml_build_forward_expand(gf, cur);
  10207. }
  10208. };
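// OpenELM: head counts vary per layer (n_head(il), n_head_kv(il)); Q, K and V are
// sliced out of a single fused wqkv projection with views, and Q/K get RMS norm
// before RoPE.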
  10209. struct llm_build_openelm : public llm_graph_context {
  10210. llm_build_openelm(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  10211. const int64_t n_embd_head = hparams.n_embd_head_v;
  10212. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  10213. ggml_tensor * cur;
  10214. ggml_tensor * inpL;
  10215. inpL = build_inp_embd(model.tok_embd);
  10216. // inp_pos - contains the positions
  10217. ggml_tensor * inp_pos = build_inp_pos();
  10218. auto * inp_attn = build_attn_inp_kv();
  10219. ggml_tensor * inp_out_ids = build_inp_out_ids();
  10220. for (int il = 0; il < n_layer; ++il) {
  10221. const int64_t n_head = hparams.n_head(il);
  10222. const int64_t n_head_kv = hparams.n_head_kv(il);
  10223. const int64_t n_head_qkv = 2*n_head_kv + n_head;
  10224. cur = inpL;
  10225. ggml_tensor * residual = cur;
  10226. // norm
  10227. cur = build_norm(inpL,
  10228. model.layers[il].attn_norm, NULL,
  10229. LLM_NORM_RMS, il);
  10230. cb(cur, "attn_norm", il);
  10231. // self-attention
  10232. {
  10233. cur = build_lora_mm(model.layers[il].wqkv, cur);
  10234. cb(cur, "wqkv", il);
  10235. cur = ggml_reshape_3d(ctx0, cur, n_embd_head_k, n_head_qkv, n_tokens);
  10236. ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, cur->nb[1], cur->nb[2], 0);
  10237. cb(Qcur, "Qcur", il);
  10238. ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, cur->nb[1], cur->nb[2], cur->nb[1]*n_head);
  10239. cb(Kcur, "Kcur", il);
  10240. ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, cur->nb[1], cur->nb[2], cur->nb[1]*(n_head+n_head_kv)));
  10241. cb(Vcur, "Vcur", il);
  10242. Qcur = build_norm(Qcur,
  10243. model.layers[il].attn_q_norm, NULL,
  10244. LLM_NORM_RMS, il);
  10245. cb(Qcur, "Qcur", il);
  10246. Kcur = build_norm(Kcur,
  10247. model.layers[il].attn_k_norm, NULL,
  10248. LLM_NORM_RMS, il);
  10249. cb(Kcur, "Kcur", il);
  10250. Qcur = ggml_rope_ext(
  10251. ctx0, Qcur, inp_pos, NULL,
  10252. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  10253. ext_factor, attn_factor, beta_fast, beta_slow
  10254. );
  10255. Kcur = ggml_rope_ext(
  10256. ctx0, Kcur, inp_pos, NULL,
  10257. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  10258. ext_factor, attn_factor, beta_fast, beta_slow
  10259. );
  10260. cb(Qcur, "Qcur", il);
  10261. cb(Kcur, "Kcur", il);
10262. cb(Vcur, "Vcur", il);
  10263. cur = build_attn(inp_attn,
  10264. model.layers[il].wo, NULL,
  10265. Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  10266. }
  10267. if (il == n_layer - 1 && inp_out_ids) {
  10268. residual = ggml_get_rows(ctx0, residual, inp_out_ids);
  10269. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  10270. }
  10271. ggml_tensor * ffn_inp = ggml_add(ctx0, residual, cur);
  10272. cb(ffn_inp, "ffn_inp", il);
  10273. // feed-forward network
  10274. {
  10275. cur = build_norm(ffn_inp,
  10276. model.layers[il].ffn_norm, NULL,
  10277. LLM_NORM_RMS, il);
  10278. cb(cur, "ffn_norm", il);
  10279. cur = build_ffn(cur,
  10280. model.layers[il].ffn_up, NULL, NULL,
  10281. model.layers[il].ffn_gate, NULL, NULL,
  10282. model.layers[il].ffn_down, NULL, NULL,
  10283. NULL,
  10284. LLM_FFN_SILU, LLM_FFN_PAR, il);
  10285. cb(cur, "ffn_out", il);
  10286. }
  10287. cur = ggml_add(ctx0, cur, ffn_inp);
  10288. cur = build_cvec(cur, il);
  10289. cb(cur, "l_out", il);
  10290. inpL = cur;
  10291. }
  10292. cur = inpL;
  10293. // norm
  10294. cur = build_norm(cur,
  10295. model.output_norm, NULL,
  10296. LLM_NORM_RMS, -1);
  10297. cb(cur, "result_norm", -1);
  10298. res->t_embd = cur;
  10299. cur = build_lora_mm(model.output, cur);
  10300. cb(cur, "result_output", -1);
  10301. res->t_logits = cur;
  10302. ggml_build_forward_expand(gf, cur);
  10303. }
  10304. };
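// GPT-NeoX: fused QKV projection with bias; depending on hparams.use_par_res the
// attention and FFN branches are combined in parallel (x + attn(ln1(x)) + ffn(ln2(x)))
// or sequentially (see the two branches below).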
  10305. struct llm_build_gptneox : public llm_graph_context {
  10306. llm_build_gptneox(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  10307. const int64_t n_embd_head = hparams.n_embd_head_v;
  10308. const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
  10309. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  10310. ggml_tensor * cur;
  10311. ggml_tensor * inpL;
  10312. inpL = build_inp_embd(model.tok_embd);
  10313. // inp_pos - contains the positions
  10314. ggml_tensor * inp_pos = build_inp_pos();
  10315. auto * inp_attn = build_attn_inp_kv();
  10316. ggml_tensor * inp_out_ids = build_inp_out_ids();
  10317. for (int il = 0; il < n_layer; ++il) {
  10318. cur = build_norm(inpL,
  10319. model.layers[il].attn_norm,
  10320. model.layers[il].attn_norm_b,
  10321. LLM_NORM, il);
  10322. cb(cur, "attn_norm", il);
  10323. // self-attention
  10324. {
  10325. cur = build_lora_mm(model.layers[il].wqkv, cur);
  10326. cb(cur, "wqkv", il);
  10327. cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
  10328. cb(cur, "bqkv", il);
  10329. ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
  10330. ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
  10331. ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
  10332. Qcur = ggml_rope_ext(
  10333. ctx0, Qcur, inp_pos, nullptr,
  10334. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  10335. ext_factor, attn_factor, beta_fast, beta_slow
  10336. );
  10337. Kcur = ggml_rope_ext(
  10338. ctx0, Kcur, inp_pos, nullptr,
  10339. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  10340. ext_factor, attn_factor, beta_fast, beta_slow
  10341. );
  10342. cb(Qcur, "Qcur", il);
  10343. cb(Kcur, "Kcur", il);
  10344. cb(Vcur, "Vcur", il);
  10345. cur = build_attn(inp_attn,
  10346. model.layers[il].wo, model.layers[il].bo,
  10347. Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  10348. }
  10349. if (il == n_layer - 1 && inp_out_ids) {
  10350. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  10351. inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
  10352. }
  10353. // ffn
  10354. if (hparams.use_par_res) {
  10355. // attention and ffn are computed in parallel
  10356. // x = x + attn(ln1(x)) + ffn(ln2(x))
  10357. ggml_tensor * attn_out = cur;
  10358. cur = build_norm(inpL,
  10359. model.layers[il].ffn_norm,
  10360. model.layers[il].ffn_norm_b,
  10361. LLM_NORM, il);
  10362. cb(cur, "ffn_norm", il);
  10363. cur = build_ffn(cur,
  10364. model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
  10365. NULL, NULL, NULL,
  10366. model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
  10367. NULL,
  10368. LLM_FFN_GELU, LLM_FFN_SEQ, il);
  10369. cb(cur, "ffn_out", il);
  10370. cur = ggml_add(ctx0, cur, inpL);
  10371. cb(cur, "ffn_out", il);
  10372. cur = ggml_add(ctx0, cur, attn_out);
  10373. cur = build_cvec(cur, il);
  10374. cb(cur, "l_out", il);
  10375. // input for next layer
  10376. inpL = cur;
  10377. } else {
  10378. // attention and ffn are computed sequentially
  10379. // x = x + attn(ln1(x))
  10380. // x = x + ffn(ln2(x))
  10381. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
  10382. cb(ffn_inp, "ffn_inp", il);
  10383. cur = build_norm(ffn_inp,
  10384. model.layers[il].ffn_norm,
  10385. model.layers[il].ffn_norm_b,
  10386. LLM_NORM, il);
  10387. cb(cur, "ffn_norm", il);
  10388. cur = build_ffn(cur,
  10389. model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
  10390. NULL, NULL, NULL,
  10391. model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
  10392. NULL,
  10393. LLM_FFN_GELU, LLM_FFN_SEQ, il);
  10394. cb(cur, "ffn_out", il);
  10395. cur = ggml_add(ctx0, cur, ffn_inp);
  10396. cur = build_cvec(cur, il);
  10397. cb(cur, "l_out", il);
  10398. // input for next layer
  10399. inpL = cur;
  10400. }
  10401. }
  10402. cur = build_norm(inpL,
  10403. model.output_norm,
  10404. model.output_norm_b,
  10405. LLM_NORM, -1);
  10406. cb(cur, "result_norm", -1);
  10407. res->t_embd = cur;
  10408. cur = build_lora_mm(model.output, cur);
  10409. cb(cur, "result_output", -1);
  10410. res->t_logits = cur;
  10411. ggml_build_forward_expand(gf, cur);
  10412. }
  10413. };
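// Arctic: each layer has a dense FFN on the attention residual plus a parallel MoE
// branch computed from the pre-attention input (inpSA) through its own norm
// (ffn_norm_exps); the two results are summed into the residual stream.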
  10414. struct llm_build_arctic : public llm_graph_context {
  10415. llm_build_arctic(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  10416. const int64_t n_embd_head = hparams.n_embd_head_v;
  10417. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  10418. GGML_ASSERT(n_embd_head == hparams.n_rot);
  10419. ggml_tensor * cur;
  10420. ggml_tensor * inpL;
  10421. inpL = build_inp_embd(model.tok_embd);
  10422. // inp_pos - contains the positions
  10423. ggml_tensor * inp_pos = build_inp_pos();
  10424. auto * inp_attn = build_attn_inp_kv();
  10425. ggml_tensor * inp_out_ids = build_inp_out_ids();
  10426. for (int il = 0; il < n_layer; ++il) {
  10427. ggml_tensor * inpSA = inpL;
  10428. // norm
  10429. cur = build_norm(inpL,
  10430. model.layers[il].attn_norm, NULL,
  10431. LLM_NORM_RMS, il);
  10432. cb(cur, "attn_norm", il);
  10433. // self-attention
  10434. {
  10435. // compute Q and K and RoPE them
  10436. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  10437. cb(Qcur, "Qcur", il);
  10438. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  10439. cb(Kcur, "Kcur", il);
  10440. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  10441. cb(Vcur, "Vcur", il);
  10442. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  10443. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  10444. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  10445. Qcur = ggml_rope_ext(
  10446. ctx0, Qcur, inp_pos, nullptr,
  10447. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  10448. ext_factor, attn_factor, beta_fast, beta_slow
  10449. );
  10450. Kcur = ggml_rope_ext(
  10451. ctx0, Kcur, inp_pos, nullptr,
  10452. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  10453. ext_factor, attn_factor, beta_fast, beta_slow
  10454. );
  10455. cb(Qcur, "Qcur", il);
  10456. cb(Kcur, "Kcur", il);
  10457. cb(Vcur, "Vcur", il);
  10458. cur = build_attn(inp_attn,
  10459. model.layers[il].wo, NULL,
  10460. Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  10461. }
  10462. if (il == n_layer - 1 && inp_out_ids) {
  10463. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  10464. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  10465. }
  10466. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  10467. cb(ffn_inp, "ffn_inp", il);
  10468. // feed-forward network
  10469. cur = build_norm(ffn_inp,
  10470. model.layers[il].ffn_norm, NULL,
  10471. LLM_NORM_RMS, il);
  10472. cb(cur, "ffn_norm", il);
  10473. cur = build_ffn(cur,
  10474. model.layers[il].ffn_up, NULL, NULL,
  10475. model.layers[il].ffn_gate, NULL, NULL,
  10476. model.layers[il].ffn_down, NULL, NULL,
  10477. NULL,
  10478. LLM_FFN_SILU, LLM_FFN_PAR, il);
  10479. cb(cur, "ffn_out", il);
  10480. ggml_tensor * ffn_out = ggml_add(ctx0, cur, ffn_inp);
  10481. cb(ffn_out, "ffn_out", il);
  10482. // MoE
  10483. cur = build_norm(inpSA,
  10484. model.layers[il].ffn_norm_exps, NULL,
  10485. LLM_NORM_RMS, il);
  10486. cb(cur, "ffn_norm_exps", il);
  10487. cur = build_moe_ffn(cur,
  10488. model.layers[il].ffn_gate_inp,
  10489. model.layers[il].ffn_up_exps,
  10490. model.layers[il].ffn_gate_exps,
  10491. model.layers[il].ffn_down_exps,
  10492. nullptr,
  10493. n_expert, n_expert_used,
  10494. LLM_FFN_SILU, true,
  10495. false, 0.0,
  10496. LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
  10497. il);
  10498. cb(cur, "ffn_moe_out", il);
  10499. cur = ggml_add(ctx0, cur, ffn_out);
  10500. cb(cur, "ffn_out", il);
  10501. cur = build_cvec(cur, il);
  10502. cb(cur, "l_out", il);
  10503. // input for next layer
  10504. inpL = cur;
  10505. }
  10506. cur = inpL;
  10507. cur = build_norm(cur,
  10508. model.output_norm, NULL,
  10509. LLM_NORM_RMS, -1);
  10510. cb(cur, "result_norm", -1);
  10511. res->t_embd = cur;
  10512. // lm_head
  10513. cur = build_lora_mm(model.output, cur);
  10514. cb(cur, "result_output", -1);
  10515. res->t_logits = cur;
  10516. ggml_build_forward_expand(gf, cur);
  10517. }
  10518. };
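// DeepSeek (v1 MoE): the first hparams.n_layer_dense_lead layers use a dense FFN,
// later layers use MoE plus a shared expert; the attention scale can be overridden
// via hparams.f_attention_scale.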
  10519. struct llm_build_deepseek : public llm_graph_context {
  10520. llm_build_deepseek(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  10521. const int64_t n_embd_head = hparams.n_embd_head_v;
  10522. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  10523. GGML_ASSERT(n_embd_head == hparams.n_rot);
  10524. ggml_tensor * cur;
  10525. ggml_tensor * inpL;
  10526. inpL = build_inp_embd(model.tok_embd);
  10527. // inp_pos - contains the positions
  10528. ggml_tensor * inp_pos = build_inp_pos();
  10529. auto * inp_attn = build_attn_inp_kv();
  10530. const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
  10531. ggml_tensor * inp_out_ids = build_inp_out_ids();
  10532. for (int il = 0; il < n_layer; ++il) {
  10533. ggml_tensor * inpSA = inpL;
  10534. // norm
  10535. cur = build_norm(inpL,
  10536. model.layers[il].attn_norm, NULL,
  10537. LLM_NORM_RMS, il);
  10538. cb(cur, "attn_norm", il);
  10539. // self-attention
  10540. {
10541. // rope freq factors; may return nullptr for models that do not use them
  10542. ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
  10543. // compute Q and K and RoPE them
  10544. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  10545. cb(Qcur, "Qcur", il);
  10546. if (model.layers[il].bq) {
  10547. Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
  10548. cb(Qcur, "Qcur", il);
  10549. }
  10550. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  10551. cb(Kcur, "Kcur", il);
  10552. if (model.layers[il].bk) {
  10553. Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
  10554. cb(Kcur, "Kcur", il);
  10555. }
  10556. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  10557. cb(Vcur, "Vcur", il);
  10558. if (model.layers[il].bv) {
  10559. Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
  10560. cb(Vcur, "Vcur", il);
  10561. }
  10562. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  10563. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  10564. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  10565. Qcur = ggml_rope_ext(
  10566. ctx0, Qcur, inp_pos, rope_factors,
  10567. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  10568. ext_factor, attn_factor, beta_fast, beta_slow
  10569. );
  10570. Kcur = ggml_rope_ext(
  10571. ctx0, Kcur, inp_pos, rope_factors,
  10572. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  10573. ext_factor, attn_factor, beta_fast, beta_slow
  10574. );
  10575. cb(Qcur, "Qcur", il);
  10576. cb(Kcur, "Kcur", il);
  10577. cb(Vcur, "Vcur", il);
  10578. cur = build_attn(inp_attn,
  10579. model.layers[il].wo, model.layers[il].bo,
  10580. Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
  10581. }
  10582. if (il == n_layer - 1 && inp_out_ids) {
  10583. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  10584. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  10585. }
  10586. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  10587. cb(ffn_inp, "ffn_inp", il);
  10588. cur = build_norm(ffn_inp,
  10589. model.layers[il].ffn_norm, NULL,
  10590. LLM_NORM_RMS, il);
  10591. cb(cur, "ffn_norm", il);
  10592. if ((uint32_t) il < hparams.n_layer_dense_lead) {
  10593. cur = build_ffn(cur,
  10594. model.layers[il].ffn_up, NULL, NULL,
  10595. model.layers[il].ffn_gate, NULL, NULL,
  10596. model.layers[il].ffn_down, NULL, NULL,
  10597. NULL,
  10598. LLM_FFN_SILU, LLM_FFN_PAR, il);
  10599. cb(cur, "ffn_out", il);
  10600. } else {
  10601. // MoE branch
  10602. ggml_tensor * moe_out =
  10603. build_moe_ffn(cur,
  10604. model.layers[il].ffn_gate_inp,
  10605. model.layers[il].ffn_up_exps,
  10606. model.layers[il].ffn_gate_exps,
  10607. model.layers[il].ffn_down_exps,
  10608. nullptr,
  10609. n_expert, n_expert_used,
  10610. LLM_FFN_SILU, false,
  10611. false, hparams.expert_weights_scale,
  10612. LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
  10613. il);
  10614. cb(moe_out, "ffn_moe_out", il);
  10615. // FFN shared expert
  10616. {
  10617. ggml_tensor * ffn_shexp = build_ffn(cur,
  10618. model.layers[il].ffn_up_shexp, NULL, NULL,
  10619. model.layers[il].ffn_gate_shexp, NULL, NULL,
  10620. model.layers[il].ffn_down_shexp, NULL, NULL,
  10621. NULL,
  10622. LLM_FFN_SILU, LLM_FFN_PAR, il);
  10623. cb(ffn_shexp, "ffn_shexp", il);
  10624. cur = ggml_add(ctx0, moe_out, ffn_shexp);
  10625. cb(cur, "ffn_out", il);
  10626. }
  10627. }
  10628. cur = ggml_add(ctx0, cur, ffn_inp);
  10629. cur = build_cvec(cur, il);
  10630. cb(cur, "l_out", il);
  10631. // input for next layer
  10632. inpL = cur;
  10633. }
  10634. cur = inpL;
  10635. cur = build_norm(cur,
  10636. model.output_norm, NULL,
  10637. LLM_NORM_RMS, -1);
  10638. cb(cur, "result_norm", -1);
  10639. res->t_embd = cur;
  10640. // lm_head
  10641. cur = build_lora_mm(model.output, cur);
  10642. cb(cur, "result_output", -1);
  10643. res->t_logits = cur;
  10644. ggml_build_forward_expand(gf, cur);
  10645. }
  10646. };
  10647. struct llm_build_deepseek2 : public llm_graph_context {
  10648. llm_build_deepseek2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
const bool is_lite = (hparams.n_layer == 27);
  10650. const bool is_mla = (hparams.n_embd_head_k_mla != 0 && hparams.n_embd_head_v_mla != 0);
  10651. // note: these are the actual head sizes you get when treating as MHA or after "decompression" using wv_b for MLA
  10652. const int64_t n_embd_head_k = is_mla ? hparams.n_embd_head_k_mla : hparams.n_embd_head_k;
  10653. const int64_t n_embd_head_v = is_mla ? hparams.n_embd_head_v_mla : hparams.n_embd_head_v;
  10654. const int64_t n_embd_head_qk_rope = hparams.n_rot;
  10655. const int64_t n_embd_head_qk_nope = n_embd_head_k - n_embd_head_qk_rope;
  10656. const uint32_t kv_lora_rank = hparams.n_lora_kv;
  10657. // We have to pre-scale kq_scale and attn_factor to make the YaRN RoPE work correctly.
// See https://github.com/ggerganov/llama.cpp/discussions/7416 for a detailed explanation.
  10659. const float mscale = attn_factor * (1.0f + hparams.rope_yarn_log_mul * logf(1.0f / freq_scale));
  10660. const float kq_scale = 1.0f*mscale*mscale/sqrtf(float(n_embd_head_k));
  10661. const float attn_factor = 1.0f / (1.0f + 0.1f * logf(1.0f / freq_scale));
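// note: the YaRN magnitude correction (mscale) is already folded into kq_scale above, so attn_factor is
// adjusted here to cancel the scaling that ggml_rope_ext would otherwise apply (see the discussion linked above)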
  10662. ggml_tensor * cur;
  10663. ggml_tensor * inpL;
  10664. // {n_embd, n_tokens}
  10665. inpL = build_inp_embd(model.tok_embd);
  10666. // inp_pos - contains the positions
  10667. ggml_tensor * inp_pos = build_inp_pos();
  10668. auto * inp_attn = build_attn_inp_kv();
  10669. ggml_tensor * inp_out_ids = build_inp_out_ids();
  10670. for (int il = 0; il < n_layer; ++il) {
  10671. ggml_tensor * inpSA = inpL;
  10672. // norm
  10673. cur = build_norm(inpL,
  10674. model.layers[il].attn_norm, NULL,
  10675. LLM_NORM_RMS, il);
  10676. cb(cur, "attn_norm", il);
// self-attention
  10678. {
  10679. ggml_tensor * q = NULL;
  10680. if (!is_lite) {
  10681. q = ggml_mul_mat(ctx0, model.layers[il].wq_a, cur);
  10682. cb(q, "q", il);
  10683. q = build_norm(q,
  10684. model.layers[il].attn_q_a_norm, nullptr,
  10685. LLM_NORM_RMS, il);
  10686. cb(q, "q", il);
  10687. q = ggml_mul_mat(ctx0, model.layers[il].wq_b, q);
  10688. cb(q, "q", il);
  10689. } else {
  10690. q = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
  10691. cb(q, "q", il);
  10692. }
  10693. // split into {n_embd_head_qk_nope, n_head, n_tokens}
  10694. ggml_tensor * q_nope = ggml_view_3d(ctx0, q,
  10695. n_embd_head_qk_nope, n_head, n_tokens,
  10696. ggml_row_size(q->type, n_embd_head_k),
  10697. ggml_row_size(q->type, n_embd_head_k) * n_head,
  10698. 0);
  10699. cb(q_nope, "q_nope", il);
  10700. // and {n_embd_head_qk_rope, n_head, n_tokens}
  10701. ggml_tensor * q_pe = ggml_view_3d(ctx0, q,
  10702. n_embd_head_qk_rope, n_head, n_tokens,
  10703. ggml_row_size(q->type, n_embd_head_k),
  10704. ggml_row_size(q->type, n_embd_head_k) * n_head,
  10705. ggml_row_size(q->type, n_embd_head_qk_nope));
  10706. cb(q_pe, "q_pe", il);
  10707. ggml_tensor * kv_cmpr_pe = ggml_mul_mat(ctx0, model.layers[il].wkv_a_mqa, cur);
  10708. cb(kv_cmpr_pe, "kv_cmpr_pe", il);
  10709. // split into {kv_lora_rank, n_tokens}
  10710. ggml_tensor * kv_cmpr = ggml_view_2d(ctx0, kv_cmpr_pe,
  10711. kv_lora_rank, n_tokens,
  10712. ggml_row_size(kv_cmpr_pe->type, kv_lora_rank + n_embd_head_qk_rope),
  10713. 0);
  10714. cb(kv_cmpr, "kv_cmpr", il);
  10715. // and {n_embd_head_qk_rope, 1, n_tokens}
  10716. ggml_tensor * k_pe = ggml_view_3d(ctx0, kv_cmpr_pe,
  10717. n_embd_head_qk_rope, 1, n_tokens,
  10718. ggml_row_size(kv_cmpr_pe->type, kv_lora_rank + n_embd_head_qk_rope),
  10719. ggml_row_size(kv_cmpr_pe->type, kv_lora_rank + n_embd_head_qk_rope),
  10720. ggml_row_size(kv_cmpr_pe->type, kv_lora_rank));
  10721. cb(k_pe, "k_pe", il);
  10722. q_pe = ggml_rope_ext(ctx0, q_pe, inp_pos, nullptr,
  10723. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  10724. ext_factor, attn_factor, beta_fast, beta_slow
  10725. );
  10726. cb(q_pe, "q_pe", il);
  10727. k_pe = ggml_rope_ext(ctx0, k_pe, inp_pos, nullptr,
  10728. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  10729. ext_factor, attn_factor, beta_fast, beta_slow
  10730. );
  10731. cb(k_pe, "k_pe", il);
  10732. kv_cmpr = build_norm(kv_cmpr,
  10733. model.layers[il].attn_kv_a_norm, nullptr,
  10734. LLM_NORM_RMS, il);
  10735. cb(kv_cmpr, "kv_cmpr", il);
  10736. if (is_mla) {
  10737. // {n_embd_head_qk_nope, n_tokens, n_head}
  10738. q_nope = ggml_permute(ctx0, q_nope, 0, 2, 1, 3);
  10739. cb(q_nope, "q_nope_perm", il);
  10740. // {n_embd_head_qk_nope, kv_lora_rank, n_head} x {n_embd_head_qk_nope, n_tokens, n_head}
  10741. ggml_tensor * q_nope_absorbed = ggml_mul_mat(ctx0, model.layers[il].wk_b, q_nope);
  10742. cb(q_nope_absorbed, "q_nope_absorbed", il);
  10743. // {kv_lora_rank, n_head, n_tokens}
  10744. q_nope_absorbed = ggml_permute(ctx0, q_nope_absorbed, 0, 2, 1, 3);
  10745. cb(q_nope_absorbed, "q_nope_absorbed_perm", il);
  10746. // {n_embd_head_qk_rope + kv_lora_rank, n_head, n_tokens}
  10747. // note: rope must go first for in-place context shifting in build_rope_shift()
  10748. ggml_tensor * Qcur = ggml_concat(ctx0, q_pe, q_nope_absorbed, 0);
  10749. cb(Qcur, "Qcur", il);
  10750. kv_cmpr = ggml_reshape_3d(ctx0, kv_cmpr, kv_lora_rank, 1, n_tokens);
  10751. cb(kv_cmpr, "kv_cmpr_reshape", il);
  10752. // {n_embd_head_qk_rope + kv_lora_rank, 1, n_tokens}
  10753. ggml_tensor * Kcur = ggml_concat(ctx0, k_pe, kv_cmpr, 0);
  10754. cb(Kcur, "Kcur", il);
  10755. // {kv_lora_rank, 1, n_tokens}
  10756. ggml_tensor * Vcur = kv_cmpr;
  10757. cb(Vcur, "Vcur", il);
// note: MLA with the absorption optimization converts into MQA (ie: GQA with 1 group)
  10759. cur = build_attn(inp_attn,
  10760. model.layers[il].wo, NULL,
  10761. Qcur, Kcur, Vcur, nullptr, nullptr, model.layers[il].wv_b, kq_scale, il);
  10762. } else {
  10763. ggml_tensor * kv = ggml_mul_mat(ctx0, model.layers[il].wkv_b, kv_cmpr);
  10764. cb(kv, "kv", il);
  10765. // split into {n_embd_head_qk_nope, n_head, n_tokens}
  10766. ggml_tensor * k_nope = ggml_view_3d(ctx0, kv,
  10767. n_embd_head_qk_nope, n_head, n_tokens,
  10768. ggml_row_size(kv->type, n_embd_head_qk_nope + n_embd_head_v),
  10769. ggml_row_size(kv->type, n_embd_head_qk_nope + n_embd_head_v) * n_head,
  10770. 0);
  10771. cb(k_nope, "k_nope_view", il);
  10772. // and {n_embd_head_v, n_head, n_tokens}
  10773. ggml_tensor * Vcur = ggml_view_3d(ctx0, kv,
  10774. n_embd_head_v, n_head, n_tokens,
  10775. ggml_row_size(kv->type, n_embd_head_qk_nope + n_embd_head_v),
  10776. ggml_row_size(kv->type, n_embd_head_qk_nope + n_embd_head_v) * n_head,
  10777. ggml_row_size(kv->type, n_embd_head_qk_nope));
  10778. cb(Vcur, "Vcur_view", il);
  10779. Vcur = ggml_cont(ctx0, Vcur);
  10780. cb(Vcur, "Vcur_cont", il);
  10781. // note: rope must go first for in-place context shifting in build_rope_shift()
  10782. ggml_tensor * Qcur = ggml_concat(ctx0, q_pe, q_nope, 0);
  10783. cb(Qcur, "Qcur", il);
  10784. ggml_tensor * Kcur = ggml_concat(ctx0, ggml_repeat(ctx0, k_pe, q_pe), k_nope, 0);
  10785. cb(Kcur, "Kcur", il);
  10786. // note: MLA without the absorption optimization converts into MHA (ie: GQA with full n_head groups)
  10787. cur = build_attn(inp_attn,
  10788. model.layers[il].wo, NULL,
  10789. Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
  10790. }
  10791. }
  10792. if (il == n_layer - 1 && inp_out_ids) {
  10793. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  10794. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  10795. }
  10796. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  10797. cb(ffn_inp, "ffn_inp", il);
  10798. cur = build_norm(ffn_inp,
  10799. model.layers[il].ffn_norm, NULL,
  10800. LLM_NORM_RMS, il);
  10801. cb(cur, "ffn_norm", il);
  10802. if ((uint32_t) il < hparams.n_layer_dense_lead) {
  10803. cur = build_ffn(cur,
  10804. model.layers[il].ffn_up, NULL, NULL,
  10805. model.layers[il].ffn_gate, NULL, NULL,
  10806. model.layers[il].ffn_down, NULL, NULL,
  10807. NULL,
  10808. LLM_FFN_SILU, LLM_FFN_PAR, il);
  10809. cb(cur, "ffn_out", il);
  10810. } else {
  10811. // MoE branch
  10812. ggml_tensor * moe_out =
  10813. build_moe_ffn(cur,
  10814. model.layers[il].ffn_gate_inp,
  10815. model.layers[il].ffn_up_exps,
  10816. model.layers[il].ffn_gate_exps,
  10817. model.layers[il].ffn_down_exps,
  10818. model.layers[il].ffn_exp_probs_b,
  10819. n_expert, n_expert_used,
  10820. LLM_FFN_SILU, hparams.expert_weights_norm,
  10821. true, hparams.expert_weights_scale,
  10822. (llama_expert_gating_func_type) hparams.expert_gating_func,
  10823. il);
  10824. cb(moe_out, "ffn_moe_out", il);
  10825. // FFN shared expert
  10826. {
  10827. ggml_tensor * ffn_shexp = build_ffn(cur,
  10828. model.layers[il].ffn_up_shexp, NULL, NULL,
  10829. model.layers[il].ffn_gate_shexp, NULL, NULL,
  10830. model.layers[il].ffn_down_shexp, NULL, NULL,
  10831. NULL,
  10832. LLM_FFN_SILU, LLM_FFN_PAR, il);
  10833. cb(ffn_shexp, "ffn_shexp", il);
  10834. cur = ggml_add(ctx0, moe_out, ffn_shexp);
  10835. cb(cur, "ffn_out", il);
  10836. }
  10837. }
  10838. cur = ggml_add(ctx0, cur, ffn_inp);
  10839. cur = build_cvec(cur, il);
  10840. cb(cur, "l_out", il);
  10841. // input for next layer
  10842. inpL = cur;
  10843. }
  10844. cur = inpL;
  10845. cur = build_norm(cur,
  10846. model.output_norm, NULL,
  10847. LLM_NORM_RMS, -1);
  10848. cb(cur, "result_norm", -1);
  10849. res->t_embd = cur;
  10850. // lm_head
  10851. cur = ggml_mul_mat(ctx0, model.output, cur);
  10852. cb(cur, "result_output", -1);
  10853. res->t_logits = cur;
  10854. ggml_build_forward_expand(gf, cur);
  10855. }
  10856. };
  10857. struct llm_build_bitnet : public llm_graph_context {
  10858. llm_build_bitnet(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  10859. const int64_t n_embd_head = hparams.n_embd_head_v;
  10860. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  10861. ggml_tensor * cur;
  10862. ggml_tensor * inpL;
  10863. inpL = build_inp_embd(model.tok_embd);
  10864. // inp_pos - contains the positions
  10865. ggml_tensor * inp_pos = build_inp_pos();
  10866. auto * inp_attn = build_attn_inp_kv();
  10867. ggml_tensor * inp_out_ids = build_inp_out_ids();
  10868. for (int il = 0; il < n_layer; ++il) {
  10869. ggml_tensor * inpSA = inpL;
  10870. cur = build_norm(inpL,
  10871. model.layers[il].attn_norm, NULL,
  10872. LLM_NORM_RMS, il);
  10873. cb(cur, "attn_norm", il);
  10874. // self-attention
  10875. {
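// per-tensor weight scales (wq_scale, wk_scale, wv_scale), when present, are applied right after each projection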
  10876. // compute Q and K and RoPE them
  10877. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  10878. if (model.layers[il].wq_scale) {
  10879. Qcur = ggml_mul(ctx0, Qcur, model.layers[il].wq_scale);
  10880. }
  10881. cb(Qcur, "Qcur", il);
  10882. if (model.layers[il].bq) {
  10883. Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
  10884. cb(Qcur, "Qcur", il);
  10885. }
  10886. // B1.K
  10887. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  10888. if (model.layers[il].wk_scale) {
  10889. Kcur = ggml_mul(ctx0, Kcur, model.layers[il].wk_scale);
  10890. }
  10891. cb(Kcur, "Kcur", il);
  10892. if (model.layers[il].bk) {
  10893. Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
  10894. cb(Kcur, "Kcur", il);
  10895. }
  10896. // B1.V
  10897. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  10898. if (model.layers[il].wv_scale) {
  10899. Vcur = ggml_mul(ctx0, Vcur, model.layers[il].wv_scale);
  10900. }
  10901. cb(Vcur, "Vcur", il);
  10902. if (model.layers[il].bv) {
  10903. Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
  10904. cb(Vcur, "Vcur", il);
  10905. }
  10906. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  10907. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  10908. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  10909. Qcur = ggml_rope_ext(
  10910. ctx0, Qcur, inp_pos, nullptr,
  10911. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  10912. ext_factor, attn_factor, beta_fast, beta_slow
  10913. );
  10914. Kcur = ggml_rope_ext(
  10915. ctx0, Kcur, inp_pos, nullptr,
  10916. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  10917. ext_factor, attn_factor, beta_fast, beta_slow
  10918. );
  10919. cb(Qcur, "Qcur", il);
  10920. cb(Kcur, "Kcur", il);
  10921. cb(Vcur, "Vcur", il);
  10922. cur = build_attn(inp_attn,
  10923. NULL, NULL,
  10924. Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
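// BitNet applies an extra RMS sub-norm between the attention output and the output projection
// (wo is applied manually below, hence the NULL passed to build_attn above)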
  10925. cur = build_norm(cur,
  10926. model.layers[il].attn_sub_norm, NULL,
  10927. LLM_NORM_RMS, il);
  10928. cb(cur, "attn_sub_norm", il);
  10929. cur = build_lora_mm(model.layers[il].wo, cur);
  10930. if (model.layers[il].wo_scale) {
  10931. cur = ggml_mul(ctx0, cur, model.layers[il].wo_scale);
  10932. }
  10933. if (model.layers[il].bo) {
  10934. cur = ggml_add(ctx0, cur, model.layers[il].bo);
  10935. }
  10936. cb(cur, "attn_o_out", il);
  10937. }
  10938. if (il == n_layer - 1 && inp_out_ids) {
  10939. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  10940. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  10941. }
  10942. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  10943. cb(ffn_inp, "ffn_inp", il);
// feed-forward network
  10945. cur = build_norm(ffn_inp,
  10946. model.layers[il].ffn_norm, NULL,
  10947. LLM_NORM_RMS, il);
  10948. cb(cur, "ffn_norm", il);
  10949. cur = build_ffn(cur,
  10950. model.layers[il].ffn_up, NULL, model.layers[il].ffn_up_scale,
  10951. model.layers[il].ffn_gate, NULL, model.layers[il].ffn_gate_scale,
  10952. NULL, NULL, NULL,
  10953. NULL,
  10954. LLM_FFN_SILU, LLM_FFN_PAR, il);
  10955. cb(cur, "ffn_sub_out", il);
  10956. cur = build_norm(cur,
  10957. model.layers[il].ffn_sub_norm, NULL,
  10958. LLM_NORM_RMS, il);
  10959. cb(cur, "ffn_sub_norm", il);
  10960. cur = build_lora_mm(model.layers[il].ffn_down, cur);
  10961. if (model.layers[il].ffn_down_scale) {
  10962. cur = ggml_mul(ctx0, cur, model.layers[il].ffn_down_scale);
  10963. }
  10964. cb(cur, "ffn_down", il);
  10965. cur = ggml_add(ctx0, cur, ffn_inp);
  10966. cb(cur, "l_out", il);
  10967. // input for next layer
  10968. inpL = cur;
  10969. }
  10970. cur = inpL;
  10971. cur = build_norm(cur,
  10972. model.output_norm, NULL,
  10973. LLM_NORM_RMS, -1);
  10974. cb(cur, "result_norm", -1);
  10975. res->t_embd = cur;
  10976. // lm_head
  10977. // FIXME: do not use model.tok_embd directly, duplicate as model.output
  10978. cur = build_lora_mm(model.tok_embd, cur);
  10979. cb(cur, "result_output", -1);
  10980. res->t_logits = cur;
  10981. ggml_build_forward_expand(gf, cur);
  10982. }
  10983. };
  10984. struct llm_build_t5_enc : public llm_graph_context {
  10985. llm_build_t5_enc(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  10986. const int64_t n_embd_head = hparams.n_embd_head_v;
  10987. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  10988. ggml_tensor * cur;
  10989. ggml_tensor * inpL;
  10990. inpL = build_inp_embd(model.tok_embd);
  10991. ggml_tensor * pos_bucket_enc = build_inp_pos_bucket_enc();
  10992. auto * inp_attn = build_attn_inp_no_cache();
  10993. ggml_tensor * inp_out_ids = build_inp_out_ids();
  10994. for (int il = 0; il < n_layer; ++il) {
  10995. ggml_tensor * inpSA = inpL;
  10996. // norm
  10997. cur = build_norm(inpL,
  10998. model.layers[il].attn_norm_enc, NULL,
  10999. LLM_NORM_RMS, il);
  11000. cb(cur, "attn_norm", il);
  11001. // self-attention
  11002. {
  11003. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq_enc, cur);
  11004. cb(Qcur, "Qcur", il);
  11005. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk_enc, cur);
  11006. cb(Kcur, "Kcur", il);
  11007. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv_enc, cur);
  11008. cb(Vcur, "Vcur", il);
  11009. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  11010. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  11011. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
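// relative position bias: layers without their own attn_rel_b tensor reuse layer 0's,
// and the resulting bias is added to the KQ scores via kq_b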
  11012. ggml_tensor * attn_rel_b = model.layers[il].attn_rel_b_enc ? model.layers[il].attn_rel_b_enc : model.layers[0].attn_rel_b_enc;
  11013. ggml_tensor * kq_b = build_pos_bias(pos_bucket_enc, attn_rel_b);
  11014. cur = build_attn(inp_attn,
  11015. model.layers[il].wo_enc, nullptr,
  11016. Qcur, Kcur, Vcur, kq_b, nullptr, nullptr, 1.0f, il);
  11017. cb(cur, "kqv_out", il);
  11018. }
  11019. if (il == n_layer - 1 && inp_out_ids) {
  11020. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  11021. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  11022. }
  11023. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  11024. cb(ffn_inp, "ffn_inp", il);
  11025. // feed-forward network
  11026. {
  11027. cur = build_norm(ffn_inp,
  11028. model.layers[il].ffn_norm_enc, NULL,
  11029. LLM_NORM_RMS, il);
  11030. cb(cur, "ffn_norm", il);
// T5 uses ReLU, flan-T5 uses a gated GELU
  11032. cur = build_ffn(cur,
  11033. model.layers[il].ffn_up_enc, NULL, NULL,
  11034. model.layers[il].ffn_gate_enc, NULL, NULL,
  11035. model.layers[il].ffn_down_enc, NULL, NULL,
  11036. NULL,
  11037. model.layers[il].ffn_gate_enc ? LLM_FFN_GELU : LLM_FFN_RELU,
  11038. model.layers[il].ffn_gate_enc ? LLM_FFN_PAR : LLM_FFN_SEQ,
  11039. il);
  11040. cb(cur, "ffn_out", il);
  11041. }
  11042. cur = ggml_add(ctx0, cur, ffn_inp);
  11043. cb(cur, "ffn_out", il);
  11044. cur = build_cvec(cur, il);
  11045. cb(cur, "l_out", il);
  11046. // input for next layer
  11047. inpL = cur;
  11048. }
  11049. cur = inpL;
  11050. cb(cur, "result_embd", -1);
  11051. cur = build_norm(cur,
  11052. model.output_norm_enc, NULL,
  11053. LLM_NORM_RMS, -1);
  11054. cb(cur, "result_norm", -1);
  11055. res->t_embd = cur;
  11056. ggml_build_forward_expand(gf, cur);
  11057. }
  11058. };
  11059. struct llm_build_t5_dec : public llm_graph_context {
  11060. llm_build_t5_dec(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  11061. const int64_t n_embd_head = hparams.n_embd_head_v;
  11062. //const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
  11063. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  11064. ggml_tensor * cur;
  11065. ggml_tensor * inpL;
  11066. inpL = build_inp_embd(model.tok_embd);
  11067. ggml_tensor * embd_enc = build_inp_cross_embd();
  11068. ggml_tensor * pos_bucket_dec = build_inp_pos_bucket_dec();
  11069. const int64_t n_outputs_enc = embd_enc->ne[1];
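// the encoder output length becomes the K/V length of the cross-attention below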
  11070. auto * inp_attn_self = build_attn_inp_kv();
  11071. auto * inp_attn_cross = build_attn_inp_cross();
  11072. ggml_tensor * inp_out_ids = build_inp_out_ids();
  11073. const int64_t dec_n_layer = hparams.dec_n_layer;
  11074. for (int il = 0; il < dec_n_layer; ++il) {
  11075. ggml_tensor * inpSA = inpL;
  11076. // norm
  11077. cur = build_norm(inpL,
  11078. model.layers[il].attn_norm, NULL,
  11079. LLM_NORM_RMS, il);
  11080. cb(cur, "attn_norm", il);
  11081. // self-attention
  11082. {
  11083. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  11084. cb(Qcur, "Qcur", il);
  11085. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  11086. cb(Kcur, "Kcur", il);
  11087. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  11088. cb(Vcur, "Vcur", il);
  11089. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  11090. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  11091. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  11092. ggml_tensor * attn_rel_b = model.layers[il].attn_rel_b ? model.layers[il].attn_rel_b : model.layers[0].attn_rel_b;
  11093. ggml_tensor * kq_b = build_pos_bias(pos_bucket_dec, attn_rel_b);
  11094. cur = build_attn(inp_attn_self,
  11095. model.layers[il].wo, model.layers[il].bo,
  11096. Qcur, Kcur, Vcur, kq_b, nullptr, nullptr, 1.0f, il);
  11097. cb(cur, "kqv_out", il);
  11098. }
  11099. cur = ggml_add(ctx0, cur, inpSA);
  11100. cb(cur, "cross_inp", il);
  11101. ggml_tensor * inpCA = cur;
  11102. // norm
  11103. cur = build_norm(cur,
  11104. model.layers[il].attn_norm_cross, NULL,
  11105. LLM_NORM_RMS, il);
  11106. cb(cur, "attn_norm_cross", il);
  11107. // cross-attention
  11108. {
  11109. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq_cross, cur);
  11110. cb(Qcur, "Qcur", il);
  11111. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk_cross, embd_enc);
  11112. cb(Kcur, "Kcur", il);
  11113. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv_cross, embd_enc);
  11114. cb(Vcur, "Vcur", il);
  11115. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  11116. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_outputs_enc);
  11117. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_outputs_enc);
  11118. cur = build_attn(inp_attn_cross,
  11119. model.layers[il].wo_cross, nullptr,
  11120. Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f, il);
  11121. cb(cur, "kqv_out", il);
  11122. //ggml_tensor * q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
  11123. //ggml_tensor * k = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 0, 2, 1, 3));
  11124. //ggml_tensor * kq = ggml_mul_mat(ctx0, k, q);
  11125. //cb(kq, "kq", il);
  11126. //kq = ggml_soft_max_ext(ctx0, kq, KQ_mask_cross, 1.0f, hparams.f_max_alibi_bias);
  11127. //cb(kq, "kq_soft_max_ext", il);
  11128. //ggml_tensor * v = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd_gqa, n_outputs_enc)));
  11129. //cb(v, "v", il);
  11130. //ggml_tensor * kqv = ggml_mul_mat(ctx0, ggml_reshape_3d(ctx0, v, n_outputs_enc, n_embd_head, n_head_kv), kq);
  11131. //cb(kqv, "kqv", il);
  11132. //ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3);
  11133. //cb(kqv_merged, "kqv_merged", il);
  11134. //cur = ggml_cont_2d(ctx0, kqv_merged, n_embd_gqa, n_tokens);
  11135. //cb(cur, "kqv_merged_cont", il);
  11136. //ggml_build_forward_expand(gf, cur);
  11137. //cur = build_lora_mm(model.layers[il].wo_cross, cur);
  11138. //cb(cur, "kqv_out", il);
  11139. }
  11140. if (il == dec_n_layer - 1 && inp_out_ids) {
  11141. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  11142. inpCA = ggml_get_rows(ctx0, inpCA, inp_out_ids);
  11143. }
  11144. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpCA);
  11145. cb(ffn_inp, "ffn_inp", il);
  11146. // feed-forward network
  11147. {
  11148. cur = build_norm(ffn_inp,
  11149. model.layers[il].ffn_norm, NULL,
  11150. LLM_NORM_RMS, il);
  11151. cb(cur, "ffn_norm", il);
// T5 uses ReLU, flan-T5 uses a gated GELU
  11153. cur = build_ffn(cur,
  11154. model.layers[il].ffn_up, NULL, NULL,
  11155. model.layers[il].ffn_gate, NULL, NULL,
  11156. model.layers[il].ffn_down, NULL, NULL,
  11157. NULL,
  11158. model.layers[il].ffn_gate ? LLM_FFN_GELU : LLM_FFN_RELU,
  11159. model.layers[il].ffn_gate ? LLM_FFN_PAR : LLM_FFN_SEQ,
  11160. il);
  11161. cb(cur, "ffn_out", il);
  11162. }
  11163. cur = ggml_add(ctx0, cur, ffn_inp);
  11164. cb(cur, "ffn_out", il);
  11165. cur = build_cvec(cur, il);
  11166. cb(cur, "l_out", il);
  11167. // input for next layer
  11168. inpL = cur;
  11169. }
  11170. cur = inpL;
  11171. cb(cur, "result_embd", -1);
  11172. cur = build_norm(cur,
  11173. model.output_norm, NULL,
  11174. LLM_NORM_RMS, -1);
  11175. cb(cur, "result_norm", -1);
  11176. res->t_embd = cur;
  11177. // lm_head
  11178. cur = build_lora_mm(model.output, cur);
  11179. cb(cur, "result_output", -1);
  11180. res->t_logits = cur;
  11181. ggml_build_forward_expand(gf, cur);
  11182. }
  11183. };
  11184. struct llm_build_jais : public llm_graph_context {
  11185. llm_build_jais(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  11186. const int64_t n_embd_head = hparams.n_embd_head_v;
  11187. const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
  11188. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  11189. ggml_tensor * cur;
  11190. ggml_tensor * inpL;
  11191. inpL = build_inp_embd(model.tok_embd);
  11192. auto * inp_attn = build_attn_inp_kv();
  11193. ggml_tensor * inp_out_ids = build_inp_out_ids();
  11194. for (int il = 0; il < n_layer; ++il) {
  11195. cur = build_norm(inpL,
  11196. model.layers[il].attn_norm,
  11197. model.layers[il].attn_norm_b,
  11198. LLM_NORM, il);
  11199. cb(cur, "attn_norm", il);
  11200. // self-attention
  11201. {
  11202. cur = build_lora_mm(model.layers[il].wqkv, cur);
  11203. cb(cur, "wqkv", il);
  11204. cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
  11205. cb(cur, "bqkv", il);
  11206. ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*cur->nb[0]*(n_embd));
  11207. ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*cur->nb[0]*(n_embd));
  11208. ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*cur->nb[0]*(n_embd + n_embd_gqa));
  11209. cb(Qcur, "Qcur", il);
  11210. cb(Kcur, "Kcur", il);
  11211. cb(Vcur, "Vcur", il);
  11212. cur = build_attn(inp_attn,
  11213. model.layers[il].wo, model.layers[il].bo,
  11214. Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/float(n_embd_head), il);
  11215. }
  11216. if (il == n_layer - 1 && inp_out_ids) {
  11217. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  11218. inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
  11219. }
  11220. // add the input
  11221. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
  11222. cb(ffn_inp, "ffn_inp", il);
  11223. // FF
  11224. {
  11225. cur = build_norm(ffn_inp,
  11226. model.layers[il].ffn_norm,
  11227. model.layers[il].ffn_norm_b,
  11228. LLM_NORM, il);
  11229. cb(cur, "ffn_norm", il);
  11230. cur = build_ffn(cur,
  11231. model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
  11232. model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
  11233. model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
  11234. NULL,
  11235. LLM_FFN_SILU, LLM_FFN_PAR, il);
  11236. cb(cur, "ffn_out", il);
  11237. }
  11238. inpL = ggml_add(ctx0, cur, ffn_inp);
  11239. cb(inpL, "l_out", il);
  11240. }
  11241. cur = build_norm(inpL,
  11242. model.output_norm,
  11243. model.output_norm_b,
  11244. LLM_NORM, -1);
  11245. cb(cur, "result_norm", -1);
  11246. res->t_embd = cur;
  11247. cur = build_lora_mm(model.output, cur);
  11248. cb(cur, "result_output", -1);
  11249. res->t_logits = cur;
  11250. ggml_build_forward_expand(gf, cur);
  11251. }
  11252. };
  11253. struct llm_build_chatglm : public llm_graph_context {
  11254. llm_build_chatglm(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  11255. const int64_t n_embd_head = hparams.n_embd_head_v;
  11256. const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
  11257. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  11258. ggml_tensor * cur;
  11259. ggml_tensor * inpL;
  11260. inpL = build_inp_embd(model.tok_embd);
  11261. // inp_pos - contains the positions
  11262. ggml_tensor * inp_pos = build_inp_pos();
  11263. auto * inp_attn = build_attn_inp_kv();
  11264. ggml_tensor * inp_out_ids = build_inp_out_ids();
  11265. for (int il = 0; il < n_layer; ++il) {
  11266. ggml_tensor * inpSA = inpL;
  11267. cur = build_norm(inpL,
  11268. model.layers[il].attn_norm,
  11269. NULL,
  11270. LLM_NORM_RMS, il);
  11271. cb(cur, "attn_norm", il);
  11272. // self-attention
  11273. {
  11274. ggml_tensor * Qcur = nullptr;
  11275. ggml_tensor * Kcur = nullptr;
  11276. ggml_tensor * Vcur = nullptr;
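// Q/K/V come either from separate wq/wk/wv projections or from a single fused wqkv tensor,
// depending on which the model provides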
  11277. if (model.layers[il].wqkv == nullptr) {
  11278. Qcur = build_lora_mm(model.layers[il].wq, cur);
  11279. if (model.layers[il].bq) {
  11280. Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
  11281. }
  11282. Kcur = build_lora_mm(model.layers[il].wk, cur);
  11283. if (model.layers[il].bk) {
  11284. Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
  11285. }
  11286. Vcur = build_lora_mm(model.layers[il].wv, cur);
  11287. if (model.layers[il].bv) {
  11288. Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
  11289. }
  11290. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  11291. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  11292. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  11293. } else {
  11294. cur = build_lora_mm(model.layers[il].wqkv, cur);
  11295. cb(cur, "wqkv", il);
  11296. if (model.layers[il].bqkv) {
  11297. cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
  11298. cb(cur, "bqkv", il);
  11299. }
  11300. Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
  11301. Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
  11302. Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
  11303. }
  11304. //printf("freq_base: %f freq_scale: %f ext_factor: %f attn_factor: %f\n", freq_base, freq_scale, ext_factor, attn_factor);
  11305. Qcur = ggml_rope_ext(
  11306. ctx0, Qcur, inp_pos, nullptr,
  11307. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  11308. ext_factor, attn_factor, beta_fast, beta_slow
  11309. );
  11310. Kcur = ggml_rope_ext(
  11311. ctx0, Kcur, inp_pos, nullptr,
  11312. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  11313. ext_factor, attn_factor, beta_fast, beta_slow
  11314. );
  11315. cb(Qcur, "Qcur", il);
  11316. cb(Kcur, "Kcur", il);
  11317. cb(Vcur, "Vcur", il);
  11318. cur = build_attn(inp_attn,
  11319. model.layers[il].wo, NULL,
  11320. Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  11321. }
  11322. if (il == n_layer - 1 && inp_out_ids) {
  11323. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  11324. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  11325. }
  11326. // Add the input
  11327. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  11328. cb(ffn_inp, "ffn_inp", il);
  11329. // FF
  11330. {
  11331. cur = build_norm(ffn_inp,
  11332. model.layers[il].ffn_norm,
  11333. NULL,
  11334. LLM_NORM_RMS, il);
  11335. cb(cur, "ffn_norm", il);
  11336. cur = build_ffn(cur,
  11337. model.layers[il].ffn_up, NULL, NULL,
  11338. NULL, NULL, NULL,
  11339. model.layers[il].ffn_down, NULL, NULL,
  11340. NULL,
  11341. LLM_FFN_SWIGLU, LLM_FFN_SEQ, il);
  11342. cb(cur, "ffn_out", il);
  11343. }
  11344. inpL = ggml_add(ctx0, cur, ffn_inp);
  11345. cb(inpL, "l_out", il);
  11346. }
  11347. cur = build_norm(inpL,
  11348. model.output_norm,
  11349. NULL,
  11350. LLM_NORM_RMS, -1);
  11351. cb(cur, "result_norm", -1);
  11352. res->t_embd = cur;
  11353. cur = build_lora_mm(model.output, cur);
  11354. cb(cur, "result_output", -1);
  11355. res->t_logits = cur;
  11356. ggml_build_forward_expand(gf, cur);
  11357. }
  11358. };
  11359. struct llm_build_glm4 : public llm_graph_context {
  11360. llm_build_glm4(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  11361. const int64_t n_embd_head = hparams.n_embd_head_v;
  11362. const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
  11363. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  11364. ggml_tensor * cur;
  11365. ggml_tensor * inpL;
  11366. inpL = build_inp_embd(model.tok_embd);
  11367. // inp_pos - contains the positions
  11368. ggml_tensor * inp_pos = build_inp_pos();
  11369. auto * inp_attn = build_attn_inp_kv();
  11370. ggml_tensor * inp_out_ids = build_inp_out_ids();
  11371. for (int il = 0; il < n_layer; ++il) {
  11372. ggml_tensor * inpSA = inpL;
  11373. // Pre-attention norm
  11374. cur = build_norm(inpL,
  11375. model.layers[il].attn_norm,
  11376. NULL,
  11377. LLM_NORM_RMS, il);
  11378. cb(cur, "attn_norm", il);
  11379. // self-attention
  11380. {
  11381. ggml_tensor * Qcur = nullptr;
  11382. ggml_tensor * Kcur = nullptr;
  11383. ggml_tensor * Vcur = nullptr;
  11384. if (model.layers[il].wqkv == nullptr) {
  11385. Qcur = build_lora_mm(model.layers[il].wq, cur);
  11386. if (model.layers[il].bq) {
  11387. Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
  11388. }
  11389. Kcur = build_lora_mm(model.layers[il].wk, cur);
  11390. if (model.layers[il].bk) {
  11391. Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
  11392. }
  11393. Vcur = build_lora_mm(model.layers[il].wv, cur);
  11394. if (model.layers[il].bv) {
  11395. Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
  11396. }
  11397. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  11398. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  11399. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  11400. } else {
  11401. cur = build_lora_mm(model.layers[il].wqkv, cur);
  11402. cb(cur, "wqkv", il);
  11403. if (model.layers[il].bqkv) {
  11404. cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
  11405. cb(cur, "bqkv", il);
  11406. }
  11407. Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
  11408. Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
  11409. Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
  11410. }
  11411. Qcur = ggml_rope_ext(
  11412. ctx0, Qcur, inp_pos, nullptr,
  11413. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  11414. ext_factor, attn_factor, beta_fast, beta_slow
  11415. );
  11416. Kcur = ggml_rope_ext(
  11417. ctx0, Kcur, inp_pos, nullptr,
  11418. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  11419. ext_factor, attn_factor, beta_fast, beta_slow
  11420. );
  11421. cb(Qcur, "Qcur", il);
  11422. cb(Kcur, "Kcur", il);
  11423. cb(Vcur, "Vcur", il);
  11424. cur = build_attn(inp_attn,
  11425. model.layers[il].wo, NULL,
  11426. Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  11427. }
  11428. if (il == n_layer - 1 && inp_out_ids) {
  11429. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  11430. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  11431. }
  11432. // Post-attention norm (new!)
  11433. cur = build_norm(cur,
  11434. model.layers[il].attn_post_norm,
  11435. NULL,
  11436. LLM_NORM_RMS, il);
  11437. cb(cur, "post_attn_norm", il);
  11438. // Add the input (residual connection after post-attention norm)
  11439. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  11440. cb(ffn_inp, "ffn_inp", il);
  11441. // FF
  11442. {
  11443. // Pre-MLP norm
  11444. cur = build_norm(ffn_inp,
  11445. model.layers[il].ffn_norm,
  11446. NULL,
  11447. LLM_NORM_RMS, il);
  11448. cb(cur, "ffn_norm", il);
  11449. // MLP
  11450. cur = build_ffn(cur,
  11451. model.layers[il].ffn_up, NULL, NULL,
  11452. NULL, NULL, NULL,
  11453. model.layers[il].ffn_down, NULL, NULL,
  11454. NULL,
  11455. LLM_FFN_SWIGLU, LLM_FFN_SEQ, il);
  11456. cb(cur, "ffn_out", il);
  11457. // Post-MLP norm
  11458. cur = build_norm(cur,
  11459. model.layers[il].ffn_post_norm,
  11460. NULL,
  11461. LLM_NORM_RMS, il);
  11462. cb(cur, "post_mlp_norm", il);
  11463. }
  11464. // Add residual connection after post-MLP norm
  11465. inpL = ggml_add(ctx0, cur, ffn_inp);
  11466. cb(inpL, "l_out", il);
  11467. }
  11468. // Final norm
  11469. cur = build_norm(inpL,
  11470. model.output_norm,
  11471. NULL,
  11472. LLM_NORM_RMS, -1);
  11473. cb(cur, "result_norm", -1);
  11474. res->t_embd = cur;
  11475. // Output projection
  11476. cur = build_lora_mm(model.output, cur);
  11477. cb(cur, "result_output", -1);
  11478. res->t_logits = cur;
  11479. ggml_build_forward_expand(gf, cur);
  11480. }
  11481. };
  11482. struct llm_build_glm4_moe : public llm_graph_context {
  11483. llm_build_glm4_moe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  11484. const int64_t n_embd_head = hparams.n_embd_head_v;
  11485. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  11486. ggml_tensor * cur;
  11487. ggml_tensor * inpL;
  11488. inpL = build_inp_embd(model.tok_embd);
  11489. // inp_pos - contains the positions
  11490. ggml_tensor * inp_pos = build_inp_pos();
  11491. auto * inp_attn = build_attn_inp_kv();
  11492. ggml_tensor * inp_out_ids = build_inp_out_ids();
  11493. // Only process up to last layer (skip final NextN layer)
  11494. // Final layer tensors are loaded but not processed in forward pass
  11495. const int n_transformer_layers = n_layer - hparams.nextn_predict_layers;
  11496. for (int il = 0; il < n_transformer_layers; ++il) {
  11497. ggml_tensor * inpSA = inpL;
  11498. // Pre-attention norm
  11499. cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
  11500. cb(cur, "attn_norm", il);
  11501. // self-attention
  11502. {
  11503. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  11504. if (model.layers[il].bq) {
  11505. Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
  11506. }
  11507. cb(Qcur, "Qcur", il);
  11508. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  11509. if (model.layers[il].bk) {
  11510. Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
  11511. }
  11512. cb(Kcur, "Kcur", il);
  11513. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  11514. if (model.layers[il].bv) {
  11515. Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
  11516. }
  11517. cb(Vcur, "Vcur", il);
  11518. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  11519. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  11520. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  11521. // Apply Q/K norm if available (GLM-4.5 355B variant)
  11522. if (model.layers[il].attn_q_norm) {
  11523. Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
  11524. cb(Qcur, "Qcur_normed", il);
  11525. }
  11526. if (model.layers[il].attn_k_norm) {
  11527. Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
  11528. cb(Kcur, "Kcur_normed", il);
  11529. }
  11530. Qcur = ggml_rope_ext(
  11531. ctx0, Qcur, inp_pos, nullptr,
  11532. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  11533. ext_factor, attn_factor, beta_fast, beta_slow
  11534. );
  11535. Kcur = ggml_rope_ext(
  11536. ctx0, Kcur, inp_pos, nullptr,
  11537. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  11538. ext_factor, attn_factor, beta_fast, beta_slow
  11539. );
  11540. cb(Qcur, "Qcur", il);
  11541. cb(Kcur, "Kcur", il);
  11542. cb(Vcur, "Vcur", il);
  11543. cur = build_attn(inp_attn,
  11544. model.layers[il].wo, NULL,
  11545. Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  11546. }
  11547. if (il == n_transformer_layers - 1 && inp_out_ids) {
  11548. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  11549. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  11550. }
  11551. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  11552. cb(ffn_inp, "ffn_inp", il);
  11553. // Post-attention norm
  11554. cur = build_norm(ffn_inp, model.layers[il].attn_post_norm, NULL, LLM_NORM_RMS, il);
  11555. cb(cur, "post_attn_norm", il);
  11556. // Check if this is a dense layer (n_layer_dense_lead=1, so layer 0 is dense)
  11557. if (static_cast<uint32_t>(il) < hparams.n_layer_dense_lead) {
  11558. // Dense FFN layer
  11559. cur = build_ffn(cur,
  11560. model.layers[il].ffn_up, NULL, NULL,
  11561. model.layers[il].ffn_gate, NULL, NULL,
  11562. model.layers[il].ffn_down, NULL, NULL,
  11563. NULL,
  11564. LLM_FFN_SILU, LLM_FFN_PAR, il);
  11565. cb(cur, "ffn_out", il);
  11566. } else {
  11567. // Process routed experts using existing MoE infrastructure
  11568. ggml_tensor * routed_out = build_moe_ffn(cur,
  11569. model.layers[il].ffn_gate_inp,
  11570. model.layers[il].ffn_up_exps,
  11571. model.layers[il].ffn_gate_exps,
  11572. model.layers[il].ffn_down_exps,
  11573. model.layers[il].ffn_exp_probs_b,
  11574. n_expert, n_expert_used,
  11575. LLM_FFN_SILU, hparams.expert_weights_norm,
  11576. true, hparams.expert_weights_scale,
  11577. (llama_expert_gating_func_type) hparams.expert_gating_func,
  11578. il);
  11579. cb(routed_out, "ffn_moe_out", il);
  11580. // Process shared expert on original input
  11581. ggml_tensor * shared_out = build_ffn(cur,
  11582. model.layers[il].ffn_up_shexp, NULL, NULL,
  11583. model.layers[il].ffn_gate_shexp, NULL, NULL,
  11584. model.layers[il].ffn_down_shexp, NULL, NULL,
  11585. NULL,
  11586. LLM_FFN_SILU, LLM_FFN_PAR, il);
  11587. cb(shared_out, "ffn_shexp_out", il);
  11588. // Final output: routed_output + shared_output
  11589. cur = ggml_add(ctx0, routed_out, shared_out);
  11590. cb(cur, "ffn_out", il);
  11591. }
  11592. cur = ggml_add(ctx0, cur, ffn_inp);
  11593. cur = build_cvec(cur, il);
  11594. cb(cur, "l_out", il);
  11595. // input for next layer
  11596. inpL = cur;
  11597. }
  11598. cur = inpL;
  11599. cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
  11600. cb(cur, "result_norm", -1);
  11601. res->t_embd = cur;
  11602. // lm_head
  11603. cur = build_lora_mm(model.output, cur);
  11604. cb(cur, "result_output", -1);
  11605. res->t_logits = cur;
  11606. ggml_build_forward_expand(gf, cur);
  11607. }
  11608. };
  11609. struct llm_build_nemotron : public llm_graph_context {
  11610. llm_build_nemotron(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  11611. const int64_t n_embd_head = hparams.n_embd_head_v;
  11612. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  11613. //GGML_ASSERT(n_embd_head == hparams.n_rot);
  11614. ggml_tensor * cur;
  11615. ggml_tensor * inpL;
  11616. inpL = build_inp_embd(model.tok_embd);
  11617. // inp_pos - contains the positions
  11618. ggml_tensor * inp_pos = build_inp_pos();
  11619. auto * inp_attn = build_attn_inp_kv();
  11620. ggml_tensor * inp_out_ids = build_inp_out_ids();
  11621. for (int il = 0; il < n_layer; ++il) {
  11622. ggml_tensor * inpSA = inpL;
  11623. // norm
  11624. cur = build_norm(inpL,
  11625. model.layers[il].attn_norm,
  11626. model.layers[il].attn_norm_b,
  11627. LLM_NORM, il);
  11628. cb(cur, "attn_norm", il);
  11629. // self-attention
  11630. {
  11631. // compute Q and K and RoPE them
  11632. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  11633. cb(Qcur, "Qcur", il);
  11634. if (model.layers[il].bq) {
  11635. Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
  11636. cb(Qcur, "Qcur", il);
  11637. }
  11638. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  11639. cb(Kcur, "Kcur", il);
  11640. if (model.layers[il].bk) {
  11641. Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
  11642. cb(Kcur, "Kcur", il);
  11643. }
  11644. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  11645. cb(Vcur, "Vcur", il);
  11646. if (model.layers[il].bv) {
  11647. Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
  11648. cb(Vcur, "Vcur", il);
  11649. }
  11650. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  11651. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  11652. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  11653. Qcur = ggml_rope_ext(
  11654. ctx0, Qcur, inp_pos, nullptr,
  11655. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  11656. ext_factor, attn_factor, beta_fast, beta_slow
  11657. );
  11658. Kcur = ggml_rope_ext(
  11659. ctx0, Kcur, inp_pos, nullptr,
  11660. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  11661. ext_factor, attn_factor, beta_fast, beta_slow
  11662. );
  11663. cb(Qcur, "Qcur", il);
  11664. cb(Kcur, "Kcur", il);
  11665. cb(Vcur, "Vcur", il);
  11666. cur = build_attn(inp_attn,
  11667. model.layers[il].wo, model.layers[il].bo,
  11668. Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  11669. }
  11670. if (il == n_layer - 1 && inp_out_ids) {
  11671. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  11672. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  11673. }
  11674. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  11675. cb(ffn_inp, "ffn_inp", il);
  11676. // feed-forward network
  11677. cur = build_norm(ffn_inp,
  11678. model.layers[il].ffn_norm,
  11679. model.layers[il].ffn_norm_b,
  11680. LLM_NORM, il);
  11681. cb(cur, "ffn_norm", il);
  11682. cur = build_ffn(cur,
  11683. model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
  11684. NULL, NULL, NULL,
  11685. model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
  11686. NULL,
  11687. LLM_FFN_RELU_SQR, LLM_FFN_SEQ, il);
  11688. cur = ggml_add(ctx0, cur, ffn_inp);
  11689. cb(cur, "ffn_out", il);
  11690. cur = build_cvec(cur, il);
  11691. cb(cur, "l_out", il);
  11692. // input for next layer
  11693. inpL = cur;
  11694. }
  11695. cur = inpL;
  11696. cur = build_norm(cur,
  11697. model.output_norm, model.output_norm_b,
  11698. LLM_NORM, -1);
  11699. cb(cur, "result_norm", -1);
  11700. res->t_embd = cur;
  11701. // lm_head
  11702. cur = build_lora_mm(model.output, cur);
  11703. cb(cur, "result_output", -1);
  11704. res->t_logits = cur;
  11705. ggml_build_forward_expand(gf, cur);
  11706. }
  11707. };
  11708. struct llm_build_nemotron_h : public llm_graph_context_mamba {
  11709. llm_build_nemotron_h(
  11710. const llama_model & model,
  11711. const llm_graph_params & params) :
  11712. llm_graph_context_mamba(params) {
  11713. const int64_t n_embd_head = hparams.n_embd_head_v;
  11714. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  11715. ggml_tensor * cur;
  11716. ggml_tensor * inpL;
  11717. inpL = build_inp_embd(model.tok_embd);
  11718. auto * inp = build_inp_mem_hybrid();
  11719. ggml_tensor * inp_out_ids = build_inp_out_ids();
  11720. for (int il = 0; il < n_layer; ++il) {
  11721. struct ggml_tensor * inpSA = inpL;
  11722. // norm
  11723. cur = build_norm(inpL,
  11724. model.layers[il].attn_norm, NULL,
  11725. LLM_NORM_RMS, il);
  11726. cb(cur, "attn_norm", il);
  11727. if (hparams.is_recurrent(il)) {
  11728. // ssm layer //
  11729. cur = build_mamba2_layer(inp->get_recr(), cur, model, ubatch, il);
  11730. } else if (hparams.n_ff(il) == 0) {
  11731. // attention layer //
  11732. cur = build_attention_layer(cur, inp->get_attn(), model, n_embd_head, il);
  11733. } else {
  11734. cur = build_ffn_layer(cur, model, il);
  11735. }
  11736. if (il == n_layer - 1 && inp_out_ids) {
  11737. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  11738. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  11739. }
  11740. // add residual
  11741. cur = ggml_add(ctx0, cur, inpSA);
  11742. cb(cur, "block_out", il);
  11743. // input for next layer
  11744. inpL = cur;
  11745. }
  11746. cur = inpL;
  11747. cur = build_norm(cur,
  11748. model.output_norm, NULL,
  11749. LLM_NORM_RMS, -1);
  11750. cb(cur, "result_norm", -1);
  11751. res->t_embd = cur;
  11752. // lm_head
  11753. cur = build_lora_mm(model.output, cur);
  11754. cb(cur, "result_output", -1);
  11755. res->t_logits = cur;
  11756. ggml_build_forward_expand(gf, cur);
  11757. }
  11758. ggml_tensor * build_attention_layer(
  11759. ggml_tensor * cur,
  11760. llm_graph_input_attn_kv * inp_attn,
  11761. const llama_model & model,
  11762. const int64_t n_embd_head,
  11763. const int il) {
// compute Q, K and V projections (no RoPE is applied in this attention-layer type)
  11765. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  11766. cb(Qcur, "Qcur", il);
  11767. if (model.layers[il].bq) {
  11768. Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
  11769. cb(Qcur, "Qcur", il);
  11770. }
  11771. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  11772. cb(Kcur, "Kcur", il);
  11773. if (model.layers[il].bk) {
  11774. Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
  11775. cb(Kcur, "Kcur", il);
  11776. }
  11777. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  11778. cb(Vcur, "Vcur", il);
  11779. if (model.layers[il].bv) {
  11780. Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
  11781. cb(Vcur, "Vcur", il);
  11782. }
  11783. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, hparams.n_head(il), n_tokens);
  11784. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, hparams.n_head_kv(il), n_tokens);
  11785. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, hparams.n_head_kv(il), n_tokens);
  11786. cb(Qcur, "Qcur", il);
  11787. cb(Kcur, "Kcur", il);
  11788. cb(Vcur, "Vcur", il);
  11789. const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
  11790. cur = build_attn(inp_attn,
  11791. model.layers[il].wo, model.layers[il].bo,
  11792. Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
  11793. cb(cur, "attn_out", il);
  11794. return cur;
  11795. }
  11796. ggml_tensor * build_ffn_layer(
  11797. ggml_tensor * cur,
  11798. const llama_model & model,
  11799. const int il) {
  11800. cur = build_ffn(cur,
  11801. model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
  11802. NULL, NULL, NULL,
  11803. model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
  11804. NULL,
  11805. LLM_FFN_RELU_SQR, LLM_FFN_PAR, il);
  11806. cb(cur, "ffn_out", il);
  11807. cur = build_cvec(cur, il);
  11808. cb(cur, "l_out", il);
  11809. return cur;
  11810. }
  11811. };
  11812. struct llm_build_exaone : public llm_graph_context {
  11813. llm_build_exaone(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  11814. const int64_t n_embd_head = hparams.n_embd_head_v;
  11815. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  11816. GGML_ASSERT(n_embd_head == hparams.n_rot);
  11817. ggml_tensor * cur;
  11818. ggml_tensor * inpL;
  11819. inpL = build_inp_embd(model.tok_embd);
  11820. // inp_pos - contains the positions
  11821. ggml_tensor * inp_pos = build_inp_pos();
  11822. auto * inp_attn = build_attn_inp_kv();
  11823. ggml_tensor * inp_out_ids = build_inp_out_ids();
  11824. for (int il = 0; il < n_layer; ++il) {
  11825. ggml_tensor * inpSA = inpL;
  11826. // norm
  11827. cur = build_norm(inpL,
  11828. model.layers[il].attn_norm, NULL,
  11829. LLM_NORM_RMS, il);
  11830. cb(cur, "attn_norm", il);
  11831. // self-attention
  11832. {
  11833. // rope freq factors for llama3; may return nullptr for llama2 and other models
  11834. ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
  11835. // compute Q and K and RoPE them
  11836. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  11837. cb(Qcur, "Qcur", il);
  11838. if (model.layers[il].bq) {
  11839. Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
  11840. cb(Qcur, "Qcur", il);
  11841. }
  11842. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  11843. cb(Kcur, "Kcur", il);
  11844. if (model.layers[il].bk) {
  11845. Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
  11846. cb(Kcur, "Kcur", il);
  11847. }
  11848. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  11849. cb(Vcur, "Vcur", il);
  11850. if (model.layers[il].bv) {
  11851. Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
  11852. cb(Vcur, "Vcur", il);
  11853. }
  11854. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  11855. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  11856. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  11857. Qcur = ggml_rope_ext(
  11858. ctx0, Qcur, inp_pos, rope_factors,
  11859. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  11860. ext_factor, attn_factor, beta_fast, beta_slow
  11861. );
  11862. Kcur = ggml_rope_ext(
  11863. ctx0, Kcur, inp_pos, rope_factors,
  11864. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  11865. ext_factor, attn_factor, beta_fast, beta_slow
  11866. );
  11867. cb(Qcur, "Qcur", il);
  11868. cb(Kcur, "Kcur", il);
  11869. cb(Vcur, "Vcur", il);
  11870. cur = build_attn(inp_attn,
  11871. model.layers[il].wo, model.layers[il].bo,
  11872. Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  11873. }
  11874. if (il == n_layer - 1 && inp_out_ids) {
  11875. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  11876. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  11877. }
  11878. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  11879. cb(ffn_inp, "ffn_inp", il);
  11880. // feed-forward network
  11881. cur = build_norm(ffn_inp,
  11882. model.layers[il].ffn_norm, NULL,
  11883. LLM_NORM_RMS, il);
  11884. cb(cur, "ffn_norm", il);
  11885. cur = build_ffn(cur,
  11886. model.layers[il].ffn_up, NULL, NULL,
  11887. model.layers[il].ffn_gate, NULL, NULL,
  11888. model.layers[il].ffn_down, NULL, NULL,
  11889. NULL,
  11890. LLM_FFN_SILU, LLM_FFN_PAR, il);
  11891. cb(cur, "ffn_out", il);
  11892. cur = ggml_add(ctx0, cur, ffn_inp);
  11893. cb(cur, "ffn_out", il);
  11894. cur = build_cvec(cur, il);
  11895. cb(cur, "l_out", il);
  11896. // input for next layer
  11897. inpL = cur;
  11898. }
  11899. cur = inpL;
  11900. cur = build_norm(cur,
  11901. model.output_norm, NULL,
  11902. LLM_NORM_RMS, -1);
  11903. cb(cur, "result_norm", -1);
  11904. res->t_embd = cur;
  11905. // lm_head
  11906. cur = build_lora_mm(model.output, cur);
  11907. cb(cur, "result_output", -1);
  11908. res->t_logits = cur;
  11909. ggml_build_forward_expand(gf, cur);
  11910. }
  11911. };
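// EXAONE 4.0 decoder graph, templated on iswa (interleaved sliding-window attention).
// Summary of the construction below: there is no pre-attention norm; Q and K get a
// per-head RMSNorm after the reshape, RoPE is applied only on SWA layers (or on every
// layer when swa_type == LLAMA_SWA_TYPE_NONE), and attn_post_norm / ffn_post_norm are
// applied to the block outputs before each residual add (post-norm layout).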
  11912. template <bool iswa>
  11913. struct llm_build_exaone4 : public llm_graph_context {
  11914. llm_build_exaone4(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  11915. const int64_t n_embd_head = hparams.n_embd_head_k;
  11916. GGML_ASSERT(n_embd_head == hparams.n_embd_head_v);
  11917. GGML_ASSERT(n_embd_head == hparams.n_rot);
  11918. ggml_tensor * cur;
  11919. ggml_tensor * inpL;
  11920. inpL = build_inp_embd(model.tok_embd);
  11921. // inp_pos - contains the positions
  11922. ggml_tensor * inp_pos = build_inp_pos();
  11923. using inp_attn_type = std::conditional_t<iswa, llm_graph_input_attn_kv_iswa, llm_graph_input_attn_kv>;
  11924. inp_attn_type * inp_attn = nullptr;
  11925. if constexpr (iswa) {
  11926. inp_attn = build_attn_inp_kv_iswa();
  11927. } else {
  11928. inp_attn = build_attn_inp_kv();
  11929. }
  11930. ggml_tensor * inp_out_ids = build_inp_out_ids();
  11931. for (int il = 0; il < n_layer; ++il) {
  11932. ggml_tensor * inpSA = inpL;
  11933. // use RoPE for SWA layers or non-SWA models
  11934. const bool use_rope = hparams.is_swa(il) || hparams.swa_type == LLAMA_SWA_TYPE_NONE;
  11935. cur = inpL;
  11936. // self-attention
  11937. {
  11938. ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
  11939. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  11940. cb(Qcur, "Qcur", il);
  11941. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  11942. cb(Kcur, "Kcur", il);
  11943. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  11944. cb(Vcur, "Vcur", il);
  11945. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  11946. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  11947. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  11948. Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
  11949. Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
  11950. cb(Qcur, "Qcur_normed", il);
  11951. cb(Kcur, "Kcur_normed", il);
  11952. if (use_rope) {
  11953. Qcur = ggml_rope_ext(
  11954. ctx0, Qcur, inp_pos, rope_factors,
  11955. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  11956. ext_factor, attn_factor, beta_fast, beta_slow
  11957. );
  11958. Kcur = ggml_rope_ext(
  11959. ctx0, Kcur, inp_pos, rope_factors,
  11960. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  11961. ext_factor, attn_factor, beta_fast, beta_slow
  11962. );
  11963. }
  11964. cb(Qcur, "Qcur", il);
  11965. cb(Kcur, "Kcur", il);
  11966. cb(Vcur, "Vcur", il);
  11967. cur = build_attn(inp_attn,
  11968. model.layers[il].wo, NULL,
  11969. Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  11970. cb(cur, "attn_out", il);
  11971. }
  11972. if (il == n_layer - 1 && inp_out_ids) {
  11973. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  11974. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  11975. }
  11976. cur = build_norm(cur,
  11977. model.layers[il].attn_post_norm, NULL,
  11978. LLM_NORM_RMS, il);
  11979. cb(cur, "attn_post_norm", il);
  11980. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  11981. cb(ffn_inp, "ffn_inp", il);
  11982. // feed-forward network
  11983. cur = build_ffn(ffn_inp,
  11984. model.layers[il].ffn_up, NULL, NULL,
  11985. model.layers[il].ffn_gate, NULL, NULL,
  11986. model.layers[il].ffn_down, NULL, NULL,
  11987. NULL,
  11988. LLM_FFN_SILU, LLM_FFN_PAR, il);
  11989. cb(cur, "ffn_out", il);
  11990. cur = build_norm(cur,
  11991. model.layers[il].ffn_post_norm, NULL,
  11992. LLM_NORM_RMS, -1);
  11993. cb(cur, "ffn_post_norm", -1);
  11994. cur = ggml_add(ctx0, cur, ffn_inp);
  11995. cur = build_cvec(cur, il);
  11996. cb(cur, "l_out", il);
  11997. // input for next layer
  11998. inpL = cur;
  11999. }
  12000. cur = inpL;
  12001. cur = build_norm(cur,
  12002. model.output_norm, NULL,
  12003. LLM_NORM_RMS, -1);
  12004. cb(cur, "result_norm", -1);
  12005. res->t_embd = cur;
  12006. // lm_head
  12007. cur = build_lora_mm(model.output, cur);
  12008. cb(cur, "result_output", -1);
  12009. res->t_logits = cur;
  12010. ggml_build_forward_expand(gf, cur);
  12011. }
  12012. };
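// Shared base for the RWKV-6 family (RWKV6 and the QRWKV6 / rwkv6qwen2 hybrid below).
// The channel mix (FFN analogue) built here is, roughly:
//   xk = cur + (x_prev - cur) * lerp_k,  xr = cur + (x_prev - cur) * lerp_r
//   out = sigmoid(W_recept * xr) * (W_value * relu(W_key * xk)^2)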
  12013. struct llm_build_rwkv6_base : public llm_graph_context {
  12014. const llama_model & model;
  12015. llm_build_rwkv6_base(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params), model(model) {
  12016. }
  12017. ggml_tensor * build_rwkv6_channel_mix(
  12018. const llama_layer * layer,
  12019. ggml_tensor * cur,
  12020. ggml_tensor * x_prev,
  12021. llm_arch arch) const {
  12022. ggml_tensor * sx = ggml_sub(ctx0, x_prev, cur);
  12023. switch (arch) {
  12024. case LLM_ARCH_RWKV6:
  12025. {
  12026. ggml_tensor * xk = ggml_add(ctx0, ggml_mul(ctx0, sx, layer->channel_mix_lerp_k), cur);
  12027. ggml_tensor * xr = ggml_add(ctx0, ggml_mul(ctx0, sx, layer->channel_mix_lerp_r), cur);
  12028. ggml_tensor * r = ggml_sigmoid(ctx0, build_lora_mm(layer->channel_mix_receptance, xr));
  12029. ggml_tensor * k = ggml_sqr(
  12030. ctx0,
  12031. ggml_relu(
  12032. ctx0,
  12033. build_lora_mm(layer->channel_mix_key, xk)
  12034. )
  12035. );
  12036. cur = ggml_mul(ctx0, r, build_lora_mm(layer->channel_mix_value, k));
  12037. } break;
  12038. default:
  12039. GGML_ABORT("fatal error");
  12040. }
  12041. return cur;
  12042. }
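// RWKV-6 time mix (the recurrent attention analogue). Rough outline of the graph built
// below:
//   1. data-dependent token shift: a small LoRA (time_mix_w1 / time_mix_w2) produces 5
//      interpolation vectors used to form xw, xk, xv, xr, xg from the current and
//      previous token
//   2. r/k/v projections with optional biases and a gate g (SiLU, or sigmoid for the
//      QRWKV variant, detected by time_mix_first == nullptr)
//   3. a data-dependent decay w (time_mix_decay_w1 / w2), then the WKV6 recurrence over
//      the per-sequence state (gated linear attention for QRWKV), per-head group norm on
//      the non-QRWKV path, gating by g and the output projection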
  12043. ggml_tensor * build_rwkv6_time_mix(
  12044. llm_graph_input_rs * inp,
  12045. ggml_tensor * cur,
  12046. ggml_tensor * x_prev,
  12047. const llama_ubatch & ubatch,
  12048. int il) const {
  12049. const auto * mctx_cur = static_cast<const llama_memory_recurrent_context *>(mctx);
  12050. const auto n_tokens = ubatch.n_tokens;
  12051. const auto n_seqs = ubatch.n_seqs;
  12052. const auto n_seq_tokens = ubatch.n_seq_tokens;
  12053. const auto n_embd = hparams.n_embd;
  12054. const auto head_size = hparams.wkv_head_size;
  12055. const auto n_head = n_embd / head_size;
  12056. const auto n_head_kv = hparams.n_head_kv(il);
  12057. const auto kv_head = mctx_cur->get_head();
  12058. const auto & layer = model.layers[il];
  12059. bool is_qrwkv = layer.time_mix_first == nullptr;
  12060. ggml_tensor * sx = ggml_sub(ctx0, x_prev, cur);
  12061. sx = ggml_reshape_2d(ctx0, sx, n_embd, n_tokens);
  12062. cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens);
  12063. ggml_tensor * xxx = ggml_add(ctx0, ggml_mul(ctx0, sx, layer.time_mix_lerp_x), cur);
  12064. xxx = ggml_reshape_4d(
  12065. ctx0,
  12066. ggml_tanh(
  12067. ctx0,
  12068. ggml_mul_mat(ctx0, layer.time_mix_w1, xxx)
  12069. ),
  12070. layer.time_mix_w1->ne[1] / 5, 1, 5, n_tokens
  12071. );
  12072. xxx = ggml_cont(ctx0, ggml_permute(ctx0, xxx, 0, 1, 3, 2));
  12073. xxx = ggml_mul_mat(
  12074. ctx0,
  12075. ggml_reshape_4d(
  12076. ctx0,
  12077. layer.time_mix_w2,
  12078. layer.time_mix_w2->ne[0], layer.time_mix_w2->ne[1], 1, 5
  12079. ),
  12080. xxx
  12081. );
  12082. ggml_tensor *xw, *xk, *xv, *xr, *xg;
  12083. if (layer.time_mix_lerp_fused) {
12084. // fusing these lerp weights into a single tensor gives a small performance improvement
  12085. sx = ggml_reshape_3d(ctx0, sx, n_embd, 1, n_tokens);
  12086. cur = ggml_reshape_3d(ctx0, cur, n_embd, 1, n_tokens);
  12087. xxx = ggml_add(ctx0, ggml_mul(ctx0, ggml_add(ctx0, xxx, layer.time_mix_lerp_fused), sx), cur);
  12088. xw = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], 0);
  12089. xk = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * sizeof(float));
  12090. xv = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 2 * sizeof(float));
  12091. xr = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 3 * sizeof(float));
  12092. xg = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 4 * sizeof(float));
  12093. } else {
  12094. // for backward compatibility
  12095. xw = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], 0);
  12096. xk = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * sizeof(float));
  12097. xv = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 2 * sizeof(float));
  12098. xr = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 3 * sizeof(float));
  12099. xg = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 4 * sizeof(float));
  12100. xw = ggml_add(ctx0, ggml_mul(ctx0, ggml_add(ctx0, xw, layer.time_mix_lerp_w), sx), cur);
  12101. xk = ggml_add(ctx0, ggml_mul(ctx0, ggml_add(ctx0, xk, layer.time_mix_lerp_k), sx), cur);
  12102. xv = ggml_add(ctx0, ggml_mul(ctx0, ggml_add(ctx0, xv, layer.time_mix_lerp_v), sx), cur);
  12103. xr = ggml_add(ctx0, ggml_mul(ctx0, ggml_add(ctx0, xr, layer.time_mix_lerp_r), sx), cur);
  12104. xg = ggml_add(ctx0, ggml_mul(ctx0, ggml_add(ctx0, xg, layer.time_mix_lerp_g), sx), cur);
  12105. }
  12106. ggml_tensor * r = build_lora_mm(layer.time_mix_receptance, xr);
  12107. ggml_tensor * k = build_lora_mm(layer.time_mix_key, xk);
  12108. ggml_tensor * v = build_lora_mm(layer.time_mix_value, xv);
  12109. if (layer.time_mix_receptance_b) {
  12110. r = ggml_add(ctx0, r, layer.time_mix_receptance_b);
  12111. }
  12112. if (layer.time_mix_key_b) {
  12113. k = ggml_add(ctx0, k, layer.time_mix_key_b);
  12114. }
  12115. if (layer.time_mix_value_b) {
  12116. v = ggml_add(ctx0, v, layer.time_mix_value_b);
  12117. }
  12118. ggml_tensor * g = build_lora_mm(layer.time_mix_gate, xg);
  12119. if (is_qrwkv) {
  12120. g = ggml_sigmoid(ctx0, g);
  12121. } else {
  12122. g = ggml_silu(ctx0, g);
  12123. }
  12124. if (n_head_kv != 0 && n_head_kv != n_head) {
  12125. GGML_ASSERT(n_head % n_head_kv == 0);
  12126. k = ggml_reshape_4d(ctx0, k, head_size, 1, n_head_kv, n_tokens);
  12127. v = ggml_reshape_4d(ctx0, v, head_size, 1, n_head_kv, n_tokens);
  12128. ggml_tensor * tmp = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, head_size, n_head / n_head_kv, n_head_kv, n_tokens);
  12129. k = ggml_repeat(ctx0, k, tmp);
  12130. v = ggml_repeat(ctx0, v, tmp);
  12131. }
  12132. k = ggml_reshape_3d(ctx0, k, head_size, n_head, n_tokens);
  12133. v = ggml_reshape_3d(ctx0, v, head_size, n_head, n_tokens);
  12134. r = ggml_reshape_3d(ctx0, r, head_size, n_head, n_tokens);
  12135. ggml_tensor * w = ggml_mul_mat(
  12136. ctx0,
  12137. layer.time_mix_decay_w2,
  12138. ggml_tanh(
  12139. ctx0,
  12140. ggml_mul_mat(ctx0, layer.time_mix_decay_w1, xw)
  12141. )
  12142. );
  12143. w = ggml_add(ctx0, w, layer.time_mix_decay);
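// map the decay through w = exp(-exp(w)) so every per-channel decay lands in (0, 1)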
  12144. w = ggml_exp(ctx0, ggml_neg(ctx0, ggml_exp(ctx0, w)));
  12145. w = ggml_reshape_3d(ctx0, w, head_size, n_head, n_tokens);
  12146. if (is_qrwkv) {
  12147. // k = k * (1 - w)
  12148. k = ggml_sub(ctx0, k, ggml_mul(ctx0, k, w));
  12149. }
  12150. ggml_tensor * wkv_state = build_rs(
  12151. inp, mctx_cur->get_s_l(il),
  12152. hparams.n_embd_s(), n_seqs);
  12153. ggml_tensor * wkv_output;
  12154. if (is_qrwkv) {
  12155. wkv_output = ggml_gated_linear_attn(ctx0, k, v, r, w, wkv_state, pow(head_size, -0.5f));
  12156. } else {
  12157. wkv_output = ggml_rwkv_wkv6(ctx0, k, v, r, layer.time_mix_first, w, wkv_state);
  12158. }
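// both ops pack the per-token outputs and the updated recurrent state into one tensor:
// the first n_embd * n_tokens values are the outputs, the following
// n_embd * head_size * n_seqs values are the new state, copied back into the memory cell below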
  12159. cur = ggml_view_1d(ctx0, wkv_output, n_embd * n_tokens, 0);
  12160. wkv_state = ggml_view_1d(ctx0, wkv_output, n_embd * head_size * n_seqs, n_embd * n_tokens * sizeof(float));
  12161. ggml_build_forward_expand(
  12162. gf,
  12163. ggml_cpy(
  12164. ctx0,
  12165. wkv_state,
  12166. ggml_view_1d(
  12167. ctx0,
  12168. mctx_cur->get_s_l(il),
  12169. hparams.n_embd_s() * n_seqs,
  12170. hparams.n_embd_s() * kv_head * ggml_element_size(mctx_cur->get_s_l(il))
  12171. )
  12172. )
  12173. );
  12174. if (!is_qrwkv) {
  12175. // group norm with head_count groups
  12176. cur = ggml_reshape_3d(ctx0, cur, n_embd / n_head, n_head, n_tokens);
  12177. cur = ggml_norm(ctx0, cur, 64e-5f);
  12178. // Convert back to regular vectors.
  12179. cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens);
  12180. cur = ggml_add(ctx0, ggml_mul(ctx0, cur, layer.time_mix_ln), layer.time_mix_ln_b);
  12181. } else {
  12182. cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens);
  12183. }
  12184. cur = ggml_mul(ctx0, cur, g);
  12185. cur = build_lora_mm(layer.time_mix_output, cur);
  12186. return ggml_reshape_3d(ctx0, cur, n_embd, n_seq_tokens, n_seqs);
  12187. }
  12188. };
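// Full RWKV-6 model graph. Each layer keeps a 2-slot token-shift state (hence the
// token_shift_count == 2 assert): one slot feeds the time mix on the attention norm, the
// other feeds the channel mix on the FFN norm. Activations are optionally halved every
// rescale_every_n_layers layers.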
  12189. struct llm_build_rwkv6 : public llm_build_rwkv6_base {
  12190. llm_build_rwkv6(const llama_model & model, const llm_graph_params & params) : llm_build_rwkv6_base(model, params) {
  12191. GGML_ASSERT(hparams.token_shift_count == 2);
  12192. ggml_tensor * cur;
  12193. ggml_tensor * inpL;
  12194. inpL = build_inp_embd(model.tok_embd);
  12195. inpL = build_norm(inpL, model.tok_norm, model.tok_norm_b, LLM_NORM, -1);
  12196. auto * rs_inp = build_rs_inp();
  12197. const auto n_embd = hparams.n_embd;
  12198. const auto n_seq_tokens = ubatch.n_seq_tokens;
  12199. const auto n_seqs = ubatch.n_seqs;
  12200. ggml_tensor * inp_out_ids = build_inp_out_ids();
  12201. for (int il = 0; il < n_layer; ++il) {
  12202. const llama_layer * layer = &model.layers[il];
  12203. inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs);
  12204. ggml_tensor * token_shift = build_rwkv_token_shift_load(rs_inp, ubatch, il);
  12205. ggml_tensor * att_shift = ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], 0);
  12206. ggml_tensor * ffn_shift = ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], n_embd * ggml_element_size(token_shift));
  12207. ggml_tensor * att_norm = build_norm(inpL, layer->attn_norm, layer->attn_norm_b, LLM_NORM, il);
  12208. cb(att_norm, "attn_norm", il);
  12209. ggml_tensor * x_prev = ggml_concat(
  12210. ctx0,
  12211. att_shift,
  12212. ggml_view_3d(ctx0, att_norm, n_embd, n_seq_tokens - 1, n_seqs, att_norm->nb[1], att_norm->nb[2], 0),
  12213. 1
  12214. );
  12215. cur = build_rwkv6_time_mix(rs_inp, att_norm, x_prev, ubatch, il);
  12216. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
  12217. cb(ffn_inp, "ffn_inp", il);
  12218. ggml_tensor * ffn_norm = build_norm(ffn_inp, layer->attn_norm_2, layer->attn_norm_2_b, LLM_NORM, il);
  12219. cb(ffn_norm, "ffn_norm", il);
  12220. x_prev = ggml_concat(
  12221. ctx0,
  12222. ffn_shift,
  12223. ggml_view_3d(ctx0, ffn_norm, n_embd, n_seq_tokens - 1, n_seqs, ffn_norm->nb[1], ffn_norm->nb[2], 0),
  12224. 1
  12225. );
  12226. token_shift = ggml_concat(ctx0,
  12227. ggml_view_3d(ctx0, att_norm, n_embd, 1, n_seqs, att_norm->nb[1], att_norm->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(att_norm)),
  12228. ggml_view_3d(ctx0, ffn_norm, n_embd, 1, n_seqs, ffn_norm->nb[1], ffn_norm->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(ffn_norm)),
  12229. 1
  12230. );
  12231. ggml_build_forward_expand(gf, build_rwkv_token_shift_store(token_shift, ubatch, il));
  12232. ffn_inp = ggml_reshape_2d(ctx0, ffn_inp, n_embd, n_tokens);
  12233. ffn_norm = ggml_reshape_2d(ctx0, ffn_norm, n_embd, n_tokens);
  12234. x_prev = ggml_reshape_2d(ctx0, x_prev, n_embd, n_tokens);
  12235. cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens);
  12236. if (il == n_layer - 1 && inp_out_ids) {
  12237. ffn_inp = ggml_get_rows(ctx0, ffn_inp, inp_out_ids);
  12238. ffn_norm = ggml_get_rows(ctx0, ffn_norm, inp_out_ids);
  12239. x_prev = ggml_get_rows(ctx0, x_prev, inp_out_ids);
  12240. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  12241. }
  12242. cur = build_rwkv6_channel_mix(layer, ffn_norm, x_prev, LLM_ARCH_RWKV6);
  12243. cur = ggml_add(ctx0, cur, ffn_inp);
  12244. if (hparams.rescale_every_n_layers != 0 && (il + 1) % hparams.rescale_every_n_layers == 0) {
  12245. cur = ggml_scale(ctx0, cur, 0.5F);
  12246. }
  12247. cur = build_cvec(cur, il);
  12248. cb(cur, "l_out", il);
  12249. // input for next layer
  12250. inpL = cur;
  12251. }
  12252. cur = inpL;
  12253. cur = build_norm(cur, model.output_norm, model.output_norm_b, LLM_NORM, -1);
  12254. cb(cur, "result_norm", -1);
  12255. res->t_embd = cur;
  12256. cur = build_lora_mm(model.output, cur);
  12257. cb(cur, "result_output", -1);
  12258. res->t_logits = cur;
  12259. ggml_build_forward_expand(gf, cur);
  12260. }
  12261. };
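// QRWKV6 (rwkv6qwen2): a Qwen2-style transformer stack whose attention blocks are
// replaced by the RWKV-6 time mix above; RMSNorm, a single token-shift slot per layer
// and a SwiGLU FFN instead of the RWKV channel mix.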
  12262. // ref: https://huggingface.co/recursal/QRWKV6-32B-Instruct-Preview-v0.1/blob/main/modeling_rwkv6qwen2.py
  12263. struct llm_build_rwkv6qwen2 : public llm_build_rwkv6_base {
  12264. llm_build_rwkv6qwen2(const llama_model & model, const llm_graph_params & params) : llm_build_rwkv6_base(model, params) {
  12265. GGML_ASSERT(n_embd == hparams.n_embd_r());
  12266. ggml_tensor * cur;
  12267. ggml_tensor * inpL;
  12268. inpL = build_inp_embd(model.tok_embd);
  12269. auto * rs_inp = build_rs_inp();
  12270. const auto n_embd = hparams.n_embd;
  12271. const auto n_seq_tokens = ubatch.n_seq_tokens;
  12272. const auto n_seqs = ubatch.n_seqs;
  12273. ggml_tensor * inp_out_ids = build_inp_out_ids();
  12274. for (int il = 0; il < n_layer; ++il) {
  12275. const llama_layer * layer = &model.layers[il];
  12276. inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs);
  12277. ggml_tensor * token_shift = build_rwkv_token_shift_load(rs_inp, ubatch, il);
  12278. ggml_tensor * att_norm = build_norm(inpL, layer->attn_norm, layer->attn_norm_b, LLM_NORM_RMS, il);
  12279. cb(att_norm, "attn_norm", il);
  12280. ggml_tensor * x_prev = ggml_concat(
  12281. ctx0,
  12282. token_shift,
  12283. ggml_view_3d(ctx0, att_norm, n_embd, n_seq_tokens - 1, n_seqs, att_norm->nb[1], att_norm->nb[2], 0),
  12284. 1
  12285. );
  12286. cur = build_rwkv6_time_mix(rs_inp, att_norm, x_prev, ubatch, il);
  12287. token_shift = ggml_view_3d(ctx0, att_norm, n_embd, 1, n_seqs, att_norm->nb[1], att_norm->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(att_norm));
  12288. ggml_build_forward_expand(gf, build_rwkv_token_shift_store(token_shift, ubatch, il));
  12289. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
  12290. cb(ffn_inp, "ffn_inp", il);
  12291. cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens);
  12292. ffn_inp = ggml_reshape_2d(ctx0, ffn_inp, n_embd, n_tokens);
  12293. if (il == n_layer - 1 && inp_out_ids) {
  12294. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  12295. ffn_inp = ggml_get_rows(ctx0, ffn_inp, inp_out_ids);
  12296. }
  12297. // feed-forward network
  12298. cur = build_norm(ffn_inp,
  12299. model.layers[il].ffn_norm, NULL,
  12300. LLM_NORM_RMS, il);
  12301. cb(cur, "ffn_norm", il);
  12302. cur = build_ffn(cur,
  12303. model.layers[il].ffn_up, NULL, NULL,
  12304. model.layers[il].ffn_gate, NULL, NULL,
  12305. model.layers[il].ffn_down, NULL, NULL,
  12306. NULL,
  12307. LLM_FFN_SILU, LLM_FFN_PAR, il);
  12308. cb(cur, "ffn_out", il);
  12309. cur = ggml_add(ctx0, cur, ffn_inp);
  12310. cur = build_cvec(cur, il);
  12311. cb(cur, "l_out", il);
  12312. // input for next layer
  12313. inpL = cur;
  12314. }
  12315. cur = inpL;
  12316. cur = build_norm(cur, model.output_norm, model.output_norm_b, LLM_NORM_RMS, -1);
  12317. cb(cur, "result_norm", -1);
  12318. res->t_embd = cur;
  12319. cur = build_lora_mm(model.output, cur);
  12320. cb(cur, "result_output", -1);
  12321. res->t_logits = cur;
  12322. ggml_build_forward_expand(gf, cur);
  12323. }
  12324. };
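// Shared base for the RWKV-7 family (RWKV7 and ARWKV7 below). Compared to the RWKV-6
// base above, the channel mix drops the receptance gate, and the time mix adds an
// in-context learning rate "a", a value residual mixed in from the first layer, and an
// L2-normalized key direction "kk" that feed the ggml_rwkv_wkv7 state update.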
  12325. struct llm_build_rwkv7_base : public llm_graph_context {
  12326. const llama_model & model;
  12327. llm_build_rwkv7_base(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params), model(model) {
  12328. }
  12329. ggml_tensor * build_rwkv7_channel_mix(
  12330. const llama_layer * layer,
  12331. ggml_tensor * cur,
  12332. ggml_tensor * x_prev,
  12333. llm_arch arch) const {
  12334. ggml_tensor * sx = ggml_sub(ctx0, x_prev, cur);
  12335. switch (arch) {
  12336. case LLM_ARCH_RWKV7:
  12337. {
  12338. ggml_tensor * xk = ggml_add(ctx0, ggml_mul(ctx0, sx, layer->channel_mix_lerp_k), cur);
  12339. ggml_tensor * k = ggml_sqr(
  12340. ctx0,
  12341. ggml_relu(
  12342. ctx0,
  12343. build_lora_mm(layer->channel_mix_key, xk)
  12344. )
  12345. );
  12346. cur = build_lora_mm(layer->channel_mix_value, k);
  12347. } break;
  12348. default:
  12349. GGML_ABORT("fatal error");
  12350. }
  12351. return cur;
  12352. }
  12353. ggml_tensor * build_rwkv7_time_mix(
  12354. llm_graph_input_rs * inp,
  12355. ggml_tensor * cur,
  12356. ggml_tensor * x_prev,
  12357. ggml_tensor *& first_layer_value,
  12358. const llama_ubatch & ubatch,
  12359. int il) const {
  12360. const auto * mctx_cur = static_cast<const llama_memory_recurrent_context *>(mctx);
  12361. const auto n_tokens = ubatch.n_tokens;
  12362. const auto n_seqs = ubatch.n_seqs;
  12363. const auto n_embd = hparams.n_embd;
  12364. const auto head_size = hparams.wkv_head_size;
  12365. const auto head_count = n_embd / head_size;
  12366. const auto n_seq_tokens = ubatch.n_seq_tokens;
  12367. const auto kv_head = mctx_cur->get_head();
  12368. const auto & layer = model.layers[il];
  12369. bool has_gating = layer.time_mix_g1 && layer.time_mix_g2;
  12370. ggml_tensor * sx = ggml_sub(ctx0, x_prev, cur);
  12371. ggml_tensor * dummy = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_embd, n_seq_tokens, n_seqs, has_gating ? 6 : 5);
  12372. sx = ggml_repeat(ctx0, sx, dummy);
  12373. ggml_tensor * xxx = ggml_add(ctx0, ggml_mul(ctx0, sx, layer.time_mix_lerp_fused), cur);
  12374. ggml_tensor * xr = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], 0);
  12375. ggml_tensor * xw = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * sizeof(float));
  12376. ggml_tensor * xk = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 2 * sizeof(float));
  12377. ggml_tensor * xv = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 3 * sizeof(float));
  12378. ggml_tensor * xa = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 4 * sizeof(float));
  12379. ggml_tensor * xg = has_gating ? ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 5 * sizeof(float)) : nullptr;
  12380. ggml_tensor * r = build_lora_mm(layer.time_mix_receptance, xr);
  12381. ggml_tensor * w = ggml_add(
  12382. ctx0,
  12383. ggml_mul_mat(ctx0, layer.time_mix_w2, ggml_tanh(ctx0, ggml_mul_mat(ctx0, layer.time_mix_w1, xw))),
  12384. layer.time_mix_w0
  12385. );
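// -0.606531 is approximately -exp(-0.5), so w = exp(-exp(-0.5) * sigmoid(.)) stays in
// roughly (0.545, 1.0)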
  12386. w = ggml_exp(ctx0, ggml_scale(ctx0, ggml_sigmoid(ctx0, w), -0.606531));
  12387. ggml_tensor * k = build_lora_mm(layer.time_mix_key, xk);
  12388. ggml_tensor * v = build_lora_mm(layer.time_mix_value, xv);
  12389. if (first_layer_value == nullptr) {
  12390. first_layer_value = v;
  12391. } else {
12392. // Interpolate toward the value of the first layer (value residual), gated by a learned sigmoid.
  12393. v = ggml_add(ctx0, v,
  12394. ggml_mul(ctx0,
  12395. ggml_sub(ctx0, first_layer_value, v),
  12396. ggml_sigmoid(ctx0, ggml_add(ctx0,
  12397. ggml_mul_mat(ctx0, layer.time_mix_v2, ggml_mul_mat(ctx0, layer.time_mix_v1, xv)),
  12398. layer.time_mix_v0
  12399. )
  12400. )
  12401. )
  12402. );
  12403. }
  12404. ggml_tensor * g = nullptr;
  12405. if (layer.time_mix_g1 && layer.time_mix_g2) {
  12406. g = ggml_mul_mat(ctx0, layer.time_mix_g2, ggml_sigmoid(ctx0, ggml_mul_mat(ctx0, layer.time_mix_g1, xg)));
  12407. }
  12408. ggml_tensor * a = ggml_sigmoid(ctx0,
  12409. ggml_add(
  12410. ctx0,
  12411. ggml_mul_mat(ctx0, layer.time_mix_a2, ggml_mul_mat(ctx0, layer.time_mix_a1, xa)),
  12412. layer.time_mix_a0
  12413. )
  12414. );
  12415. ggml_tensor * kk = ggml_reshape_3d(ctx0, ggml_mul(ctx0, k, layer.time_mix_k_k), head_size, head_count, n_tokens);
  12416. kk = ggml_l2_norm(ctx0, kk, 1e-12);
  12417. ggml_tensor * ka = ggml_mul(ctx0, k, layer.time_mix_k_a);
  12418. k = ggml_add(ctx0, k, ggml_sub(ctx0, ggml_mul(ctx0, a, ka), ka));
  12419. r = ggml_reshape_3d(ctx0, r, head_size, head_count, n_tokens);
  12420. w = ggml_reshape_3d(ctx0, w, head_size, head_count, n_tokens);
  12421. k = ggml_reshape_3d(ctx0, k, head_size, head_count, n_tokens);
  12422. v = ggml_reshape_3d(ctx0, v, head_size, head_count, n_tokens);
  12423. a = ggml_reshape_3d(ctx0, a, head_size, head_count, n_tokens);
  12424. ggml_tensor * wkv_state = build_rs(
  12425. inp, mctx_cur->get_s_l(il),
  12426. hparams.n_embd_s(), n_seqs);
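// ggml_rwkv_wkv7 packs the per-token outputs and the updated recurrent state into one
// tensor; the views below split them apart and the new state is copied back into the
// recurrent memory cell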
  12427. ggml_tensor * wkv_output = ggml_rwkv_wkv7(ctx0, r, w, k, v, ggml_neg(ctx0, kk), ggml_mul(ctx0, kk, a), wkv_state);
  12428. cur = ggml_view_1d(ctx0, wkv_output, n_embd * n_tokens, 0);
  12429. wkv_state = ggml_view_1d(ctx0, wkv_output, n_embd * head_size * n_seqs, n_embd * n_tokens * sizeof(float));
  12430. ggml_build_forward_expand(
  12431. gf,
  12432. ggml_cpy(
  12433. ctx0,
  12434. wkv_state,
  12435. ggml_view_1d(
  12436. ctx0,
  12437. mctx_cur->get_s_l(il),
  12438. hparams.n_embd_s() * n_seqs,
  12439. hparams.n_embd_s() * kv_head * ggml_element_size(mctx_cur->get_s_l(il))
  12440. )
  12441. )
  12442. );
  12443. if (layer.time_mix_ln && layer.time_mix_ln_b) {
  12444. // group norm with head_count groups
  12445. cur = ggml_reshape_3d(ctx0, cur, n_embd / head_count, head_count, n_tokens);
  12446. cur = ggml_norm(ctx0, cur, 64e-5f);
  12447. // Convert back to regular vectors.
  12448. cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens);
  12449. cur = ggml_add(ctx0, ggml_mul(ctx0, cur, layer.time_mix_ln), layer.time_mix_ln_b);
  12450. } else {
  12451. cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens);
  12452. }
  12453. ggml_tensor * rk = ggml_sum_rows(ctx0,
  12454. ggml_mul(ctx0, ggml_mul(ctx0, k, r), ggml_reshape_2d(ctx0, layer.time_mix_r_k, head_size, head_count)));
  12455. cur = ggml_add(ctx0, cur, ggml_reshape_2d(ctx0, ggml_mul(ctx0, v, rk), n_embd, n_tokens));
  12456. if (has_gating) {
  12457. cur = ggml_mul(ctx0, cur, g);
  12458. }
  12459. cur = build_lora_mm(layer.time_mix_output, cur);
  12460. return ggml_reshape_3d(ctx0, cur, n_embd, n_seq_tokens, n_seqs);
  12461. }
  12462. };
  12463. struct llm_build_rwkv7 : public llm_build_rwkv7_base {
  12464. llm_build_rwkv7(const llama_model & model, const llm_graph_params & params) : llm_build_rwkv7_base(model, params) {
  12465. GGML_ASSERT(hparams.token_shift_count == 2);
  12466. ggml_tensor * cur;
  12467. ggml_tensor * inpL;
  12468. ggml_tensor * v_first = nullptr;
  12469. inpL = build_inp_embd(model.tok_embd);
  12470. inpL = build_norm(inpL, model.tok_norm, model.tok_norm_b, LLM_NORM, -1);
  12471. auto * rs_inp = build_rs_inp();
  12472. const auto n_embd = hparams.n_embd;
  12473. const auto n_seq_tokens = ubatch.n_seq_tokens;
  12474. const auto n_seqs = ubatch.n_seqs;
  12475. ggml_tensor * inp_out_ids = build_inp_out_ids();
  12476. for (int il = 0; il < n_layer; ++il) {
  12477. const llama_layer * layer = &model.layers[il];
  12478. inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs);
  12479. ggml_tensor * token_shift = build_rwkv_token_shift_load(rs_inp, ubatch, il);
  12480. ggml_tensor * att_shift = ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], 0);
  12481. ggml_tensor * ffn_shift = ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], n_embd * ggml_element_size(token_shift));
  12482. ggml_tensor * att_norm = build_norm(inpL, layer->attn_norm, layer->attn_norm_b, LLM_NORM, il);
  12483. cb(att_norm, "attn_norm", il);
  12484. ggml_tensor * x_prev = ggml_concat(
  12485. ctx0,
  12486. att_shift,
  12487. ggml_view_3d(ctx0, att_norm, n_embd, n_seq_tokens - 1, n_seqs, att_norm->nb[1], att_norm->nb[2], 0),
  12488. 1
  12489. );
  12490. cur = build_rwkv7_time_mix(rs_inp, att_norm, x_prev, v_first, ubatch, il);
  12491. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
  12492. cb(ffn_inp, "ffn_inp", il);
  12493. ggml_tensor * ffn_norm = build_norm(ffn_inp, layer->attn_norm_2, layer->attn_norm_2_b, LLM_NORM, il);
  12494. cb(ffn_norm, "ffn_norm", il);
  12495. x_prev = ggml_concat(
  12496. ctx0,
  12497. ffn_shift,
  12498. ggml_view_3d(ctx0, ffn_norm, n_embd, n_seq_tokens - 1, n_seqs, ffn_norm->nb[1], ffn_norm->nb[2], 0),
  12499. 1
  12500. );
  12501. token_shift = ggml_concat(ctx0,
  12502. ggml_view_3d(ctx0, att_norm, n_embd, 1, n_seqs, att_norm->nb[1], att_norm->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(att_norm)),
  12503. ggml_view_3d(ctx0, ffn_norm, n_embd, 1, n_seqs, ffn_norm->nb[1], ffn_norm->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(ffn_norm)),
  12504. 1
  12505. );
  12506. ggml_build_forward_expand(gf, build_rwkv_token_shift_store(token_shift, ubatch, il));
  12507. ffn_inp = ggml_reshape_2d(ctx0, ffn_inp, n_embd, n_tokens);
  12508. ffn_norm = ggml_reshape_2d(ctx0, ffn_norm, n_embd, n_tokens);
  12509. x_prev = ggml_reshape_2d(ctx0, x_prev, n_embd, n_tokens);
  12510. if (il == n_layer - 1 && inp_out_ids) {
  12511. ffn_inp = ggml_get_rows(ctx0, ffn_inp, inp_out_ids);
  12512. ffn_norm = ggml_get_rows(ctx0, ffn_norm, inp_out_ids);
  12513. x_prev = ggml_get_rows(ctx0, x_prev, inp_out_ids);
  12514. }
  12515. cur = build_rwkv7_channel_mix(layer, ffn_norm, x_prev, LLM_ARCH_RWKV7);
  12516. cur = ggml_add(ctx0, cur, ffn_inp);
  12517. cur = build_cvec(cur, il);
  12518. cb(cur, "l_out", il);
  12519. // input for next layer
  12520. inpL = cur;
  12521. }
  12522. cur = inpL;
  12523. cur = build_norm(cur, model.output_norm, model.output_norm_b, LLM_NORM, -1);
  12524. cb(cur, "result_norm", -1);
  12525. res->t_embd = cur;
  12526. cur = build_lora_mm(model.output, cur);
  12527. cb(cur, "result_output", -1);
  12528. res->t_logits = cur;
  12529. ggml_build_forward_expand(gf, cur);
  12530. }
  12531. };
  12532. struct llm_build_arwkv7 : public llm_build_rwkv7_base {
  12533. llm_build_arwkv7(const llama_model & model, const llm_graph_params & params) : llm_build_rwkv7_base(model, params) {
  12534. GGML_ASSERT(n_embd == hparams.n_embd_r());
  12535. ggml_tensor * cur;
  12536. ggml_tensor * inpL;
  12537. ggml_tensor * v_first = nullptr;
  12538. inpL = build_inp_embd(model.tok_embd);
  12539. auto * rs_inp = build_rs_inp();
  12540. const auto n_embd = hparams.n_embd;
  12541. const auto n_seq_tokens = ubatch.n_seq_tokens;
  12542. const auto n_seqs = ubatch.n_seqs;
  12543. ggml_tensor * inp_out_ids = build_inp_out_ids();
  12544. for (int il = 0; il < n_layer; ++il) {
  12545. const llama_layer * layer = &model.layers[il];
  12546. inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs);
  12547. ggml_tensor * token_shift = build_rwkv_token_shift_load(rs_inp, ubatch, il);
  12548. ggml_tensor * att_norm = build_norm(inpL, layer->attn_norm, layer->attn_norm_b, LLM_NORM_RMS, il);
  12549. cb(att_norm, "attn_norm", il);
  12550. ggml_tensor * x_prev = ggml_concat(
  12551. ctx0,
  12552. token_shift,
  12553. ggml_view_3d(ctx0, att_norm, n_embd, n_seq_tokens - 1, n_seqs, att_norm->nb[1], att_norm->nb[2], 0),
  12554. 1
  12555. );
  12556. cur = build_rwkv7_time_mix(rs_inp, att_norm, x_prev, v_first, ubatch, il);
  12557. token_shift = ggml_view_3d(ctx0, att_norm, n_embd, 1, n_seqs, att_norm->nb[1], att_norm->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(att_norm));
  12558. ggml_build_forward_expand(gf, build_rwkv_token_shift_store(token_shift, ubatch, il));
  12559. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
  12560. cb(ffn_inp, "ffn_inp", il);
  12561. cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens);
  12562. ffn_inp = ggml_reshape_2d(ctx0, ffn_inp, n_embd, n_tokens);
  12563. if (il == n_layer - 1 && inp_out_ids) {
  12564. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  12565. ffn_inp = ggml_get_rows(ctx0, ffn_inp, inp_out_ids);
  12566. }
  12567. // feed-forward network
  12568. cur = build_norm(ffn_inp,
  12569. model.layers[il].ffn_norm, NULL,
  12570. LLM_NORM_RMS, il);
  12571. cb(cur, "ffn_norm", il);
  12572. cur = build_ffn(cur,
  12573. model.layers[il].ffn_up, NULL, NULL,
  12574. model.layers[il].ffn_gate, NULL, NULL,
  12575. model.layers[il].ffn_down, NULL, NULL,
  12576. NULL,
  12577. LLM_FFN_SILU, LLM_FFN_PAR, il);
  12578. cb(cur, "ffn_out", il);
  12579. cur = ggml_add(ctx0, cur, ffn_inp);
  12580. cur = build_cvec(cur, il);
  12581. cb(cur, "l_out", il);
  12582. // input for next layer
  12583. inpL = cur;
  12584. }
  12585. cur = inpL;
  12586. cur = build_norm(cur, model.output_norm, model.output_norm_b, LLM_NORM_RMS, -1);
  12587. cb(cur, "result_norm", -1);
  12588. res->t_embd = cur;
  12589. cur = build_lora_mm(model.output, cur);
  12590. cb(cur, "result_output", -1);
  12591. res->t_logits = cur;
  12592. ggml_build_forward_expand(gf, cur);
  12593. }
  12594. };
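// IBM Granite decoder graph: a llama-style block with Granite-specific scaling knobs.
// Summary of the construction below: RoPE is optional (hparams.rope_finetuned), the
// attention scale can be overridden by f_attention_scale, both residual branches are
// scaled by f_residual_scale, final logits are divided by f_logit_scale, and the FFN is
// either dense SwiGLU or a softmax-gated MoE with an optional shared expert (n_ff_shexp > 0).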
  12595. struct llm_build_granite : public llm_graph_context {
  12596. llm_build_granite(
  12597. const llama_model & model,
  12598. const llm_graph_params & params)
  12599. : llm_graph_context(params) {
  12600. const int64_t n_embd_head = hparams.n_embd_head_v;
  12601. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  12602. GGML_ASSERT(n_embd_head == hparams.n_rot);
  12603. ggml_tensor * cur;
  12604. ggml_tensor * inpL;
  12605. inpL = build_inp_embd(model.tok_embd);
  12606. // inp_pos - built only if rope enabled
  12607. ggml_tensor * inp_pos = nullptr;
  12608. if (hparams.rope_finetuned) {
  12609. inp_pos = build_inp_pos();
  12610. }
  12611. auto * inp_attn = build_attn_inp_kv();
  12612. ggml_tensor * inp_out_ids = build_inp_out_ids();
  12613. for (int il = 0; il < n_layer; ++il) {
  12614. ggml_tensor * inpSA = inpL;
  12615. // norm
  12616. cur = build_norm(inpL,
  12617. model.layers[il].attn_norm, NULL,
  12618. LLM_NORM_RMS, il);
  12619. cb(cur, "attn_norm", il);
  12620. // self-attention
  12621. cur = build_attention_layer(
  12622. cur, inp_pos, inp_attn,
  12623. model, n_embd_head, il);
  12624. if (il == n_layer - 1 && inp_out_ids) {
  12625. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  12626. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  12627. }
  12628. // ffn
  12629. cur = build_layer_ffn(cur, inpSA, model, il);
  12630. // input for next layer
  12631. inpL = cur;
  12632. }
  12633. cur = inpL;
  12634. cur = build_norm(cur,
  12635. model.output_norm, NULL,
  12636. LLM_NORM_RMS, -1);
  12637. cb(cur, "result_norm", -1);
  12638. res->t_embd = cur;
  12639. // lm_head
  12640. cur = build_lora_mm(model.output, cur);
  12641. // For Granite architectures - scale logits
  12642. cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_logit_scale);
  12643. cb(cur, "result_output", -1);
  12644. res->t_logits = cur;
  12645. ggml_build_forward_expand(gf, cur);
  12646. }
  12647. ggml_tensor * build_attention_layer(
  12648. ggml_tensor * cur,
  12649. ggml_tensor * inp_pos,
  12650. llm_graph_input_attn_kv * inp_attn,
  12651. const llama_model & model,
  12652. const int64_t n_embd_head,
  12653. const int il) {
  12654. // compute Q and K and (optionally) RoPE them
  12655. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  12656. cb(Qcur, "Qcur", il);
  12657. if (model.layers[il].bq) {
  12658. Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
  12659. cb(Qcur, "Qcur", il);
  12660. }
  12661. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  12662. cb(Kcur, "Kcur", il);
  12663. if (model.layers[il].bk) {
  12664. Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
  12665. cb(Kcur, "Kcur", il);
  12666. }
  12667. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  12668. cb(Vcur, "Vcur", il);
  12669. if (model.layers[il].bv) {
  12670. Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
  12671. cb(Vcur, "Vcur", il);
  12672. }
  12673. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, hparams.n_head(il), n_tokens);
  12674. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, hparams.n_head_kv(il), n_tokens);
  12675. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, hparams.n_head_kv(il), n_tokens);
  12676. const bool use_rope = hparams.rope_finetuned;
  12677. if (use_rope) {
  12678. ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
  12679. Qcur = ggml_rope_ext(
  12680. ctx0, Qcur, inp_pos, rope_factors,
  12681. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  12682. ext_factor, attn_factor, beta_fast, beta_slow
  12683. );
  12684. Kcur = ggml_rope_ext(
  12685. ctx0, Kcur, inp_pos, rope_factors,
  12686. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  12687. ext_factor, attn_factor, beta_fast, beta_slow
  12688. );
  12689. }
  12690. cb(Qcur, "Qcur", il);
  12691. cb(Kcur, "Kcur", il);
  12692. cb(Vcur, "Vcur", il);
  12693. const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
  12694. cur = build_attn(inp_attn,
  12695. model.layers[il].wo, model.layers[il].bo,
  12696. Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
  12697. cb(cur, "attn_out", il);
  12698. return cur;
  12699. }
  12700. ggml_tensor * build_layer_ffn(
  12701. ggml_tensor * cur,
  12702. ggml_tensor * inpSA,
  12703. const llama_model & model,
  12704. const int il) {
  12705. // For Granite architectures - scale residual
  12706. if (hparams.f_residual_scale) {
  12707. cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
  12708. }
  12709. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  12710. cb(ffn_inp, "ffn_inp", il);
  12711. // feed-forward network (non-MoE)
  12712. if (model.layers[il].ffn_gate_inp == nullptr) {
  12713. cur = build_norm(ffn_inp,
  12714. model.layers[il].ffn_norm, NULL,
  12715. LLM_NORM_RMS, il);
  12716. cb(cur, "ffn_norm", il);
  12717. cur = build_ffn(cur,
  12718. model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
  12719. model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
  12720. model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
  12721. NULL,
  12722. LLM_FFN_SILU, LLM_FFN_PAR, il);
  12723. cb(cur, "ffn_out", il);
  12724. } else {
  12725. // MoE branch
  12726. cur = build_norm(ffn_inp,
  12727. model.layers[il].ffn_norm, NULL,
  12728. LLM_NORM_RMS, il);
  12729. cb(cur, "ffn_norm", il);
  12730. ggml_tensor * moe_out = build_moe_ffn(cur,
  12731. model.layers[il].ffn_gate_inp,
  12732. model.layers[il].ffn_up_exps,
  12733. model.layers[il].ffn_gate_exps,
  12734. model.layers[il].ffn_down_exps,
  12735. nullptr,
  12736. n_expert, n_expert_used,
  12737. LLM_FFN_SILU, true,
  12738. false, 0.0,
  12739. LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
  12740. il);
  12741. cb(moe_out, "ffn_moe_out", il);
  12742. // For Granite MoE Shared
  12743. if (hparams.n_ff_shexp > 0) {
  12744. ggml_tensor * ffn_shexp = build_ffn(cur,
  12745. model.layers[il].ffn_up_shexp, NULL, NULL,
  12746. model.layers[il].ffn_gate_shexp, NULL, NULL,
  12747. model.layers[il].ffn_down_shexp, NULL, NULL,
  12748. NULL,
  12749. LLM_FFN_SILU, LLM_FFN_PAR, il);
  12750. cb(ffn_shexp, "ffn_shexp", il);
  12751. cur = ggml_add(ctx0, moe_out, ffn_shexp);
  12752. cb(cur, "ffn_out", il);
  12753. } else {
  12754. cur = moe_out;
  12755. }
  12756. }
  12757. // For Granite architectures - scale residual
  12758. if (hparams.f_residual_scale) {
  12759. cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
  12760. }
  12761. cur = ggml_add(ctx0, cur, ffn_inp);
  12762. cb(cur, "ffn_out", il);
  12763. cur = build_cvec(cur, il);
  12764. cb(cur, "l_out", il);
  12765. return cur;
  12766. }
  12767. };
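// Granite hybrid: reuses the Granite attention / FFN building blocks above, but layers
// flagged as recurrent run a Mamba-2 SSM block instead of attention, driven by the
// hybrid memory input (attention KV cache plus recurrent state).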
  12768. struct llm_build_granite_hybrid : public llm_graph_context_mamba {
  12769. llm_build_granite_hybrid(
  12770. const llama_model & model,
  12771. const llm_graph_params & params) :
  12772. llm_graph_context_mamba(params) {
  12773. const int64_t n_embd_head = hparams.n_embd_head_v;
  12774. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  12775. ggml_tensor * cur;
  12776. ggml_tensor * inpL;
  12777. inpL = build_inp_embd(model.tok_embd);
  12778. auto * inp = build_inp_mem_hybrid();
  12779. ggml_tensor * inp_out_ids = build_inp_out_ids();
12780. // inp_pos - contains the positions; built only if rope is enabled
  12781. ggml_tensor * inp_pos = nullptr;
  12782. if (hparams.rope_finetuned) {
  12783. inp_pos = build_inp_pos();
  12784. }
  12785. for (int il = 0; il < n_layer; ++il) {
  12786. struct ggml_tensor * inpSA = inpL;
  12787. // norm
  12788. cur = build_norm(inpL,
  12789. model.layers[il].attn_norm, NULL,
  12790. LLM_NORM_RMS, il);
  12791. cb(cur, "attn_norm", il);
  12792. if (hparams.is_recurrent(il)) {
  12793. // ssm layer //
  12794. cur = build_mamba2_layer(inp->get_recr(), cur, model, ubatch, il);
  12795. } else {
  12796. // attention layer //
  12797. cur = build_attention_layer(
  12798. cur, inp_pos, inp->get_attn(), model,
  12799. n_embd_head, il);
  12800. }
  12801. if (il == n_layer - 1 && inp_out_ids) {
  12802. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  12803. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  12804. }
  12805. // ffn
  12806. cur = build_layer_ffn(cur, inpSA, model, il);
  12807. // input for next layer
  12808. inpL = cur;
  12809. }
  12810. cur = inpL;
  12811. cur = build_norm(cur,
  12812. model.output_norm, NULL,
  12813. LLM_NORM_RMS, -1);
  12814. cb(cur, "result_norm", -1);
  12815. res->t_embd = cur;
  12816. // lm_head
  12817. cur = build_lora_mm(model.output, cur);
  12818. // For Granite architectures - scale logits
  12819. if (hparams.f_logit_scale) {
  12820. cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_logit_scale);
  12821. }
  12822. cb(cur, "result_output", -1);
  12823. res->t_logits = cur;
  12824. ggml_build_forward_expand(gf, cur);
  12825. }
  12826. ggml_tensor * build_attention_layer(
  12827. ggml_tensor * cur,
  12828. ggml_tensor * inp_pos,
  12829. llm_graph_input_attn_kv * inp_attn,
  12830. const llama_model & model,
  12831. const int64_t n_embd_head,
  12832. const int il) {
  12833. // compute Q and K and (optionally) RoPE them
  12834. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  12835. cb(Qcur, "Qcur", il);
  12836. if (model.layers[il].bq) {
  12837. Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
  12838. cb(Qcur, "Qcur", il);
  12839. }
  12840. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  12841. cb(Kcur, "Kcur", il);
  12842. if (model.layers[il].bk) {
  12843. Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
  12844. cb(Kcur, "Kcur", il);
  12845. }
  12846. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  12847. cb(Vcur, "Vcur", il);
  12848. if (model.layers[il].bv) {
  12849. Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
  12850. cb(Vcur, "Vcur", il);
  12851. }
  12852. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, hparams.n_head(il), n_tokens);
  12853. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, hparams.n_head_kv(il), n_tokens);
  12854. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, hparams.n_head_kv(il), n_tokens);
  12855. const bool use_rope = hparams.rope_finetuned;
  12856. if (use_rope) {
  12857. ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
  12858. Qcur = ggml_rope_ext(
  12859. ctx0, Qcur, inp_pos, rope_factors,
  12860. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  12861. ext_factor, attn_factor, beta_fast, beta_slow
  12862. );
  12863. Kcur = ggml_rope_ext(
  12864. ctx0, Kcur, inp_pos, rope_factors,
  12865. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  12866. ext_factor, attn_factor, beta_fast, beta_slow
  12867. );
  12868. }
  12869. cb(Qcur, "Qcur", il);
  12870. cb(Kcur, "Kcur", il);
  12871. cb(Vcur, "Vcur", il);
  12872. const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
  12873. cur = build_attn(inp_attn,
  12874. model.layers[il].wo, model.layers[il].bo,
  12875. Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
  12876. cb(cur, "attn_out", il);
  12877. return cur;
  12878. }
  12879. ggml_tensor * build_layer_ffn(
  12880. ggml_tensor * cur,
  12881. ggml_tensor * inpSA,
  12882. const llama_model & model,
  12883. const int il) {
  12884. // For Granite architectures - scale residual
  12885. if (hparams.f_residual_scale) {
  12886. cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
  12887. }
  12888. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  12889. cb(ffn_inp, "ffn_inp", il);
  12890. // feed-forward network (non-MoE)
  12891. if (model.layers[il].ffn_gate_inp == nullptr) {
  12892. cur = build_norm(ffn_inp,
  12893. model.layers[il].ffn_norm, NULL,
  12894. LLM_NORM_RMS, il);
  12895. cb(cur, "ffn_norm", il);
  12896. cur = build_ffn(cur,
  12897. model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
  12898. model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
  12899. model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
  12900. NULL,
  12901. LLM_FFN_SILU, LLM_FFN_PAR, il);
  12902. cb(cur, "ffn_out", il);
  12903. } else {
  12904. // MoE branch
  12905. cur = build_norm(ffn_inp,
  12906. model.layers[il].ffn_norm, NULL,
  12907. LLM_NORM_RMS, il);
  12908. cb(cur, "ffn_norm", il);
  12909. ggml_tensor * moe_out = build_moe_ffn(cur,
  12910. model.layers[il].ffn_gate_inp,
  12911. model.layers[il].ffn_up_exps,
  12912. model.layers[il].ffn_gate_exps,
  12913. model.layers[il].ffn_down_exps,
  12914. nullptr,
  12915. n_expert, n_expert_used,
  12916. LLM_FFN_SILU, true,
  12917. false, 0.0,
  12918. LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
  12919. il);
  12920. cb(moe_out, "ffn_moe_out", il);
  12921. // For Granite MoE Shared
  12922. if (hparams.n_ff_shexp > 0) {
  12923. ggml_tensor * ffn_shexp = build_ffn(cur,
  12924. model.layers[il].ffn_up_shexp, NULL, NULL,
  12925. model.layers[il].ffn_gate_shexp, NULL, NULL,
  12926. model.layers[il].ffn_down_shexp, NULL, NULL,
  12927. NULL,
  12928. LLM_FFN_SILU, LLM_FFN_PAR, il);
  12929. cb(ffn_shexp, "ffn_shexp", il);
  12930. cur = ggml_add(ctx0, moe_out, ffn_shexp);
  12931. cb(cur, "ffn_out", il);
  12932. } else {
  12933. cur = moe_out;
  12934. }
  12935. }
  12936. // For Granite architectures - scale residual
  12937. if (hparams.f_residual_scale) {
  12938. cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
  12939. }
  12940. cur = ggml_add(ctx0, cur, ffn_inp);
  12941. cb(cur, "ffn_out", il);
  12942. cur = build_cvec(cur, il);
  12943. cb(cur, "l_out", il);
  12944. return cur;
  12945. }
  12946. };
  12947. // ref: https://github.com/facebookresearch/chameleon
  12948. // based on the original build_llama() function, changes:
  12949. // * qk-norm
  12950. // * swin-norm
  12951. // * removed bias
  12952. // * removed MoE
  12953. struct llm_build_chameleon : public llm_graph_context {
  12954. llm_build_chameleon(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  12955. const int64_t n_embd_head = hparams.n_embd_head_v;
  12956. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  12957. GGML_ASSERT(n_embd_head == hparams.n_rot);
  12958. ggml_tensor * cur;
  12959. ggml_tensor * inpL;
  12960. inpL = build_inp_embd(model.tok_embd);
  12961. // inp_pos - contains the positions
  12962. ggml_tensor * inp_pos = build_inp_pos();
  12963. auto * inp_attn = build_attn_inp_kv();
  12964. ggml_tensor * inp_out_ids = build_inp_out_ids();
  12965. for (int il = 0; il < n_layer; ++il) {
  12966. ggml_tensor * inpSA = inpL;
  12967. // norm
  12968. if (hparams.swin_norm) {
  12969. cur = inpL;
  12970. } else {
  12971. cur = build_norm(inpL,
  12972. model.layers[il].attn_norm, NULL,
  12973. LLM_NORM_RMS, il);
  12974. cb(cur, "attn_norm", il);
  12975. }
  12976. // self-attention
  12977. {
  12978. // compute Q and K and RoPE them
  12979. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  12980. cb(Qcur, "Qcur", il);
  12981. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  12982. cb(Kcur, "Kcur", il);
  12983. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  12984. cb(Vcur, "Vcur", il);
  12985. if (model.layers[il].attn_q_norm) {
  12986. Qcur = ggml_view_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens,
  12987. ggml_element_size(Qcur) * n_embd_head,
  12988. ggml_element_size(Qcur) * n_embd_head * n_head,
  12989. 0);
  12990. cb(Qcur, "Qcur", il);
  12991. Qcur = build_norm(Qcur,
  12992. model.layers[il].attn_q_norm,
  12993. model.layers[il].attn_q_norm_b,
  12994. LLM_NORM, il);
  12995. cb(Qcur, "Qcur", il);
  12996. }
  12997. if (model.layers[il].attn_k_norm) {
  12998. Kcur = ggml_view_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens,
  12999. ggml_element_size(Kcur) * n_embd_head,
  13000. ggml_element_size(Kcur) * n_embd_head * n_head_kv,
  13001. 0);
  13002. cb(Kcur, "Kcur", il);
  13003. Kcur = build_norm(Kcur,
  13004. model.layers[il].attn_k_norm,
  13005. model.layers[il].attn_k_norm_b,
  13006. LLM_NORM, il);
  13007. cb(Kcur, "Kcur", il);
  13008. }
  13009. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  13010. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  13011. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  13012. Qcur = ggml_rope_ext(
  13013. ctx0, Qcur, inp_pos, nullptr,
  13014. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  13015. ext_factor, attn_factor, beta_fast, beta_slow
  13016. );
  13017. Kcur = ggml_rope_ext(
  13018. ctx0, Kcur, inp_pos, nullptr,
  13019. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  13020. ext_factor, attn_factor, beta_fast, beta_slow
  13021. );
  13022. cb(Qcur, "Qcur", il);
  13023. cb(Kcur, "Kcur", il);
  13024. cb(Vcur, "Vcur", il);
  13025. cur = build_attn(inp_attn,
  13026. model.layers[il].wo, nullptr,
  13027. Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  13028. }
  13029. if (il == n_layer - 1 && inp_out_ids) {
  13030. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  13031. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  13032. }
  13033. if (hparams.swin_norm) {
  13034. cur = build_norm(cur,
  13035. model.layers[il].attn_norm, NULL,
  13036. LLM_NORM_RMS, il);
  13037. }
  13038. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  13039. cb(ffn_inp, "ffn_inp", il);
  13040. // feed-forward network
  13041. if (!hparams.swin_norm) {
  13042. cur = build_norm(ffn_inp,
  13043. model.layers[il].ffn_norm, NULL,
  13044. LLM_NORM_RMS, il);
  13045. cb(cur, "ffn_norm", il);
  13046. }
  13047. cur = build_ffn(cur,
  13048. model.layers[il].ffn_up, NULL, NULL,
  13049. model.layers[il].ffn_gate, NULL, NULL,
  13050. model.layers[il].ffn_down, NULL, NULL,
  13051. NULL,
  13052. LLM_FFN_SILU, LLM_FFN_PAR, il);
  13053. cb(cur, "ffn_out", il);
  13054. if (hparams.swin_norm) {
  13055. cur = build_norm(cur,
  13056. model.layers[il].ffn_norm, NULL,
  13057. LLM_NORM_RMS, il);
  13058. cb(cur, "ffn_norm", il);
  13059. }
  13060. cur = ggml_add(ctx0, cur, ffn_inp);
  13061. cb(cur, "ffn_out", il);
  13062. cur = build_cvec(cur, il);
  13063. cb(cur, "l_out", il);
  13064. // input for next layer
  13065. inpL = cur;
  13066. }
  13067. cur = inpL;
  13068. cur = build_norm(cur,
  13069. model.output_norm, NULL,
  13070. LLM_NORM_RMS, -1);
  13071. cb(cur, "result_norm", -1);
  13072. res->t_embd = cur;
  13073. // lm_head
  13074. cur = build_lora_mm(model.output, cur);
  13075. cb(cur, "result_output_with_img_logits", -1);
  13076. // TODO: this suppresses the output of image tokens, which is required to enable text-only outputs.
  13077. // Needs to be removed once image outputs are supported.
  13078. int img_token_end_idx = 8196;
  13079. int img_token_start_idx = 4;
  13080. int num_img_tokens = img_token_end_idx - img_token_start_idx;
13081. // create a 1d tensor of size num_img_tokens filled with -FLT_MAX,
13082. // which ensures that text token logits are always larger than image token logits
  13083. ggml_tensor * img_logits = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, num_img_tokens);
  13084. img_logits = ggml_clamp(ctx0, img_logits, -FLT_MAX, -FLT_MAX);
  13085. cb(img_logits, "img_logits", -1);
  13086. cur = ggml_set_1d(ctx0, cur, img_logits, ggml_element_size(cur) * img_token_start_idx);
  13087. cb(cur, "result_output", -1);
  13088. res->t_logits = cur;
  13089. ggml_build_forward_expand(gf, cur);
  13090. }
  13091. };
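// WavTokenizer decoder. Not a text decoder: token embeddings run through a PosNet stack
// (conv1d residual blocks, one self-attention block, a final group norm) followed by a
// ConvNeXt stack, and the graph exposes embeddings (res->t_embd) rather than logits.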
  13092. struct llm_build_wavtokenizer_dec : public llm_graph_context {
  13093. llm_build_wavtokenizer_dec(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  13094. ggml_tensor * cur;
  13095. ggml_tensor * inpL;
  13096. inpL = build_inp_embd(model.tok_embd);
  13097. cur = ggml_cont(ctx0, ggml_transpose(ctx0, inpL));
        cur = ggml_conv_1d_ph(ctx0, model.conv1d, cur, 1, 1);
        cur = ggml_add(ctx0, cur, model.conv1d_b);

        // posnet
        for (uint32_t il = 0; il < hparams.posnet.n_layer; ++il) {
            const auto & layer = model.layers[il].posnet;

            inpL = cur;

            switch (il) {
                case 0:
                case 1:
                case 3:
                case 4:
                    {
                        cur = build_norm(cur,
                                layer.norm1,
                                layer.norm1_b,
                                LLM_NORM_GROUP, 0);

                        cur = ggml_mul(ctx0, ggml_sigmoid(ctx0, cur), cur);

                        cur = ggml_conv_1d_ph(ctx0, layer.conv1, cur, 1, 1);
                        cur = ggml_add(ctx0, cur, layer.conv1_b);

                        cur = build_norm(cur,
                                layer.norm2,
                                layer.norm2_b,
                                LLM_NORM_GROUP, 0);

                        cur = ggml_mul(ctx0, ggml_sigmoid(ctx0, cur), cur);

                        cur = ggml_conv_1d_ph(ctx0, layer.conv2, cur, 1, 1);
                        cur = ggml_add(ctx0, cur, layer.conv2_b);

                        cur = ggml_add(ctx0, cur, inpL);
                    } break;
                case 2:
                    {
                        cur = build_norm(cur,
                                layer.attn_norm,
                                layer.attn_norm_b,
                                LLM_NORM_GROUP, 0);

                        ggml_tensor * q;
                        ggml_tensor * k;
                        ggml_tensor * v;

                        q = ggml_conv_1d_ph(ctx0, layer.attn_q, cur, 1, 1);
                        k = ggml_conv_1d_ph(ctx0, layer.attn_k, cur, 1, 1);
                        v = ggml_conv_1d_ph(ctx0, layer.attn_v, cur, 1, 1);

                        q = ggml_add(ctx0, q, layer.attn_q_b);
                        k = ggml_add(ctx0, k, layer.attn_k_b);
                        v = ggml_add(ctx0, v, layer.attn_v_b);

                        q = ggml_cont(ctx0, ggml_transpose(ctx0, q));
                        k = ggml_cont(ctx0, ggml_transpose(ctx0, k));

                        ggml_tensor * kq = ggml_mul_mat(ctx0, k, q);

                        kq = ggml_soft_max_ext(ctx0, kq, nullptr, 1.0f/sqrtf(float(hparams.posnet.n_embd)), 0.0f);

                        cur = ggml_mul_mat(ctx0, kq, v);

                        cur = ggml_conv_1d_ph(ctx0, layer.attn_o, cur, 1, 1);
                        cur = ggml_add(ctx0, cur, layer.attn_o_b);

                        cur = ggml_add(ctx0, cur, inpL);
                    } break;
                case 5:
                    {
                        cur = build_norm(cur,
                                layer.norm,
                                layer.norm_b,
                                LLM_NORM_GROUP, 0);
                    } break;
                default: GGML_ABORT("unknown posnet layer");
            };
        }

        cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur));

        cur = build_norm(cur,
                model.tok_norm,
                model.tok_norm_b,
                LLM_NORM, -1);

        cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur));

        inpL = cur;

        // convnext
        for (uint32_t il = 0; il < hparams.convnext.n_layer; ++il) {
            const auto & layer = model.layers[il].convnext;

            cur = inpL;

            cur = ggml_conv_1d_dw_ph(ctx0, layer.dw, cur, 1, 1);
            cur = ggml_add(ctx0, cur, layer.dw_b);

            cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur));

            cur = build_norm(cur,
                    layer.norm,
                    layer.norm_b,
                    LLM_NORM, -1);

            cur = build_ffn(cur,
                    layer.pw1, layer.pw1_b, NULL,
                    NULL,      NULL,        NULL,
                    layer.pw2, layer.pw2_b, NULL,
                    NULL,
                    LLM_FFN_GELU, LLM_FFN_SEQ, il);

            cur = ggml_mul(ctx0, cur, layer.gamma);

            cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur));

            inpL = ggml_add(ctx0, cur, inpL);
        }

        cur = inpL;

        cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur));

        cur = build_norm(cur,
                model.output_norm,
                model.output_norm_b,
                LLM_NORM, -1);

        // lm_head
        cur = build_lora_mm(model.output, cur);

        cur = ggml_add(ctx0, cur, model.output_b);

        cb(cur, "result_embd", -1);
        res->t_embd = cur;

        ggml_build_forward_expand(gf, cur);
    }
};
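// llm_build_plm: graph for the PLM architecture. As far as this builder goes, attention
// follows a DeepSeek-style multi-head-latent pattern: the KV projection is first compressed
// to kv_lora_rank (wkv_a_mqa), RMS-normed, then expanded with wkv_b and split into a
// non-RoPE part (k_nope) and a single RoPE'd key head (k_pe) that is repeated across heads.
// The FFN is a two-matrix squared-ReLU block (LLM_FFN_RELU_SQR, no gate).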
struct llm_build_plm : public llm_graph_context {
    llm_build_plm(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
        const float kq_scale = 1.0f/sqrtf(float(hparams.n_embd_head_k));

        const uint32_t n_embd_head_qk_rope = hparams.n_rot;
        const uint32_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
        const uint32_t kv_lora_rank = hparams.n_lora_kv;

        ggml_tensor * cur;
        ggml_tensor * inpL;

        // {n_embd, n_tokens}
        inpL = build_inp_embd(model.tok_embd);

        // inp_pos - contains the positions
        ggml_tensor * inp_pos = build_inp_pos();

        auto * inp_attn = build_attn_inp_kv();

        ggml_tensor * inp_out_ids = build_inp_out_ids();

        for (int il = 0; il < n_layer; ++il) {
            ggml_tensor * inpSA = inpL;

            // norm
            cur = build_norm(inpL,
                    model.layers[il].attn_norm, NULL,
                    LLM_NORM_RMS, il);
            cb(cur, "attn_norm", il);

            // self_attention
            {
                ggml_tensor * q = NULL;
                q = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
                cb(q, "q", il);

                // split into {n_head * n_embd_head_qk_nope, n_tokens}
                ggml_tensor * q_nope = ggml_view_3d(ctx0, q, n_embd_head_qk_nope, n_head, n_tokens,
                        ggml_row_size(q->type, hparams.n_embd_head_k),
                        ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
                        0);
                cb(q_nope, "q_nope", il);

                // and {n_head * n_embd_head_qk_rope, n_tokens}
                ggml_tensor * q_pe = ggml_view_3d(ctx0, q, n_embd_head_qk_rope, n_head, n_tokens,
                        ggml_row_size(q->type, hparams.n_embd_head_k),
                        ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
                        ggml_row_size(q->type, n_embd_head_qk_nope));
                cb(q_pe, "q_pe", il);

                // {n_embd, kv_lora_rank + n_embd_head_qk_rope} * {n_embd, n_tokens} -> {kv_lora_rank + n_embd_head_qk_rope, n_tokens}
                ggml_tensor * kv_pe_compresseed = ggml_mul_mat(ctx0, model.layers[il].wkv_a_mqa, cur);
                cb(kv_pe_compresseed, "kv_pe_compresseed", il);

                // split into {kv_lora_rank, n_tokens}
                ggml_tensor * kv_compressed = ggml_view_2d(ctx0, kv_pe_compresseed, kv_lora_rank, n_tokens,
                        kv_pe_compresseed->nb[1],
                        0);
                cb(kv_compressed, "kv_compressed", il);

                // and {n_embd_head_qk_rope, n_tokens}
                ggml_tensor * k_pe = ggml_view_3d(ctx0, kv_pe_compresseed, n_embd_head_qk_rope, 1, n_tokens,
                        kv_pe_compresseed->nb[1],
                        kv_pe_compresseed->nb[1],
                        ggml_row_size(kv_pe_compresseed->type, kv_lora_rank));
                cb(k_pe, "k_pe", il);

                kv_compressed = build_norm(kv_compressed,
                        model.layers[il].attn_kv_a_norm, NULL,
                        LLM_NORM_RMS, il);
                cb(kv_compressed, "kv_compressed", il);

                // {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)} * {kv_lora_rank, n_tokens} -> {n_head * (n_embd_head_qk_nope + n_embd_head_v), n_tokens}
                ggml_tensor * kv = ggml_mul_mat(ctx0, model.layers[il].wkv_b, kv_compressed);
                cb(kv, "kv", il);

                // split into {n_head * n_embd_head_qk_nope, n_tokens}
                ggml_tensor * k_nope = ggml_view_3d(ctx0, kv, n_embd_head_qk_nope, n_head, n_tokens,
                        ggml_row_size(kv->type, n_embd_head_qk_nope + hparams.n_embd_head_v),
                        ggml_row_size(kv->type, n_head * (n_embd_head_qk_nope + hparams.n_embd_head_v)),
                        0);
                cb(k_nope, "k_nope", il);

                // and {n_head * n_embd_head_v, n_tokens}
                ggml_tensor * v_states = ggml_view_3d(ctx0, kv, hparams.n_embd_head_v, n_head, n_tokens,
                        ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)),
                        ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)*n_head),
                        ggml_row_size(kv->type, (n_embd_head_qk_nope)));
                cb(v_states, "v_states", il);

                v_states = ggml_cont(ctx0, v_states);
                cb(v_states, "v_states", il);

                v_states = ggml_view_2d(ctx0, v_states, hparams.n_embd_head_v * n_head, n_tokens,
                        ggml_row_size(kv->type, hparams.n_embd_head_v * n_head),
                        0);
                cb(v_states, "v_states", il);

                q_pe = ggml_rope_ext(
                        ctx0, q_pe, inp_pos, nullptr,
                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                        ext_factor, attn_factor, beta_fast, beta_slow
                );
                cb(q_pe, "q_pe", il);

                // shared RoPE key
                k_pe = ggml_rope_ext(
                        ctx0, k_pe, inp_pos, nullptr,
                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                        ext_factor, attn_factor, beta_fast, beta_slow
                );
                cb(k_pe, "k_pe", il);

                ggml_tensor * q_states = ggml_concat(ctx0, q_nope, q_pe, 0);
                cb(q_states, "q_states", il);

                ggml_tensor * k_states = ggml_concat(ctx0, k_nope, ggml_repeat(ctx0, k_pe, q_pe), 0);
                cb(k_states, "k_states", il);

                cur = build_attn(inp_attn,
                        model.layers[il].wo, NULL,
                        q_states, k_states, v_states, nullptr, nullptr, nullptr, kq_scale, il);
            }

            if (il == n_layer - 1 && inp_out_ids) {
                cur   = ggml_get_rows(ctx0, cur,   inp_out_ids);
                inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
            }

            ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
            cb(ffn_inp, "ffn_inp", il);

            cur = build_norm(ffn_inp,
                    model.layers[il].ffn_norm, NULL,
                    LLM_NORM_RMS, il);
            cb(cur, "ffn_norm", il);

            cur = build_ffn(cur,
                    model.layers[il].ffn_up,   NULL, NULL,
                    NULL,                      NULL, NULL,
                    model.layers[il].ffn_down, NULL, NULL,
                    NULL,
                    LLM_FFN_RELU_SQR, LLM_FFN_SEQ, il);
            cb(cur, "ffn_out", il);

            cur = ggml_add(ctx0, cur, ffn_inp);

            cur = build_cvec(cur, il);
            cb(cur, "l_out", il);

            // input for next layer
            inpL = cur;
        }

        cur = inpL;

        cur = build_norm(cur,
                model.output_norm, NULL,
                LLM_NORM_RMS, -1);
        cb(cur, "result_norm", -1);
        res->t_embd = cur;

        cur = build_lora_mm(model.output, cur);
        cb(cur, "result_output", -1);
        res->t_logits = cur;

        ggml_build_forward_expand(gf, cur);
    }
};
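// llm_build_bailingmoe: standard RoPE attention (head size n_rot) followed by a MoE FFN
// with softmax gating, plus an always-on shared expert whose output is added to the routed
// experts' output before the residual connection.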
struct llm_build_bailingmoe : public llm_graph_context {
    llm_build_bailingmoe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
        ggml_tensor * cur;
        ggml_tensor * inpL;

        inpL = build_inp_embd(model.tok_embd);

        // inp_pos - contains the positions
        ggml_tensor * inp_pos = build_inp_pos();

        auto * inp_attn = build_attn_inp_kv();

        ggml_tensor * inp_out_ids = build_inp_out_ids();

        for (int il = 0; il < n_layer; ++il) {
            ggml_tensor * inpSA = inpL;

            // norm
            cur = build_norm(inpL,
                    model.layers[il].attn_norm, NULL,
                    LLM_NORM_RMS, il);
            cb(cur, "attn_norm", il);

            // self-attention
            {
                // rope freq factors for llama3; may return nullptr for llama2 and other models
                ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);

                // compute Q and K and RoPE them
                ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
                cb(Qcur, "Qcur", il);
                if (model.layers[il].bq) {
                    Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
                    cb(Qcur, "Qcur", il);
                }

                ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
                cb(Kcur, "Kcur", il);
                if (model.layers[il].bk) {
                    Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
                    cb(Kcur, "Kcur", il);
                }

                ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
                cb(Vcur, "Vcur", il);
                if (model.layers[il].bv) {
                    Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
                    cb(Vcur, "Vcur", il);
                }

                Qcur = ggml_reshape_3d(ctx0, Qcur, n_rot, n_head,    n_tokens);
                Kcur = ggml_reshape_3d(ctx0, Kcur, n_rot, n_head_kv, n_tokens);
                Vcur = ggml_reshape_3d(ctx0, Vcur, n_rot, n_head_kv, n_tokens);

                Qcur = ggml_rope_ext(
                        ctx0, Qcur, inp_pos, rope_factors,
                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                        ext_factor, attn_factor, beta_fast, beta_slow
                );

                Kcur = ggml_rope_ext(
                        ctx0, Kcur, inp_pos, rope_factors,
                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                        ext_factor, attn_factor, beta_fast, beta_slow
                );

                cb(Qcur, "Qcur", il);
                cb(Kcur, "Kcur", il);
                cb(Vcur, "Vcur", il);

                cur = build_attn(inp_attn,
                        model.layers[il].wo, model.layers[il].bo,
                        Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_rot)), il);
            }

            if (il == n_layer - 1 && inp_out_ids) {
                cur   = ggml_get_rows(ctx0, cur,   inp_out_ids);
                inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
            }

            ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
            cb(ffn_inp, "ffn_inp", il);

            cur = build_norm(ffn_inp,
                    model.layers[il].ffn_norm, NULL,
                    LLM_NORM_RMS, il);
            cb(cur, "ffn_norm", il);

            ggml_tensor * moe_out =
                build_moe_ffn(cur,
                        model.layers[il].ffn_gate_inp,
                        model.layers[il].ffn_up_exps,
                        model.layers[il].ffn_gate_exps,
                        model.layers[il].ffn_down_exps,
                        nullptr,
                        n_expert, n_expert_used,
                        LLM_FFN_SILU, hparams.expert_weights_norm,
                        false, hparams.expert_weights_scale,
                        LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
                        il);
            cb(moe_out, "ffn_moe_out", il);

            // FFN shared expert
            {
                ggml_tensor * ffn_shexp = build_ffn(cur,
                        model.layers[il].ffn_up_shexp,   NULL, NULL,
                        model.layers[il].ffn_gate_shexp, NULL, NULL,
                        model.layers[il].ffn_down_shexp, NULL, NULL,
                        NULL,
                        LLM_FFN_SILU, LLM_FFN_PAR, il);
                cb(ffn_shexp, "ffn_shexp", il);

                cur = ggml_add(ctx0, moe_out, ffn_shexp);
                cb(cur, "ffn_out", il);
            }

            cur = ggml_add(ctx0, cur, ffn_inp);

            cur = build_cvec(cur, il);
            cb(cur, "l_out", il);

            // input for next layer
            inpL = cur;
        }

        cur = inpL;

        cur = build_norm(cur,
                model.output_norm, NULL,
                LLM_NORM_RMS, -1);
        cb(cur, "result_norm", -1);
        res->t_embd = cur;

        // lm_head
        cur = build_lora_mm(model.output, cur);
        cb(cur, "result_output", -1);
        res->t_logits = cur;

        ggml_build_forward_expand(gf, cur);
    }
};
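// llm_build_dots1: per-head RMS norms on Q and K before RoPE; the first
// hparams.n_layer_dense_lead layers use a dense SwiGLU FFN, the remaining layers use a
// MoE FFN (expert-probability bias, configurable gating function) plus a shared expert.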
struct llm_build_dots1 : public llm_graph_context {
    llm_build_dots1(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
        const int64_t n_embd_head = hparams.n_embd_head_v;

        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
        GGML_ASSERT(n_embd_head == hparams.n_rot);

        ggml_tensor * cur;
        ggml_tensor * inpL;

        inpL = build_inp_embd(model.tok_embd);

        // inp_pos - contains the positions
        ggml_tensor * inp_pos = build_inp_pos();

        auto * inp_attn = build_attn_inp_kv();

        ggml_tensor * inp_out_ids = build_inp_out_ids();

        for (int il = 0; il < n_layer; ++il) {
            ggml_tensor * inpSA = inpL;

            // norm
            cur = build_norm(inpL,
                    model.layers[il].attn_norm, NULL,
                    LLM_NORM_RMS, il);
            cb(cur, "attn_norm", il);

            // self_attention
            {
                // compute Q and K and RoPE them
                ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
                cb(Qcur, "Qcur", il);

                ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
                cb(Kcur, "Kcur", il);

                ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
                cb(Vcur, "Vcur", il);

                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,    n_tokens);
                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
                Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);

                Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
                cb(Qcur, "Qcur_normed", il);

                Qcur = ggml_rope_ext(
                        ctx0, Qcur, inp_pos, nullptr,
                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                        ext_factor, attn_factor, beta_fast, beta_slow
                );

                Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
                cb(Kcur, "Kcur_normed", il);

                Kcur = ggml_rope_ext(
                        ctx0, Kcur, inp_pos, nullptr,
                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                        ext_factor, attn_factor, beta_fast, beta_slow
                );

                cb(Qcur, "Qcur", il);
                cb(Kcur, "Kcur", il);
                cb(Vcur, "Vcur", il);

                cur = build_attn(inp_attn,
                        model.layers[il].wo, model.layers[il].bo,
                        Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
            }

            if (il == n_layer - 1 && inp_out_ids) {
                cur   = ggml_get_rows(ctx0, cur,   inp_out_ids);
                inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
            }

            ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
            cb(ffn_inp, "ffn_inp", il);

            // MoE branch
            cur = build_norm(ffn_inp,
                    model.layers[il].ffn_norm, NULL,
                    LLM_NORM_RMS, il);
            cb(cur, "ffn_norm", il);

            if ((uint32_t) il < hparams.n_layer_dense_lead) {
                cur = build_ffn(cur,
                        model.layers[il].ffn_up,   NULL, NULL,
                        model.layers[il].ffn_gate, NULL, NULL,
                        model.layers[il].ffn_down, NULL, NULL,
                        NULL,
                        LLM_FFN_SILU, LLM_FFN_PAR, il);
                cb(cur, "ffn_out", il);
            } else {
                ggml_tensor * moe_out =
                    build_moe_ffn(cur,
                            model.layers[il].ffn_gate_inp,
                            model.layers[il].ffn_up_exps,
                            model.layers[il].ffn_gate_exps,
                            model.layers[il].ffn_down_exps,
                            model.layers[il].ffn_exp_probs_b,
                            n_expert, n_expert_used,
                            LLM_FFN_SILU, hparams.expert_weights_norm,
                            true, hparams.expert_weights_scale,
                            (llama_expert_gating_func_type) hparams.expert_gating_func,
                            il);
                cb(moe_out, "ffn_moe_out", il);

                {
                    ggml_tensor * ffn_shexp = build_ffn(cur,
                            model.layers[il].ffn_up_shexp,   NULL, NULL,
                            model.layers[il].ffn_gate_shexp, NULL, NULL,
                            model.layers[il].ffn_down_shexp, NULL, NULL,
                            NULL,
                            LLM_FFN_SILU, LLM_FFN_PAR, il);
                    cb(ffn_shexp, "ffn_shexp", il);

                    cur = ggml_add(ctx0, moe_out, ffn_shexp);
                    cb(cur, "ffn_out", il);
                }
            }

            cur = ggml_add(ctx0, cur, ffn_inp);

            cur = build_cvec(cur, il);
            cb(cur, "l_out", il);

            // input for next layer
            inpL = cur;
        }

        cur = inpL;

        cur = build_norm(cur,
                model.output_norm, NULL,
                LLM_NORM_RMS, -1);
        cb(cur, "result_norm", -1);
        res->t_embd = cur;

        // lm_head
        cur = build_lora_mm(model.output, cur);
        cb(cur, "result_output", -1);
        res->t_logits = cur;

        ggml_build_forward_expand(gf, cur);
    }
};
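// llm_build_ernie4_5: dense variant - plain RoPE attention with optional QKV biases and a
// SwiGLU FFN on every layer; output-row selection is built inside the loop on the last layer.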
struct llm_build_ernie4_5 : public llm_graph_context {
    llm_build_ernie4_5(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
        const int64_t n_embd_head = hparams.n_embd_head_v;

        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
        GGML_ASSERT(n_embd_head == hparams.n_rot);

        ggml_tensor * cur;
        ggml_tensor * inpL;

        inpL = build_inp_embd(model.tok_embd);

        // inp_pos - contains the positions
        ggml_tensor * inp_pos = build_inp_pos();

        auto * inp_attn = build_attn_inp_kv();

        for (int il = 0; il < n_layer; ++il) {
            ggml_tensor * inpSA = inpL;

            // norm
            {
                cur = build_norm(inpL,
                        model.layers[il].attn_norm, NULL,
                        LLM_NORM_RMS, il);
                cb(cur, "attn_norm", il);
            }

            // self-attention
            {
                ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
                cb(Qcur, "Qcur", il);
                if (model.layers[il].bq) {
                    Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
                    cb(Qcur, "Qcur", il);
                }

                ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
                cb(Kcur, "Kcur", il);
                if (model.layers[il].bk) {
                    Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
                    cb(Kcur, "Kcur", il);
                }

                ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
                cb(Vcur, "Vcur", il);
                if (model.layers[il].bv) {
                    Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
                    cb(Vcur, "Vcur", il);
                }

                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,    n_tokens);
                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
                Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);

                Qcur = ggml_rope_ext(
                        ctx0, Qcur, inp_pos, nullptr,
                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                        ext_factor, attn_factor, beta_fast, beta_slow
                );

                Kcur = ggml_rope_ext(
                        ctx0, Kcur, inp_pos, nullptr,
                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                        ext_factor, attn_factor, beta_fast, beta_slow
                );

                cb(Qcur, "Qcur", il);
                cb(Kcur, "Kcur", il);
                cb(Vcur, "Vcur", il);

                cur = build_attn(inp_attn,
                        model.layers[il].wo, NULL,
                        Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
            }

            if (il == n_layer - 1) {
                // skip computing output for unused tokens
                ggml_tensor * inp_out_ids = build_inp_out_ids();
                cur   = ggml_get_rows(ctx0, cur,   inp_out_ids);
                inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
            }

            ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
            cb(ffn_inp, "ffn_inp", il);

            // feed-forward network
            {
                cur = build_norm(ffn_inp,
                        model.layers[il].ffn_norm, NULL,
                        LLM_NORM_RMS, il);
                cb(cur, "ffn_norm", il);

                cur = build_ffn(cur,
                        model.layers[il].ffn_up,   NULL, NULL,
                        model.layers[il].ffn_gate, NULL, NULL,
                        model.layers[il].ffn_down, NULL, NULL,
                        NULL,
                        LLM_FFN_SILU, LLM_FFN_PAR, il);
                cb(cur, "ffn_out", il);
            }

            cur = ggml_add(ctx0, cur, ffn_inp);

            cur = build_cvec(cur, il);
            cb(cur, "l_out", il);

            // input for next layer
            inpL = cur;
        }

        cur = inpL;

        cur = build_norm(cur,
                model.output_norm, NULL,
                LLM_NORM_RMS, -1);
        cb(cur, "result_norm", -1);
        res->t_embd = cur;

        // lm_head
        cur = build_lora_mm(model.output, cur);
        cb(cur, "result_output", -1);
        res->t_logits = cur;

        ggml_build_forward_expand(gf, cur);
    }
};
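// llm_build_ernie4_5_moe: like the dense variant, but layers past n_layer_dense_lead that
// fall on an n_moe_layer_step boundary use a softmax-gated MoE FFN, optionally combined
// with a shared expert when n_ff_shexp > 0.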
struct llm_build_ernie4_5_moe : public llm_graph_context {
    llm_build_ernie4_5_moe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
        const int64_t n_embd_head = hparams.n_embd_head_v;

        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
        GGML_ASSERT(n_embd_head == hparams.n_rot);

        ggml_tensor * cur;
        ggml_tensor * inpL;

        inpL = build_inp_embd(model.tok_embd);

        // inp_pos - contains the positions
        ggml_tensor * inp_pos = build_inp_pos();

        auto * inp_attn = build_attn_inp_kv();

        ggml_tensor * inp_out_ids = build_inp_out_ids();

        GGML_ASSERT(hparams.n_moe_layer_step > 0 && "Ernie 4.5 MoE requires n_moe_layer_step > 0");

        for (int il = 0; il < n_layer; ++il) {
            ggml_tensor * inpSA = inpL;

            // norm
            {
                cur = build_norm(inpL,
                        model.layers[il].attn_norm, NULL,
                        LLM_NORM_RMS, il);
                cb(cur, "attn_norm", il);
            }

            // self-attention
            {
                // compute Q and K and RoPE them
                ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
                cb(Qcur, "Qcur", il);
                if (model.layers[il].bq) {
                    Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
                    cb(Qcur, "Qcur", il);
                }

                ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
                cb(Kcur, "Kcur", il);
                if (model.layers[il].bk) {
                    Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
                    cb(Kcur, "Kcur", il);
                }

                ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
                cb(Vcur, "Vcur", il);
                if (model.layers[il].bv) {
                    Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
                    cb(Vcur, "Vcur", il);
                }

                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,    n_tokens);
                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
                Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);

                Qcur = ggml_rope_ext(
                        ctx0, Qcur, inp_pos, nullptr,
                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                        ext_factor, attn_factor, beta_fast, beta_slow
                );

                Kcur = ggml_rope_ext(
                        ctx0, Kcur, inp_pos, nullptr,
                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                        ext_factor, attn_factor, beta_fast, beta_slow
                );

                cb(Qcur, "Qcur", il);
                cb(Kcur, "Kcur", il);
                cb(Vcur, "Vcur", il);

                cur = build_attn(inp_attn,
                        model.layers[il].wo, NULL,
                        Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
                cb(cur, "attn_out", il);
            }

            if (il == n_layer - 1 && inp_out_ids) {
                cur   = ggml_get_rows(ctx0, cur,   inp_out_ids);
                inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
            }

            ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
            cb(ffn_inp, "ffn_inp", il);

            // feed-forward network
            bool is_moe_layer = static_cast<uint32_t>(il) >= hparams.n_layer_dense_lead && (il + 1) % hparams.n_moe_layer_step == 0;

            if (!is_moe_layer) {
                cur = build_norm(ffn_inp,
                        model.layers[il].ffn_norm, NULL,
                        LLM_NORM_RMS, il);
                cb(cur, "ffn_norm", il);

                cur = build_ffn(cur,
                        model.layers[il].ffn_up,   NULL, NULL,
                        model.layers[il].ffn_gate, NULL, NULL,
                        model.layers[il].ffn_down, NULL, NULL,
                        NULL,
                        LLM_FFN_SILU, LLM_FFN_PAR, il);
                cb(cur, "ffn_out", il);
            } else {
                // MoE branch
                cur = build_norm(ffn_inp,
                        model.layers[il].ffn_norm, NULL,
                        LLM_NORM_RMS, il);
                cb(cur, "ffn_norm", il);

                ggml_tensor * moe_out = build_moe_ffn(cur,
                        model.layers[il].ffn_gate_inp,
                        model.layers[il].ffn_up_exps,
                        model.layers[il].ffn_gate_exps,
                        model.layers[il].ffn_down_exps,
                        model.layers[il].ffn_exp_probs_b,
                        n_expert, n_expert_used,
                        LLM_FFN_SILU, true,
                        false, 0.0,
                        LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
                        il);
                cb(moe_out, "ffn_moe_out", il);

                // Shared expert (if present)
                if (hparams.n_ff_shexp > 0) {
                    ggml_tensor * ffn_shexp = build_ffn(cur,
                            model.layers[il].ffn_up_shexp,   NULL, NULL,
                            model.layers[il].ffn_gate_shexp, NULL, NULL,
                            model.layers[il].ffn_down_shexp, NULL, NULL,
                            NULL,
                            LLM_FFN_SILU, LLM_FFN_PAR, il);
                    cb(ffn_shexp, "ffn_shexp", il);

                    cur = ggml_add(ctx0, moe_out, ffn_shexp);
                } else {
                    cur = moe_out;
                }
                cb(cur, "ffn_out", il);
            }

            cur = ggml_add(ctx0, cur, ffn_inp);
            cb(cur, "ffn_out", il);

            cur = build_cvec(cur, il);
            cb(cur, "l_out", il);

            // input for next layer
            inpL = cur;
        }

        cur = inpL;

        cur = build_norm(cur,
                model.output_norm, NULL,
                LLM_NORM_RMS, -1);
        cb(cur, "result_norm", -1);
        res->t_embd = cur;

        // lm_head
        cur = build_lora_mm(model.output, cur);
        cb(cur, "result_output", -1);
        res->t_logits = cur;

        ggml_build_forward_expand(gf, cur);
    }
};
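// llm_build_falcon_h1: hybrid layer - the same RMS-normed input feeds both a RoPE
// attention branch and a Mamba-2 SSM branch; their outputs are summed before the
// residual add and the (biased) SwiGLU FFN.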
struct llm_build_falcon_h1 : public llm_graph_context_mamba {
    llm_build_falcon_h1(const llama_model & model, const llm_graph_params & params) : llm_graph_context_mamba(params) {
        const int64_t n_embd_head = hparams.n_embd_head_v;

        ggml_tensor * cur;
        ggml_tensor * inpL;

        inpL = build_inp_embd(model.tok_embd);

        // inp_pos - contains the positions
        ggml_tensor * inp_pos = build_inp_pos();

        // Build the inputs in the recurrent & kv cache
        auto * inp = build_inp_mem_hybrid();

        const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;

        ggml_tensor * inp_out_ids = build_inp_out_ids();

        for (int il = 0; il < n_layer; ++il) {
            ggml_tensor * inpSA = inpL;

            cur = build_norm(inpL,
                    model.layers[il].attn_norm, NULL,
                    LLM_NORM_RMS, il);
            cb(cur, "attn_norm", il);

            // self-attention
            ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
            cb(Qcur, "Qcur", il);

            ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
            cb(Kcur, "Kcur", il);

            ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
            cb(Vcur, "Vcur", il);

            Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,    n_tokens);
            Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
            Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);

            Qcur = ggml_rope_ext(
                    ctx0, Qcur, inp_pos, nullptr,
                    n_rot, hparams.rope_type, n_ctx_orig, freq_base, freq_scale,
                    ext_factor, attn_factor, beta_fast, beta_slow);

            Kcur = ggml_rope_ext(
                    ctx0, Kcur, inp_pos, nullptr,
                    n_rot, hparams.rope_type, n_ctx_orig, freq_base, freq_scale,
                    ext_factor, attn_factor, beta_fast, beta_slow
            );

            cb(Qcur, "Qcur-post-rope", il);
            cb(Kcur, "Kcur-post-rope", il);
            cb(Vcur, "Vcur-post-rope", il);

            ggml_tensor * attn_out = build_attn(inp->get_attn(),
                    model.layers[il].wo, NULL,
                    Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
            cb(attn_out, "attn_out", il);

            cur = build_norm(inpL,
                    model.layers[il].attn_norm, NULL,
                    LLM_NORM_RMS, il);

            // Mamba2 layer
            cb(cur, "ssm_in", il);

            ggml_tensor * ssm_out = build_mamba2_layer(inp->get_recr(), cur, model, ubatch, il);
            cb(ssm_out, "ssm_out", il);

            // Aggregation
            cur = ggml_add(ctx0, attn_out, ssm_out);
            inpSA = ggml_add(ctx0, cur, inpSA);
            cb(cur, "layer_out", il);

            if (il == n_layer - 1 && inp_out_ids) {
                cur   = ggml_get_rows(ctx0, cur,   inp_out_ids);
                inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
            }

            ggml_tensor * ffn_inp = inpSA;
            cb(ffn_inp, "ffn_inp", il);

            // feed-forward network
            cur = build_norm(ffn_inp,
                    model.layers[il].ffn_norm, NULL,
                    LLM_NORM_RMS, il);
            cb(cur, "ffn_norm", il);

            cur = build_ffn(cur,
                    model.layers[il].ffn_up,   model.layers[il].ffn_up_b,   NULL,
                    model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
                    model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
                    NULL,
                    LLM_FFN_SILU, LLM_FFN_PAR, il);
            cb(cur, "ffn_out", il);

            cur = ggml_add(ctx0, cur, inpSA);

            cur = build_cvec(cur, il);
            cb(cur, "l_out", il);

            // input for next layer
            inpL = cur;
        }

        cur = inpL;

        cur = build_norm(cur,
                model.output_norm, NULL,
                LLM_NORM_RMS, -1);
        cb(cur, "result_norm", -1);
        res->t_embd = cur;

        // lm_head
        cur = build_lora_mm(model.output, cur);
        cb(cur, "result_output", -1);
        res->t_logits = cur;

        ggml_build_forward_expand(gf, cur);
    }
};
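// llm_build_plamo2: hybrid model where each layer is either a Mamba-style SSM mixer or an
// attention mixer, selected via hparams.is_recurrent(il), each wrapped in pre/post RMS norms.
// The two mixer types are built by the private helpers defined after the constructor.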
struct llm_build_plamo2 : public llm_graph_context_mamba {
    llm_build_plamo2(const llama_model & model, const llm_graph_params & params) : llm_graph_context_mamba(params) {
        ggml_tensor * cur;
        ggml_tensor * inpL;

        // {n_embd, n_tokens}
        inpL = build_inp_embd(model.tok_embd);
        cb(inpL, "embedding_output", -1);

        ggml_tensor * inp_pos = build_inp_pos();

        auto * inp_hybrid = build_inp_mem_hybrid();

        ggml_tensor * inp_out_ids = build_inp_out_ids();

        for (int il = 0; il < n_layer; ++il) {
            ggml_tensor * residual = inpL;

            // ggml_graph_add_node(gf, model.layers[il].attn_norm);
            // cb(model.layers[il].attn_norm, "attn_norm", il);

            // pre_mixer_norm
            cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);

            // check if this layer is Mamba or Attention
            bool is_mamba_layer = hparams.is_recurrent(il);

            if (is_mamba_layer) {
                // PLaMo-2 Mamba layer
                cur = build_plamo2_mamba_layer(inp_hybrid->get_recr(), cur, model, ubatch, il);
            } else {
                // PLaMo-2 Attention layer
                cur = build_plamo2_attn_layer(inp_hybrid->get_attn(), inp_pos, cur, model, il);
            }

            // post_mixer_norm
            cur = build_norm(cur, model.layers[il].attn_post_norm, NULL, LLM_NORM_RMS, il);
            cb(cur, "attn_post_norm", il);

            // residual connection
            cur = ggml_add(ctx0, cur, residual);
            cb(cur, "attn_residual", il);
            residual = cur;

            // pre-ffn norm
            cur = build_norm(cur, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il);
            cb(cur, "ffn_pre_norm", il);

            // feed-forward network
            cur = build_ffn(cur,
                    model.layers[il].ffn_up, NULL, NULL,
                    NULL, NULL, NULL,
                    model.layers[il].ffn_down, NULL, NULL,
                    NULL,
                    LLM_FFN_SWIGLU, LLM_FFN_SEQ, il);
            cb(cur, "ffn_out", il);

            // post ffn norm
            cur = build_norm(cur, model.layers[il].ffn_post_norm, NULL, LLM_NORM_RMS, il);
            cb(cur, "ffn_post_norm", il);

            if (il == n_layer - 1 && inp_out_ids) {
                cur      = ggml_get_rows(ctx0, cur,      inp_out_ids);
                residual = ggml_get_rows(ctx0, residual, inp_out_ids);
            }

            // residual connection
            cur = ggml_add(ctx0, cur, residual);
            cb(cur, "ffn_residual", il);

            inpL = cur;
        }

        cur = inpL;

        // final norm
        cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
        cb(cur, "result_norm", -1);

        // lm_head
        cur = build_lora_mm(model.output, cur);
        cb(cur, "result_output", -1);

        // Explicitly mark as output tensor to ensure proper backend assignment
        ggml_set_output(cur);

        res->t_logits = cur;

        ggml_build_forward_expand(gf, cur);
    }

private:
    ggml_tensor * build_plamo2_attn_layer(
            llm_graph_input_attn_kv * inp,
            ggml_tensor * inp_pos,
            ggml_tensor * cur,
            const llama_model & model,
            int il) {

        // self-attention
        {
            // PLaMo-2 uses combined QKV tensor
            ggml_tensor * qkv = build_lora_mm(model.layers[il].wqkv, cur);
            cb(qkv, "wqkv", il);

            // split QKV tensor into Q, K, V
            const int64_t n_embd_head_q = hparams.n_embd_head_k;
            const int64_t n_embd_head_k = hparams.n_embd_head_k;
            const int64_t n_embd_head_v = hparams.n_embd_head_v;
            int32_t n_head_kv = hparams.n_head_kv(il);

            const int64_t q_offset = 0;
            const int64_t k_offset = n_embd_head_q * n_head;
            const int64_t v_offset = k_offset + n_embd_head_k * n_head_kv;

            ggml_tensor * Qcur = ggml_view_3d(ctx0, qkv, n_embd_head_q, n_head,    n_tokens, n_embd_head_q * sizeof(float), qkv->nb[1], q_offset * ggml_element_size(qkv));
            ggml_tensor * Kcur = ggml_view_3d(ctx0, qkv, n_embd_head_k, n_head_kv, n_tokens, n_embd_head_k * sizeof(float), qkv->nb[1], k_offset * ggml_element_size(qkv));
            ggml_tensor * Vcur = ggml_view_3d(ctx0, qkv, n_embd_head_v, n_head_kv, n_tokens, n_embd_head_v * sizeof(float), qkv->nb[1], v_offset * ggml_element_size(qkv));

            cb(Qcur, "Qcur", il);
            cb(Kcur, "Kcur", il);
            cb(Vcur, "Vcur", il);

            Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
            cb(Qcur, "Qcur_normed", il);

            Qcur = ggml_rope_ext(
                    ctx0, Qcur, inp_pos, nullptr,
                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                    ext_factor, attn_factor, beta_fast, beta_slow
            );

            Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
            cb(Kcur, "Kcur_normed", il);

            Kcur = ggml_rope_ext(
                    ctx0, Kcur, inp_pos, nullptr,
                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                    ext_factor, attn_factor, beta_fast, beta_slow
            );

            cur = build_attn(inp,
                    model.layers[il].wo, NULL,
                    Qcur, Kcur, Vcur, NULL, NULL, NULL, 1.0f/sqrtf(float(n_embd_head_v)), il);
        }

        cb(cur, "attn_out", il);

        return cur;
    }

    ggml_tensor * build_plamo2_mamba_layer(
            llm_graph_input_rs * inp,
            ggml_tensor * cur,
            const llama_model & model,
            const llama_ubatch & ubatch,
            int il) {

        const auto * mctx_cur = inp->mctx;

        const auto kv_head = mctx_cur->get_head();

        const int64_t d_conv   = hparams.ssm_d_conv;
        const int64_t d_inner  = hparams.ssm_d_inner;
        const int64_t d_state  = hparams.ssm_d_state;
        const int64_t n_heads  = hparams.ssm_dt_rank;
        const int64_t head_dim = d_inner / n_heads;
        const int64_t n_group  = hparams.ssm_n_group;
        const int64_t n_seqs   = ubatch.n_seqs;

        const int64_t n_seq_tokens = ubatch.n_seq_tokens;

        GGML_ASSERT(n_seqs != 0);
        GGML_ASSERT(ubatch.equal_seqs());
        GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs);

        ggml_tensor * conv_states_all = mctx_cur->get_r_l(il);
        ggml_tensor * ssm_states_all  = mctx_cur->get_s_l(il);

        ggml_tensor * conv = build_rs(inp, conv_states_all, hparams.n_embd_r(), n_seqs);
        conv = ggml_reshape_3d(ctx0, conv, d_conv - 1, d_inner + 2*n_group*d_state, n_seqs);

        // {n_embd, n_tokens} => {n_embd, n_seq_tokens, n_seqs}
        cur = ggml_reshape_3d(ctx0, cur, cur->ne[0], n_seq_tokens, n_seqs);

        // in_proj: {n_embd, 2*d_inner} @ {n_embd, n_seq_tokens, n_seqs} => {2*d_inner, n_seq_tokens, n_seqs}
        ggml_tensor * zx = build_lora_mm(model.layers[il].ssm_in, cur);
        cb(zx, "mamba_in_proj", il);
        // {8192, 5, 1, 1} -> {8192, 1, 5, 1}
        zx = ggml_permute(ctx0, zx, 0, 2, 1, 3);
        zx = ggml_cont_4d(ctx0, zx, head_dim * 2, n_heads, n_seq_tokens, n_seqs);
        cb(zx, "mamba_in_proj_out", il);

        // split into z and x
        // => {head_dim * n_heads, n_seq_tokens, n_seqs}
        ggml_tensor * x = ggml_view_4d(ctx0, zx, head_dim, n_heads, n_seq_tokens, n_seqs, zx->nb[1], zx->nb[2], zx->nb[3], head_dim*ggml_element_size(zx));
        x = ggml_cont_3d(ctx0, x, head_dim * n_heads, n_seq_tokens, n_seqs);
        // x = ggml_permute(ctx0, x, 0, 2, 1, 3);
        cb(x, "mamba_x_split", il);

        ggml_tensor * z = ggml_view_4d(ctx0, zx, head_dim, n_heads, n_seq_tokens, n_seqs, zx->nb[1], zx->nb[2], zx->nb[3], 0);
        cb(z, "mamba_z_split", il);

        // conv1d
        {
            // => {d_conv - 1 + n_seq_tokens, d_inner, n_seqs}
            ggml_tensor * conv_x = ggml_concat(ctx0, conv, ggml_transpose(ctx0, x), 0);
            cb(conv_x, "mamba_conv1d_input", il);

            // copy last (d_conv - 1) columns back into the state cache
            ggml_tensor * last_conv = ggml_view_3d(ctx0, conv_x, d_conv - 1, d_inner, n_seqs,
                    conv_x->nb[1], conv_x->nb[2], n_seq_tokens*(conv_x->nb[0]));

            ggml_build_forward_expand(gf,
                    ggml_cpy(ctx0, last_conv,
                            ggml_view_1d(ctx0, conv_states_all,
                                    (d_conv - 1)*(d_inner + 2*n_group*d_state)*(n_seqs),
                                    kv_head*(d_conv - 1)*(d_inner + 2*n_group*d_state)*ggml_element_size(conv_states_all))));
            cb(conv_states_all, "mamba_conv1d_state", il);

            // 1D convolution
            x = ggml_ssm_conv(ctx0, conv_x, model.layers[il].ssm_conv1d);
            cb(x, "mamba_conv1d", il);

            x = ggml_silu(ctx0, x);
            cb(x, "mamba_conv1d_silu", il);
        }

        // SSM
        {
            // bcdt_proj: {d_inner, dt_rank + 2*d_state} @ {d_inner, n_seq_tokens, n_seqs} => {dt_rank + 2*d_state, n_seq_tokens, n_seqs}
            ggml_tensor * x_bcdt = build_lora_mm(model.layers[il].ssm_x, x);
            cb(x_bcdt, "mamba_bcdt_proj", il);

            // split into dt, B, C
            const int64_t dt_dim = std::max(64, int(hparams.n_embd / 16));
            ggml_tensor * B  = ggml_view_3d(ctx0, x_bcdt, d_state, n_seq_tokens, n_seqs, x_bcdt->nb[1], x_bcdt->nb[2], 0);
            ggml_tensor * C  = ggml_view_3d(ctx0, x_bcdt, d_state, n_seq_tokens, n_seqs, x_bcdt->nb[1], x_bcdt->nb[2], ggml_element_size(x_bcdt)*d_state);
            ggml_tensor * dt = ggml_view_3d(ctx0, x_bcdt, dt_dim,  n_seq_tokens, n_seqs, x_bcdt->nb[1], x_bcdt->nb[2], ggml_element_size(x_bcdt)*(2*d_state));
            cb(B, "mamba_B_raw", il);
            cb(C, "mamba_C_raw", il);
            cb(dt, "mamba_dt_raw", il);

            // Apply RMS norm to dt, B, C (PLaMo-2 specific)
            B  = build_norm(B,  model.layers[il].ssm_b_norm,  NULL, LLM_NORM_RMS, il);
            C  = build_norm(C,  model.layers[il].ssm_c_norm,  NULL, LLM_NORM_RMS, il);
            dt = build_norm(dt, model.layers[il].ssm_dt_norm, NULL, LLM_NORM_RMS, il);
            cb(B, "mamba_B_normed", il);
            cb(C, "mamba_C_normed", il);
            cb(dt, "mamba_dt_normed", il);

            // dt_proj: {dt_rank, d_inner} @ {dt_rank, n_seq_tokens, n_seqs} => {d_inner, n_seq_tokens, n_seqs}
            dt = build_lora_mm(model.layers[il].ssm_dt, dt);
            dt = ggml_add(ctx0, dt, model.layers[il].ssm_dt_b);
            cb(dt, "mamba_dt_proj", il);

            ggml_tensor * A = ggml_reshape_2d(ctx0, model.layers[il].ssm_a, 1, n_heads);
            cb(A, "mamba_A", il);

            x = ggml_view_4d(ctx0, x, head_dim, n_heads, n_seq_tokens, n_seqs, head_dim * ggml_element_size(x), head_dim * n_heads * ggml_element_size(x), head_dim * n_heads * n_seq_tokens * ggml_element_size(x), 0);
            B = ggml_view_4d(ctx0, B, d_state, 1, n_seq_tokens, n_seqs, d_state * B->nb[0], B->nb[1], B->nb[2], 0);
            C = ggml_view_4d(ctx0, C, d_state, 1, n_seq_tokens, n_seqs, d_state * C->nb[0], C->nb[1], C->nb[2], 0);

            // use the states and the indices provided by build_recurrent_state
            // (this is necessary in order to properly use the states before they are overwritten,
            //  while avoiding to make unnecessary copies of the states)
            auto get_ssm_rows = [&](ggml_context * ctx, ggml_tensor * states, ggml_tensor * ids) {
                ggml_tensor * ssm = ggml_reshape_4d(ctx, states, d_state, head_dim, n_heads, mctx_cur->get_size());

                // Custom operator to optimize the parallel associative scan
                // as described in the Annex D of the Mamba paper.
                // => {d_inner, n_seq_tokens, n_seqs} and {d_state, d_inner, n_seqs}
                return ggml_ssm_scan(ctx, ssm, x, dt, A, B, C, ids);
            };

            ggml_tensor * y_ssm = build_rs(inp, ssm_states_all, hparams.n_embd_s(), ubatch.n_seqs, get_ssm_rows);
            cb(y_ssm, "mamba_ssm_scan", il);

            // store last states
            ggml_build_forward_expand(gf,
                    ggml_cpy(ctx0,
                            ggml_view_1d(ctx0, y_ssm, n_heads*head_dim*d_state*n_seqs, n_heads*head_dim*n_seq_tokens*n_seqs*ggml_element_size(y_ssm)),
                            ggml_view_1d(ctx0, ssm_states_all, n_heads*head_dim*d_state*n_seqs, kv_head*n_seqs*n_heads*head_dim*d_state*ggml_element_size(ssm_states_all))));
            cb(ssm_states_all, "mamba_ssm_states", il);

            ggml_tensor * y = ggml_view_4d(ctx0, y_ssm, head_dim, n_heads, n_seq_tokens, n_seqs, head_dim * ggml_element_size(x), head_dim * n_heads * ggml_element_size(x), head_dim * n_heads * n_seq_tokens * ggml_element_size(x), 0);
            cb(y, "mamba_y_view", il);

            // Add D parameter and apply gating with z
            // {d_inner, n_seq_tokens, n_seqs} * {d_inner} => {d_inner, n_seq_tokens, n_seqs}
            ggml_tensor * D = ggml_reshape_2d(ctx0, model.layers[il].ssm_d, 1, n_heads);
            y = ggml_add(ctx0, y, ggml_mul(ctx0, x, D));
            cb(y, "mamba_y_add_d", il);

            y = ggml_swiglu_split(ctx0, ggml_cont(ctx0, z), y);
            cb(y, "mamba_y_swiglu_z", il);

            // out_proj: {d_inner, n_embd} @ {d_inner, n_seq_tokens, n_seqs} => {n_embd, n_seq_tokens, n_seqs}
            y = ggml_view_3d(ctx0, y, head_dim * n_heads, n_seq_tokens, n_seqs, y->nb[2], y->nb[3], 0);
            cur = build_lora_mm(model.layers[il].ssm_out, y);
            cb(cur, "mamba_out_proj", il);
        }

        // {n_embd, n_seq_tokens, n_seqs} => {n_embd, n_tokens}
        cur = ggml_reshape_2d(ctx0, cur, cur->ne[0], n_seq_tokens * n_seqs);
        cb(cur, "mamba_out", il);

        return cur;
    }
};
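// llm_build_arcee: llama-style RoPE attention (with optional rope freq factors) and a
// two-matrix squared-ReLU FFN (LLM_FFN_RELU_SQR) in place of a gated SwiGLU block.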
struct llm_build_arcee : public llm_graph_context {
    llm_build_arcee(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
        const int64_t n_embd_head = hparams.n_embd_head_v;

        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
        GGML_ASSERT(n_embd_head == hparams.n_rot);

        ggml_tensor * cur;
        ggml_tensor * inpL;

        inpL = build_inp_embd(model.tok_embd);

        // inp_pos - contains the positions
        ggml_tensor * inp_pos = build_inp_pos();

        auto * inp_attn = build_attn_inp_kv();

        const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;

        ggml_tensor * inp_out_ids = build_inp_out_ids();

        for (int il = 0; il < n_layer; ++il) {
            ggml_tensor * inpSA = inpL;

            // norm
            cur = build_norm(inpL,
                    model.layers[il].attn_norm, NULL,
                    LLM_NORM_RMS, il);
            cb(cur, "attn_norm", il);

            // self-attention
            {
                // rope freq factors for llama3; may return nullptr for llama2 and other models
                ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);

                // compute Q and K and RoPE them
                ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
                cb(Qcur, "Qcur", il);
                if (model.layers[il].bq) {
                    Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
                    cb(Qcur, "Qcur", il);
                }

                ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
                cb(Kcur, "Kcur", il);
                if (model.layers[il].bk) {
                    Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
                    cb(Kcur, "Kcur", il);
                }

                ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
                cb(Vcur, "Vcur", il);
                if (model.layers[il].bv) {
                    Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
                    cb(Vcur, "Vcur", il);
                }

                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,    n_tokens);
                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
                Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);

                Qcur = ggml_rope_ext(
                        ctx0, Qcur, inp_pos, rope_factors,
                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                        ext_factor, attn_factor, beta_fast, beta_slow
                );

                Kcur = ggml_rope_ext(
                        ctx0, Kcur, inp_pos, rope_factors,
                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                        ext_factor, attn_factor, beta_fast, beta_slow
                );

                cb(Qcur, "Qcur", il);
                cb(Kcur, "Kcur", il);
                cb(Vcur, "Vcur", il);

                cur = build_attn(inp_attn,
                        model.layers[il].wo, model.layers[il].bo,
                        Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
                cb(cur, "attn_out", il);
            }

            if (il == n_layer - 1 && inp_out_ids) {
                cur   = ggml_get_rows(ctx0, cur,   inp_out_ids);
                inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
            }

            ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
            cb(ffn_inp, "ffn_inp", il);

            // feed-forward network
            // ARCEE uses relu^2 instead of silu
            cur = build_norm(ffn_inp,
                    model.layers[il].ffn_norm, NULL,
                    LLM_NORM_RMS, il);
            cb(cur, "ffn_norm", il);

            cur = build_ffn(cur,
                    model.layers[il].ffn_up,   NULL, NULL,
                    NULL,                      NULL, NULL,
                    model.layers[il].ffn_down, NULL, NULL,
                    NULL,
                    LLM_FFN_RELU_SQR, LLM_FFN_SEQ, il);
            cb(cur, "ffn_out", il);

            cur = ggml_add(ctx0, cur, ffn_inp);
            cb(cur, "ffn_out", il);

            cur = build_cvec(cur, il);
            cb(cur, "l_out", il);

            // input for next layer
            inpL = cur;
        }

        cur = inpL;

        cur = build_norm(cur,
                model.output_norm, NULL,
                LLM_NORM_RMS, -1);
        cb(cur, "result_norm", -1);
        res->t_embd = cur;

        // lm_head
        cur = build_lora_mm(model.output, cur);
        cb(cur, "result_output", -1);
        res->t_logits = cur;

        ggml_build_forward_expand(gf, cur);
    }
};
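// llm_build_hunyuan_moe: attention with per-head K/Q RMS norms applied after RoPE; every
// layer's FFN sums a dense shared-expert SwiGLU branch with a softmax-gated MoE branch.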
struct llm_build_hunyuan_moe : public llm_graph_context {
    llm_build_hunyuan_moe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
        const int64_t n_embd_head = hparams.n_embd_head_v;

        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
        GGML_ASSERT(n_embd_head == hparams.n_rot);

        ggml_tensor * cur;
        ggml_tensor * inpL;

        inpL = build_inp_embd(model.tok_embd);

        // inp_pos - contains the positions
        ggml_tensor * inp_pos = build_inp_pos();

        auto * inp_attn = build_attn_inp_kv();

        const float kq_scale = 1.0f / sqrtf(float(n_embd_head));

        ggml_tensor * inp_out_ids = build_inp_out_ids();

        for (int il = 0; il < n_layer; ++il) {
            ggml_tensor * inpSA = inpL;

            // norm
            cur = build_norm(inpL,
                    model.layers[il].attn_norm, NULL,
                    LLM_NORM_RMS, il);
            cb(cur, "attn_norm", il);

            // self-attention
            {
                // rope freq factors for llama3; may return nullptr for llama2 and other models
                ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);

                // compute Q and K and RoPE them
                ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
                cb(Qcur, "Qcur", il);
                if (model.layers[il].bq) {
                    Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
                    cb(Qcur, "Qcur", il);
                }

                ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
                cb(Kcur, "Kcur", il);
                if (model.layers[il].bk) {
                    Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
                    cb(Kcur, "Kcur", il);
                }

                ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
                cb(Vcur, "Vcur", il);
                if (model.layers[il].bv) {
                    Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
                    cb(Vcur, "Vcur", il);
                }

                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,    n_tokens);
                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
                Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);

                Qcur = ggml_rope_ext(
                        ctx0, Qcur, inp_pos, rope_factors,
                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                        ext_factor, attn_factor, beta_fast, beta_slow
                );

                cb(Qcur, "Qcur", il);
                cb(Kcur, "Kcur", il);
                cb(Vcur, "Vcur", il);

                Kcur = ggml_rope_ext(
                        ctx0, Kcur, inp_pos, rope_factors,
                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                        ext_factor, attn_factor, beta_fast, beta_slow
                );

                Kcur = build_norm(Kcur,
                        model.layers[il].attn_k_norm, nullptr,
                        LLM_NORM_RMS, il);
                cb(Kcur, "Kcur_norm", il);

                Qcur = build_norm(Qcur,
                        model.layers[il].attn_q_norm, nullptr,
                        LLM_NORM_RMS, il);
                cb(Qcur, "Qcur_norm", il);

                cur = build_attn(inp_attn,
                        model.layers[il].wo, model.layers[il].bo,
                        Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
                cb(cur, "attn_out", il);
            }

            if (il == n_layer - 1 && inp_out_ids) {
                cur   = ggml_get_rows(ctx0, cur,   inp_out_ids);
                inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
            }

            ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
            cb(ffn_inp, "ffn_inp", il);

            cur = build_norm(ffn_inp,
                    model.layers[il].ffn_norm, NULL,
                    LLM_NORM_RMS, il);
            cb(cur, "ffn_norm", il);

            // feed-forward network (non-MoE)
            ggml_tensor * cur_mlp = build_ffn(cur,
                    model.layers[il].ffn_up_shexp,   NULL, NULL,
                    model.layers[il].ffn_gate_shexp, NULL, NULL,
                    model.layers[il].ffn_down_shexp, NULL, NULL,
                    NULL,
                    LLM_FFN_SILU, LLM_FFN_PAR, il);
            cb(cur_mlp, "ffn_mlp", il);

            // MoE branch
            ggml_tensor * cur_moe = build_moe_ffn(cur,
                    model.layers[il].ffn_gate_inp,
                    model.layers[il].ffn_up_exps,
                    model.layers[il].ffn_gate_exps,
                    model.layers[il].ffn_down_exps,
                    nullptr,
                    n_expert, n_expert_used,
                    LLM_FFN_SILU,
                    true, // norm_topk_prob
                    false,
                    0.0,
                    LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
                    il);
            cb(cur_moe, "ffn_moe_out", il);

            ggml_tensor * ffn_out = ggml_add(ctx0, cur_moe, cur_mlp);
            cb(ffn_out, "ffn_out", il);

            cur = ggml_add(ctx0, ffn_out, ffn_inp);

            cur = build_cvec(cur, il);
            cb(cur, "l_out", il);

            // input for next layer
            inpL = cur;
        }

        cur = inpL;

        cur = build_norm(cur,
                model.output_norm, NULL,
                LLM_NORM_RMS, -1);
        cb(cur, "result_norm", -1);
        res->t_embd = cur;

        // lm_head
        cur = build_lora_mm(model.output, cur);
        cb(cur, "result_output", -1);
        res->t_logits = cur;

        ggml_build_forward_expand(gf, cur);
    }
};
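// llm_build_hunyuan_dense: same attention pattern as the MoE variant (post-RoPE K/Q norms),
// but with a single dense SwiGLU FFN per layer.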
struct llm_build_hunyuan_dense : public llm_graph_context {
    llm_build_hunyuan_dense(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
        const int64_t n_embd_head = hparams.n_embd_head_v;

        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
        GGML_ASSERT(n_embd_head == hparams.n_rot);

        ggml_tensor * cur;
        ggml_tensor * inpL;

        inpL = build_inp_embd(model.tok_embd);

        // inp_pos - contains the positions
        ggml_tensor * inp_pos = build_inp_pos();

        auto * inp_attn = build_attn_inp_kv();

        const float kq_scale = 1.0f / sqrtf(float(n_embd_head));

        ggml_tensor * inp_out_ids = build_inp_out_ids();

        for (int il = 0; il < n_layer; ++il) {
            ggml_tensor * inpSA = inpL;

            // norm
            cur = build_norm(inpL,
                    model.layers[il].attn_norm, NULL,
                    LLM_NORM_RMS, il);
            cb(cur, "attn_norm", il);

            // self-attention
            {
                // rope freq factors for llama3; may return nullptr for llama2 and other models
                ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);

                // compute Q and K and RoPE them
                ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
                cb(Qcur, "Qcur", il);
                if (model.layers[il].bq) {
                    Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
                    cb(Qcur, "Qcur", il);
                }

                ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
                cb(Kcur, "Kcur", il);
                if (model.layers[il].bk) {
                    Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
                    cb(Kcur, "Kcur", il);
                }

                ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
                cb(Vcur, "Vcur", il);
                if (model.layers[il].bv) {
                    Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
                    cb(Vcur, "Vcur", il);
                }

                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,    n_tokens);
                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
                Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);

                Qcur = ggml_rope_ext(
                        ctx0, Qcur, inp_pos, rope_factors,
                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                        ext_factor, attn_factor, beta_fast, beta_slow
                );

                cb(Qcur, "Qcur", il);
                cb(Kcur, "Kcur", il);
                cb(Vcur, "Vcur", il);

                Kcur = ggml_rope_ext(
                        ctx0, Kcur, inp_pos, rope_factors,
                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                        ext_factor, attn_factor, beta_fast, beta_slow
                );

                Kcur = build_norm(Kcur,
                        model.layers[il].attn_k_norm, nullptr,
                        LLM_NORM_RMS, il);
                cb(Kcur, "Kcur_norm", il);

                Qcur = build_norm(Qcur,
                        model.layers[il].attn_q_norm, nullptr,
                        LLM_NORM_RMS, il);
                cb(Qcur, "Qcur_norm", il);

                cur = build_attn(inp_attn,
                        model.layers[il].wo, model.layers[il].bo,
                        Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
                cb(cur, "attn_out", il);
            }

            if (il == n_layer - 1 && inp_out_ids) {
                cur   = ggml_get_rows(ctx0, cur,   inp_out_ids);
                inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
            }

            ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
            cb(ffn_inp, "ffn_inp", il);

            cur = build_norm(ffn_inp,
                    model.layers[il].ffn_norm, NULL,
                    LLM_NORM_RMS, il);
            cb(cur, "ffn_norm", il);

            // feed-forward network (non-MoE)
            ggml_tensor * cur_mlp = build_ffn(cur,
                    model.layers[il].ffn_up,   NULL, NULL,
                    model.layers[il].ffn_gate, NULL, NULL,
                    model.layers[il].ffn_down, NULL, NULL,
                    NULL,
                    LLM_FFN_SILU, LLM_FFN_PAR, il);
            cb(cur_mlp, "ffn_out", il);

            cur = ggml_add(ctx0, cur_mlp, ffn_inp);

            cur = build_cvec(cur, il);
            cb(cur, "l_out", il);

            // input for next layer
            inpL = cur;
        }

        cur = inpL;

        cur = build_norm(cur,
                model.output_norm, NULL,
                LLM_NORM_RMS, -1);
        cb(cur, "result_norm", -1);
        res->t_embd = cur;

        // lm_head
        cur = build_lora_mm(model.output, cur);
        cb(cur, "result_output", -1);
        res->t_logits = cur;

        ggml_build_forward_expand(gf, cur);
    }
};
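// llm_build_smollm3: llama-style layers, except RoPE is skipped on every
// hparams.n_no_rope_layer_step-th layer (presumably the model's NoPE layers).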
  14473. struct llm_build_smollm3 : public llm_graph_context {
  14474. llm_build_smollm3(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  14475. const int64_t n_embd_head = hparams.n_embd_head_v;
  14476. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  14477. GGML_ASSERT(n_embd_head == hparams.n_rot);
  14478. ggml_tensor * cur;
  14479. ggml_tensor * inpL;
  14480. inpL = build_inp_embd(model.tok_embd);
  14481. // inp_pos - contains the positions
  14482. ggml_tensor * inp_pos = build_inp_pos();
  14483. auto * inp_attn = build_attn_inp_kv();
  14484. const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
  14485. ggml_tensor * inp_out_ids = build_inp_out_ids();
  14486. for (int il = 0; il < n_layer; ++il) {
  14487. ggml_tensor * inpSA = inpL;
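// RoPE is skipped on every n_no_rope_layer_step-th layer; those layers run without positional encoding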
  14488. const bool use_rope = (il + 1) % hparams.n_no_rope_layer_step != 0;
  14489. // norm
  14490. cur = build_norm(inpL,
  14491. model.layers[il].attn_norm, NULL,
  14492. LLM_NORM_RMS, il);
  14493. cb(cur, "attn_norm", il);
  14494. // self-attention
  14495. {
  14496. // compute Q and K and RoPE them
  14497. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  14498. cb(Qcur, "Qcur", il);
  14499. if (model.layers[il].bq) {
  14500. Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
  14501. cb(Qcur, "Qcur", il);
  14502. }
  14503. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  14504. cb(Kcur, "Kcur", il);
  14505. if (model.layers[il].bk) {
  14506. Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
  14507. cb(Kcur, "Kcur", il);
  14508. }
  14509. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  14510. cb(Vcur, "Vcur", il);
  14511. if (model.layers[il].bv) {
  14512. Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
  14513. cb(Vcur, "Vcur", il);
  14514. }
  14515. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  14516. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  14517. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  14518. if (use_rope) {
  14519. Qcur = ggml_rope_ext(
  14520. ctx0, Qcur, inp_pos, nullptr,
  14521. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  14522. ext_factor, attn_factor, beta_fast, beta_slow
  14523. );
  14524. Kcur = ggml_rope_ext(
  14525. ctx0, Kcur, inp_pos, nullptr,
  14526. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  14527. ext_factor, attn_factor, beta_fast, beta_slow
  14528. );
  14529. }
  14530. cb(Qcur, "Qcur", il);
  14531. cb(Kcur, "Kcur", il);
  14532. cb(Vcur, "Vcur", il);
  14533. cur = build_attn(inp_attn,
  14534. model.layers[il].wo, model.layers[il].bo,
  14535. Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
  14536. cb(cur, "attn_out", il);
  14537. }
  14538. if (il == n_layer - 1 && inp_out_ids) {
  14539. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  14540. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  14541. }
  14542. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  14543. cb(ffn_inp, "ffn_inp", il);
  14544. // feed-forward network
  14545. {
  14546. cur = build_norm(ffn_inp,
  14547. model.layers[il].ffn_norm, NULL,
  14548. LLM_NORM_RMS, il);
  14549. cb(cur, "ffn_norm", il);
  14550. cur = build_ffn(cur,
  14551. model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
  14552. model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
  14553. model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
  14554. NULL,
  14555. LLM_FFN_SILU, LLM_FFN_PAR, il);
  14556. cb(cur, "ffn_out", il);
  14557. }
  14558. cur = ggml_add(ctx0, cur, ffn_inp);
  14559. cb(cur, "ffn_out", il);
  14560. cur = build_cvec(cur, il);
  14561. cb(cur, "l_out", il);
  14562. // input for next layer
  14563. inpL = cur;
  14564. }
  14565. cur = inpL;
  14566. cur = build_norm(cur,
  14567. model.output_norm, NULL,
  14568. LLM_NORM_RMS, -1);
  14569. cb(cur, "result_norm", -1);
  14570. res->t_embd = cur;
  14571. // lm_head
  14572. cur = build_lora_mm(model.output, cur);
  14573. cb(cur, "result_output", -1);
  14574. res->t_logits = cur;
  14575. ggml_build_forward_expand(gf, cur);
  14576. }
  14577. };
  14578. struct llm_build_openai_moe_iswa : public llm_graph_context {
  14579. llm_build_openai_moe_iswa(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  14580. ggml_tensor * cur;
  14581. ggml_tensor * inpL;
  14582. inpL = build_inp_embd(model.tok_embd);
  14583. // inp_pos - contains the positions
  14584. ggml_tensor * inp_pos = build_inp_pos();
  14585. auto * inp_attn = build_attn_inp_kv_iswa();
  14586. for (int il = 0; il < n_layer; ++il) {
  14587. ggml_tensor * inpSA = inpL;
  14588. // norm
  14589. cur = build_norm(inpL,
  14590. model.layers[il].attn_norm, nullptr,
  14591. LLM_NORM_RMS, il);
  14592. cb(cur, "attn_norm", il);
  14593. // self-attention
  14594. {
  14595. // compute Q and K and RoPE them
  14596. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  14597. cb(Qcur, "Qcur", il);
  14598. if (model.layers[il].bq) {
  14599. Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
  14600. cb(Qcur, "Qcur", il);
  14601. }
  14602. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  14603. cb(Kcur, "Kcur", il);
  14604. if (model.layers[il].bk) {
  14605. Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
  14606. cb(Kcur, "Kcur", il);
  14607. }
  14608. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  14609. cb(Vcur, "Vcur", il);
  14610. if (model.layers[il].bv) {
  14611. Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
  14612. cb(Vcur, "Vcur", il);
  14613. }
  14614. Qcur = ggml_reshape_3d(ctx0, Qcur, n_rot, n_head, n_tokens);
  14615. Kcur = ggml_reshape_3d(ctx0, Kcur, n_rot, n_head_kv, n_tokens);
  14616. Vcur = ggml_reshape_3d(ctx0, Vcur, n_rot, n_head_kv, n_tokens);
  14617. Qcur = ggml_rope_ext(
  14618. ctx0, Qcur, inp_pos, nullptr,
  14619. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  14620. ext_factor, attn_factor, beta_fast, beta_slow
  14621. );
  14622. Kcur = ggml_rope_ext(
  14623. ctx0, Kcur, inp_pos, nullptr,
  14624. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  14625. ext_factor, attn_factor, beta_fast, beta_slow
  14626. );
  14627. cb(Qcur, "Qcur", il);
  14628. cb(Kcur, "Kcur", il);
  14629. cb(Vcur, "Vcur", il);
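// this architecture additionally passes learned attention sinks (model.layers[il].attn_sinks) into the attention call below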
  14630. cur = build_attn(inp_attn,
  14631. model.layers[il].wo, model.layers[il].bo,
  14632. Qcur, Kcur, Vcur, nullptr, model.layers[il].attn_sinks, nullptr, 1.0f/sqrtf(float(n_rot)), il);
  14633. cb(cur, "attn_out", il);
  14634. }
  14635. if (il == n_layer - 1) {
  14636. // skip computing output for unused tokens
  14637. ggml_tensor * inp_out_ids = build_inp_out_ids();
  14638. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  14639. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  14640. }
  14641. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  14642. cb(ffn_inp, "ffn_inp", il);
  14643. cur = ffn_inp;
  14644. cur = build_norm(cur,
  14645. model.layers[il].attn_post_norm, nullptr,
  14646. LLM_NORM_RMS, il);
  14647. cb(cur, "attn_post_norm", il);
  14648. // MoE branch
  14649. cur = build_moe_ffn(cur,
  14650. model.layers[il].ffn_gate_inp, model.layers[il].ffn_gate_inp_b,
  14651. model.layers[il].ffn_up_exps, model.layers[il].ffn_up_exps_b,
  14652. model.layers[il].ffn_gate_exps, model.layers[il].ffn_gate_exps_b,
  14653. model.layers[il].ffn_down_exps, model.layers[il].ffn_down_exps_b,
  14654. nullptr,
  14655. n_expert, n_expert_used,
  14656. LLM_FFN_SWIGLU_OAI_MOE, false,
  14657. false, 0.0,
  14658. LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX_WEIGHT,
  14659. il);
  14660. cb(cur, "ffn_moe_out", il);
  14661. cur = ggml_add(ctx0, cur, ffn_inp);
  14662. cur = build_cvec(cur, il);
  14663. cb(cur, "l_out", il);
  14664. // input for next layer
  14665. inpL = cur;
  14666. }
  14667. cur = inpL;
  14668. cur = build_norm(cur,
  14669. model.output_norm, NULL,
  14670. LLM_NORM_RMS, -1);
  14671. cb(cur, "result_norm", -1);
  14672. res->t_embd = cur;
  14673. // lm_head
  14674. cur = build_lora_mm(model.output, cur);
  14675. cb(cur, "result_output", -1);
  14676. res->t_logits = cur;
  14677. ggml_build_forward_expand(gf, cur);
  14678. }
  14679. };
  14680. struct llm_build_lfm2 : public llm_graph_context {
  14681. const llama_model & model;
  14682. llm_build_lfm2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params), model(model) {
  14683. ggml_tensor * cur = build_inp_embd(model.tok_embd);
  14684. cb(cur, "model.embed_tokens", -1);
  14685. ggml_tensor * inp_pos = build_inp_pos();
  14686. auto * inp_hybrid = build_inp_mem_hybrid();
  14687. ggml_tensor * inp_out_ids = build_inp_out_ids();
  14688. for (int il = 0; il < n_layer; ++il) {
  14689. auto * prev_cur = cur;
  14690. cur = build_norm(cur, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
  14691. cb(cur, "model.layers.{}.operator_norm", il);
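// hybrid layout: recurrent layers use the gated short-convolution block, all other layers use full attention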
  14692. cur = hparams.is_recurrent(il) ?
  14693. build_shortconv_block(cur, inp_hybrid->get_recr(), il) :
  14694. build_attn_block(cur, inp_pos, inp_hybrid->get_attn(), il) ;
  14695. if (il == n_layer - 1 && inp_out_ids) {
  14696. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  14697. prev_cur = ggml_get_rows(ctx0, prev_cur, inp_out_ids);
  14698. }
  14699. cur = ggml_add(ctx0, prev_cur, cur);
  14700. cur = ggml_add(ctx0, cur, build_feed_forward(cur, il));
  14701. }
  14702. cur = build_norm(cur, model.tok_norm, NULL, LLM_NORM_RMS, -1);
  14703. cb(cur, "model.embedding_norm", -1);
  14704. res->t_embd = cur;
  14705. cur = build_lora_mm(model.output, cur);
  14706. cb(cur, "lm_head", -1);
  14707. res->t_logits = cur;
  14708. ggml_build_forward_expand(gf, cur);
  14709. }
  14710. ggml_tensor * build_feed_forward(ggml_tensor * cur,
  14711. int il) const {
  14712. cur = build_norm(cur, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il);
  14713. cb(cur, "model.layers.{}.ffn_norm", il);
  14714. GGML_ASSERT(!model.layers[il].ffn_up_b);
  14715. GGML_ASSERT(!model.layers[il].ffn_gate_b);
  14716. GGML_ASSERT(!model.layers[il].ffn_down_b);
  14717. cur = build_ffn(cur,
  14718. model.layers[il].ffn_up, NULL, NULL,
  14719. model.layers[il].ffn_gate, NULL, NULL,
  14720. model.layers[il].ffn_down, NULL, NULL,
  14721. NULL,
  14722. LLM_FFN_SILU, LLM_FFN_PAR, il);
  14723. cb(cur, "model.layers.{}.feed_forward.w2", il);
  14724. return cur;
  14725. }
  14726. ggml_tensor * build_attn_block(ggml_tensor * cur,
  14727. ggml_tensor * inp_pos,
  14728. llm_graph_input_attn_kv * inp_attn,
  14729. int il) const {
  14730. GGML_ASSERT(hparams.n_embd_v_gqa(il) == hparams.n_embd_k_gqa(il));
  14731. auto const n_embd_head = hparams.n_embd_head_v;
  14732. auto const n_head_kv = hparams.n_head_kv(il);
  14733. auto * q = build_lora_mm(model.layers[il].wq, cur);
  14734. cb(q, "model.layers.{}.self_attn.q_proj", il);
  14735. auto * k = build_lora_mm(model.layers[il].wk, cur);
  14736. cb(k, "model.layers.{}.self_attn.k_proj", il);
  14737. auto * v = build_lora_mm(model.layers[il].wv, cur);
  14738. cb(v, "model.layers.{}.self_attn.v_proj", il);
  14739. q = ggml_reshape_3d(ctx0, q, n_embd_head, n_head, n_tokens);
  14740. k = ggml_reshape_3d(ctx0, k, n_embd_head, n_head_kv, n_tokens);
  14741. v = ggml_reshape_3d(ctx0, v, n_embd_head, n_head_kv, n_tokens);
  14742. // qk norm
  14743. q = build_norm(q, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
  14744. cb(q, "model.layers.{}.self_attn.q_layernorm", il);
  14745. k = build_norm(k, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
  14746. cb(k, "model.layers.{}.self_attn.k_layernorm", il);
  14747. // RoPE
  14748. q = ggml_rope_ext(
  14749. ctx0, q, inp_pos, nullptr,
  14750. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  14751. ext_factor, attn_factor, beta_fast, beta_slow
  14752. );
  14753. k = ggml_rope_ext(
  14754. ctx0, k, inp_pos, nullptr,
  14755. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  14756. ext_factor, attn_factor, beta_fast, beta_slow
  14757. );
  14758. cur = build_attn(inp_attn, model.layers[il].wo, NULL,
  14759. q, k, v, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  14760. cb(cur, "model.layers.{}.self_attn.out_proj", il);
  14761. return cur;
  14762. }
  14763. ggml_tensor * build_shortconv_block(ggml_tensor * cur,
  14764. llm_graph_input_rs * inp_recr,
  14765. int il) {
  14766. const auto * mctx_cur = static_cast<const llama_memory_hybrid_context *>(mctx)->get_recr();
  14767. const uint32_t kv_head = mctx_cur->get_head();
  14768. const int64_t n_seq_tokens = ubatch.n_seq_tokens;
  14769. const int64_t n_seqs = ubatch.n_seqs;
  14770. GGML_ASSERT(n_seqs != 0);
  14771. GGML_ASSERT(ubatch.equal_seqs());
  14772. GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs);
  14773. GGML_ASSERT(hparams.n_shortconv_l_cache > 1);
  14774. const uint32_t d_conv = hparams.n_shortconv_l_cache - 1;
  14775. // {n_embd, n_tokens} => {n_embd, n_seq_tokens, n_seqs}
  14776. cur = ggml_reshape_3d(ctx0, cur, cur->ne[0], n_seq_tokens, n_seqs);
  14777. auto * bcx = build_lora_mm(model.layers[il].shortconv.in_proj, cur);
  14778. cb(bcx, "model.layers.{}.conv.in_proj", il);
  14779. constexpr auto n_chunks = 3;
  14780. GGML_ASSERT(bcx->ne[0] % n_chunks == 0);
  14781. auto const chunk_size = bcx->ne[0] / n_chunks;
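// in_proj packs three equal chunks: b (gates x before the convolution), c (gates the convolution output) and x (the sequence to convolve)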
  14782. auto * b = ggml_view_3d(ctx0, bcx, chunk_size, bcx->ne[1], bcx->ne[2], bcx->nb[1], bcx->nb[2], 0*chunk_size*ggml_element_size(bcx));
  14783. auto * c = ggml_view_3d(ctx0, bcx, chunk_size, bcx->ne[1], bcx->ne[2], bcx->nb[1], bcx->nb[2], 1*chunk_size*ggml_element_size(bcx));
  14784. auto * x = ggml_view_3d(ctx0, bcx, chunk_size, bcx->ne[1], bcx->ne[2], bcx->nb[1], bcx->nb[2], 2*chunk_size*ggml_element_size(bcx));
  14785. auto * bx = ggml_transpose(ctx0, ggml_mul(ctx0, b, x));
  14786. // read conv state
  14787. auto * conv_state = mctx_cur->get_r_l(il);
  14788. auto * conv_rs = build_rs(inp_recr, conv_state, hparams.n_embd_r(), n_seqs);
  14789. auto * conv = ggml_reshape_3d(ctx0, conv_rs, d_conv, hparams.n_embd, n_seqs);
  14790. bx = ggml_concat(ctx0, conv, bx, 0);
  14791. GGML_ASSERT(bx->ne[0] > conv->ne[0]);
// the last d_conv columns are the new conv state
  14793. auto * new_conv = ggml_view_3d(ctx0, bx, conv->ne[0], bx->ne[1], bx->ne[2], bx->nb[1], bx->nb[2], (bx->ne[0] - conv->ne[0])*ggml_element_size(bx));
  14794. GGML_ASSERT(ggml_are_same_shape(conv, new_conv));
// write the new conv state back into the recurrent state cache
  14796. ggml_build_forward_expand(
  14797. gf,
  14798. ggml_cpy(
  14799. ctx0,
  14800. new_conv,
  14801. ggml_view_1d(
  14802. ctx0,
  14803. conv_state,
  14804. ggml_nelements(new_conv),
  14805. kv_head*d_conv*n_embd*ggml_element_size(new_conv)
  14806. )
  14807. )
  14808. );
  14809. auto * conv_kernel = model.layers[il].shortconv.conv;
  14810. auto * conv_out = ggml_ssm_conv(ctx0, bx, conv_kernel);
  14811. cb(conv_out, "model.layers.{}.conv.conv", il);
  14812. auto * y = ggml_mul(ctx0, c, conv_out);
  14813. y = build_lora_mm(model.layers[il].shortconv.out_proj, y);
  14814. cb(y, "model.layers.{}.conv.out_proj", il);
  14815. // {n_embd, n_seq_tokens, n_seqs} => {n_embd, n_tokens}
  14816. y = ggml_reshape_2d(ctx0, y, y->ne[0], n_seq_tokens * n_seqs);
  14817. return y;
  14818. }
  14819. };
  14820. struct llm_build_seed_oss : public llm_graph_context {
  14821. llm_build_seed_oss(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  14822. const int64_t n_embd_head = hparams.n_embd_head_v;
  14823. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  14824. GGML_ASSERT(n_embd_head == hparams.n_rot);
  14825. ggml_tensor * cur;
  14826. ggml_tensor * inpL;
  14827. inpL = build_inp_embd(model.tok_embd);
  14828. // inp_pos - contains the positions
  14829. ggml_tensor * inp_pos = build_inp_pos();
  14830. auto * inp_attn = build_attn_inp_kv();
  14831. const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
  14832. ggml_tensor * inp_out_ids = build_inp_out_ids();
  14833. for (int il = 0; il < n_layer; ++il) {
  14834. ggml_tensor * inpSA = inpL;
  14835. // norm
  14836. cur = build_norm(inpL,
  14837. model.layers[il].attn_norm, NULL,
  14838. LLM_NORM_RMS, il);
  14839. cb(cur, "attn_norm", il);
  14840. // self-attention
  14841. {
  14842. // compute Q and K and RoPE them
  14843. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  14844. cb(Qcur, "Qcur", il);
  14845. if (model.layers[il].bq) {
  14846. Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
  14847. cb(Qcur, "Qcur", il);
  14848. }
  14849. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  14850. cb(Kcur, "Kcur", il);
  14851. if (model.layers[il].bk) {
  14852. Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
  14853. cb(Kcur, "Kcur", il);
  14854. }
  14855. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  14856. cb(Vcur, "Vcur", il);
  14857. if (model.layers[il].bv) {
  14858. Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
  14859. cb(Vcur, "Vcur", il);
  14860. }
  14861. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  14862. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  14863. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  14864. Qcur = ggml_rope_ext(
  14865. ctx0, Qcur, inp_pos, nullptr,
  14866. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  14867. ext_factor, attn_factor, beta_fast, beta_slow
  14868. );
  14869. Kcur = ggml_rope_ext(
  14870. ctx0, Kcur, inp_pos, nullptr,
  14871. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  14872. ext_factor, attn_factor, beta_fast, beta_slow
  14873. );
  14874. cb(Qcur, "Qcur", il);
  14875. cb(Kcur, "Kcur", il);
  14876. cb(Vcur, "Vcur", il);
  14877. cur = build_attn(inp_attn,
  14878. model.layers[il].wo, model.layers[il].bo,
  14879. Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
  14880. cb(cur, "attn_out", il);
  14881. }
  14882. if (il == n_layer - 1 && inp_out_ids) {
  14883. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  14884. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  14885. }
  14886. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  14887. cb(ffn_inp, "ffn_inp", il);
  14888. // feed-forward network
  14889. cur = build_norm(ffn_inp,
  14890. model.layers[il].attn_post_norm, NULL,
  14891. LLM_NORM_RMS, il);
  14892. cb(cur, "attn_post_norm", il);
  14893. cur = build_ffn(cur,
  14894. model.layers[il].ffn_up, NULL, NULL,
  14895. model.layers[il].ffn_gate, NULL, NULL,
  14896. model.layers[il].ffn_down, NULL, NULL,
  14897. NULL,
  14898. LLM_FFN_SILU, LLM_FFN_PAR, il);
  14899. cb(cur, "ffn_out", il);
  14900. cur = ggml_add(ctx0, cur, ffn_inp);
  14901. cb(cur, "ffn_out", il);
  14902. cur = build_cvec(cur, il);
  14903. cb(cur, "l_out", il);
  14904. // input for next layer
  14905. inpL = cur;
  14906. }
  14907. cur = inpL;
  14908. cur = build_norm(cur,
  14909. model.output_norm, NULL,
  14910. LLM_NORM_RMS, -1);
  14911. cb(cur, "result_norm", -1);
  14912. res->t_embd = cur;
  14913. // lm_head
  14914. cur = build_lora_mm(model.output, cur);
  14915. cb(cur, "result_output", -1);
  14916. res->t_logits = cur;
  14917. ggml_build_forward_expand(gf, cur);
  14918. }
  14919. };
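// the iswa template parameter selects at compile time between a sliding-window KV cache input and a regular one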
  14920. template <bool iswa>
struct llm_build_smallthinker : public llm_graph_context {
llm_build_smallthinker(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  14923. const int64_t n_embd_head = hparams.n_embd_head_v;
  14924. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  14925. GGML_ASSERT(n_embd_head == hparams.n_rot);
  14926. ggml_tensor * cur;
  14927. ggml_tensor * inpL;
  14928. inpL = build_inp_embd(model.tok_embd);
  14929. // inp_pos - contains the positions
  14930. ggml_tensor * inp_pos = build_inp_pos();
  14931. using inp_attn_type = std::conditional_t<iswa, llm_graph_input_attn_kv_iswa, llm_graph_input_attn_kv>;
  14932. inp_attn_type * inp_attn = nullptr;
  14933. if constexpr (iswa) {
  14934. inp_attn = build_attn_inp_kv_iswa();
  14935. } else {
  14936. inp_attn = build_attn_inp_kv();
  14937. }
  14938. ggml_tensor * inp_out_ids = build_inp_out_ids();
  14939. for (int il = 0; il < n_layer; ++il) {
  14940. ggml_tensor * inpSA = inpL;
  14941. ggml_tensor * probs = nullptr;
  14942. probs = build_lora_mm(model.layers[il].ffn_gate_inp, inpL); // [n_expert, n_tokens]
  14943. cb(probs, "ffn_moe_logits", il);
  14944. // norm
cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
cb(cur, "attn_norm", il);
// self-attention
  14948. {
  14949. // compute Q and K and RoPE them
  14950. struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  14951. cb(Qcur, "Qcur", il);
  14952. struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  14953. cb(Kcur, "Kcur", il);
  14954. struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  14955. cb(Vcur, "Vcur", il);
  14956. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  14957. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  14958. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  14959. if (hparams.n_no_rope_layer_step == n_layer || il % hparams.n_no_rope_layer_step != 0) {
  14960. Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  14961. ext_factor, attn_factor, beta_fast, beta_slow);
  14962. Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  14963. ext_factor, attn_factor, beta_fast, beta_slow);
  14964. }
  14965. cb(Qcur, "Qcur", il);
  14966. cb(Kcur, "Kcur", il);
  14967. cur = build_attn(inp_attn,
  14968. model.layers[il].wo, model.layers[il].bo,
  14969. Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
  14970. }
  14971. if (il == n_layer - 1 && inp_out_ids) {
  14972. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  14973. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  14974. probs = ggml_get_rows(ctx0, probs, inp_out_ids);
  14975. }
  14976. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  14977. cb(ffn_inp, "ffn_inp", il);
  14978. // MoE branch
  14979. cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il);
  14980. cb(cur, "ffn_norm", il);
  14981. ggml_tensor * ffn_out =
  14982. build_moe_ffn(cur,
  14983. nullptr,
  14984. model.layers[il].ffn_up_exps,
  14985. model.layers[il].ffn_gate_exps,
  14986. model.layers[il].ffn_down_exps,
  14987. nullptr,
  14988. n_expert, n_expert_used,
  14989. LLM_FFN_RELU, true,
  14990. false, 0.0,
  14991. static_cast<llama_expert_gating_func_type>(hparams.expert_gating_func),
  14992. il, probs);
  14993. cb(ffn_out, "ffn_out", il);
  14994. cur = ffn_out;
  14995. cur = ggml_add(ctx0, cur, ffn_inp);
  14996. cur = build_cvec(cur, il);
  14997. cb(cur, "l_out", il);
  14998. // input for next layer
  14999. inpL = cur;
  15000. }
  15001. cur = inpL;
  15002. cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
  15003. cb(cur, "result_norm", -1);
  15004. // lm_head
  15005. cur = build_lora_mm(model.output, cur);
  15006. cb(cur, "result_output", -1);
  15007. res->t_logits = cur;
  15008. ggml_build_forward_expand(gf, cur);
  15009. }
  15010. };
  15011. struct llm_build_qwen3next : public llm_graph_context_mamba {
  15012. llm_build_qwen3next(const llama_model & model, const llm_graph_params & params) : llm_graph_context_mamba(params) {
  15013. const int64_t n_embd_head = hparams.n_embd_head_v;
  15014. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  15015. ggml_tensor * cur;
  15016. ggml_tensor * inpL;
  15017. inpL = build_inp_embd(model.tok_embd);
  15018. auto * inp = build_inp_mem_hybrid();
  15019. ggml_tensor * inp_pos = build_inp_pos();
  15020. ggml_tensor * inp_out_ids = build_inp_out_ids();
  15021. for (int il = 0; il < n_layer; ++il) {
  15022. struct ggml_tensor * inpSA = inpL;
  15023. // Pre-norm for attention/linear attention
  15024. cur = build_norm(inpL,
  15025. model.layers[il].attn_norm, NULL,
  15026. LLM_NORM_RMS, il);
  15027. cb(cur, "attn_norm", il);
  15028. // Determine layer type and build appropriate attention mechanism
  15029. if (hparams.is_recurrent(il)) {
  15030. // Linear attention layer (gated delta net)
  15031. cur = build_qwen3next_linear_attn_layer(inp->get_recr(), cur, model, ubatch, il);
  15032. } else {
  15033. // Full attention layer
  15034. cur = build_qwen3next_attention_layer(
  15035. cur, inp_pos, inp->get_attn(), model,
  15036. n_embd_head, il);
  15037. }
  15038. // Post-attention norm
  15039. cur = build_norm(cur,
  15040. model.layers[il].attn_post_norm, NULL,
  15041. LLM_NORM_RMS, il);
  15042. cb(cur, "attn_post_norm", il);
  15043. if (il == n_layer - 1 && inp_out_ids) {
  15044. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  15045. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  15046. }
  15047. // Residual connection
  15048. cur = ggml_add(ctx0, cur, inpSA);
  15049. cb(cur, "attn_residual", il);
  15050. // FFN layer (MoE or dense)
  15051. cur = build_layer_ffn(cur, model, il);
  15052. // Input for next layer
  15053. inpL = cur;
  15054. }
  15055. cur = inpL;
  15056. // Final norm
  15057. cur = build_norm(cur,
  15058. model.output_norm, NULL,
  15059. LLM_NORM_RMS, -1);
  15060. cb(cur, "result_norm", -1);
  15061. res->t_embd = cur;
  15062. // LM head
  15063. cur = build_lora_mm(model.output, cur);
  15064. cb(cur, "result_output", -1);
  15065. ggml_set_output(cur);
  15066. res->t_logits = cur;
  15067. ggml_build_forward_expand(gf, cur);
  15068. }
  15069. private:
  15070. ggml_tensor * build_qwen3next_attention_layer(
  15071. ggml_tensor * cur,
  15072. ggml_tensor * inp_pos,
  15073. llm_graph_input_attn_kv * inp_attn,
  15074. const llama_model & model,
  15075. const int64_t n_embd_head,
  15076. const int il) {
  15077. ggml_tensor * gate = build_lora_mm(model.layers[il].wq_gate, cur);
  15078. // compute Q and K and RoPE them
  15079. struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  15080. cb(Qcur, "Qcur", il);
  15081. struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  15082. cb(Kcur, "Kcur", il);
  15083. struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  15084. cb(Vcur, "Vcur", il);
  15085. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  15086. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  15087. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  15088. // Apply Q/K normalization
  15089. Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
  15090. Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
  15091. cb(Kcur, "Qcur_normed", il);
  15092. cb(Kcur, "Kcur_normed", il);
  15093. // Apply RoPE
  15094. Qcur = ggml_rope_ext(
  15095. ctx0, Qcur, inp_pos, nullptr,
  15096. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  15097. ext_factor, attn_factor, beta_fast, beta_slow);
  15098. Kcur = ggml_rope_ext(
  15099. ctx0, Kcur, inp_pos, nullptr,
  15100. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  15101. ext_factor, attn_factor, beta_fast, beta_slow);
  15102. cb(Qcur, "Qcur", il);
  15103. cb(Kcur, "Kcur", il);
  15104. cb(Vcur, "Vcur", il);
  15105. // Attention computation
  15106. const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
  15107. cur = build_attn(inp_attn,
  15108. model.layers[il].wo, nullptr,
  15109. Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
  15110. // Apply gating
  15111. cur = ggml_cont(ctx0, ggml_mul(ctx0, cur, ggml_sigmoid(ctx0, gate)));
  15112. cb(cur, "attn_gated", il);
  15113. return cur;
  15114. }
  15115. ggml_tensor * build_qwen3next_linear_attn_layer(llm_graph_input_rs * inp,
  15116. ggml_tensor * cur,
  15117. const llama_model & model,
  15118. const llama_ubatch & ubatch,
  15119. int il) {
  15120. // Gated Delta Net implementation using the new ggml_delta_net function
  15121. const auto * mctx_cur = inp->mctx;
  15122. const int64_t d_inner = hparams.ssm_d_inner;
  15123. const int64_t n_heads = hparams.ssm_dt_rank;
  15124. const int64_t head_dim = d_inner / n_heads;
  15125. const int64_t n_seqs = ubatch.n_seqs;
  15126. const int64_t head_k_dim = hparams.ssm_d_state;
  15127. const int64_t head_v_dim = hparams.ssm_d_state;
  15128. const int64_t num_k_heads = hparams.ssm_n_group;
  15129. const int64_t num_v_heads = hparams.ssm_dt_rank;
  15130. const int64_t n_seq_tokens = ubatch.n_seq_tokens;
  15131. const int64_t n_tokens = ubatch.n_tokens;
  15132. GGML_ASSERT(n_seqs != 0);
  15133. GGML_ASSERT(ubatch.equal_seqs());
  15134. GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs);
  15135. // Input projections
  15136. ggml_tensor * mixed_qkvz = build_lora_mm(model.layers[il].ssm_in, cur);
  15137. cb(mixed_qkvz, "linear_attn_mixed_qkvz", il);
  15138. ggml_tensor * mixed_ba = build_lora_mm(model.layers[il].ssm_beta_alpha, cur);
  15139. cb(mixed_ba, "linear_attn_mixed_ba", il);
  15140. // Reshape mixed_qkvz: [batch, seq_len, hidden_size] -> [batch, seq_len, num_k_heads, 2*head_k_dim + 2*head_v_dim*num_v_heads/num_k_heads]
  15141. int64_t qkvz_new_dim = 2 * head_k_dim + 2 * head_v_dim * num_v_heads / num_k_heads;
  15142. ggml_tensor * mixed_qkvz_reshaped =
  15143. ggml_reshape_4d(ctx0, mixed_qkvz, qkvz_new_dim, num_k_heads, n_tokens, n_seqs);
  15144. // Reshape mixed_ba: [batch, seq_len, hidden_size] -> [batch, seq_len, num_k_heads, 2*num_v_heads/num_k_heads]
  15145. int64_t ba_new_dim = 2 * num_v_heads / num_k_heads;
  15146. ggml_tensor * mixed_ba_reshaped = ggml_reshape_4d(ctx0, mixed_ba, ba_new_dim, num_k_heads, n_tokens, n_seqs);
  15147. // Split mixed_qkvz into query, key, value, z
  15148. int64_t split_sizes_qkvz[4] = {
  15149. head_k_dim, // query size
  15150. head_k_dim, // key size
  15151. head_v_dim * num_v_heads / num_k_heads, // value size
  15152. head_v_dim * num_v_heads / num_k_heads // z size
  15153. };
  15154. ggml_tensor * query = ggml_cont(ctx0, ggml_view_4d(ctx0, mixed_qkvz_reshaped, split_sizes_qkvz[0], num_k_heads,
  15155. n_tokens, n_seqs, split_sizes_qkvz[0] * sizeof(float),
  15156. mixed_qkvz_reshaped->nb[1], mixed_qkvz_reshaped->nb[2], 0));
  15157. ggml_tensor * key =
  15158. ggml_cont(ctx0, ggml_view_4d(ctx0, mixed_qkvz_reshaped, split_sizes_qkvz[1], num_k_heads, n_tokens, n_seqs,
  15159. split_sizes_qkvz[1] * sizeof(float), mixed_qkvz_reshaped->nb[1],
  15160. mixed_qkvz_reshaped->nb[2], split_sizes_qkvz[0] * sizeof(float)));
  15161. ggml_tensor * value =
  15162. ggml_view_4d(ctx0, mixed_qkvz_reshaped, split_sizes_qkvz[2], num_k_heads, n_tokens, n_seqs,
  15163. split_sizes_qkvz[2] * sizeof(float), mixed_qkvz_reshaped->nb[1], mixed_qkvz_reshaped->nb[2],
  15164. (split_sizes_qkvz[0] + split_sizes_qkvz[1]) * sizeof(float));
  15165. ggml_tensor * z =
  15166. ggml_view_4d(ctx0, mixed_qkvz_reshaped, split_sizes_qkvz[3], num_k_heads, n_tokens, n_seqs,
  15167. split_sizes_qkvz[3] * sizeof(float), mixed_qkvz_reshaped->nb[1], mixed_qkvz_reshaped->nb[2],
  15168. (split_sizes_qkvz[0] + split_sizes_qkvz[1] + split_sizes_qkvz[2]) * sizeof(float));
  15169. // Reshape value and z to merge head dimensions: [batch, seq_len, num_k_heads, head_v_dim*num_v_heads/num_k_heads] -> [batch, seq_len, num_v_heads, head_v_dim]
  15170. ggml_tensor * value_reshaped =
  15171. ggml_reshape_4d(ctx0, ggml_cont(ctx0, value), head_v_dim, num_v_heads, n_tokens, n_seqs);
  15172. ggml_tensor * z_reshaped = ggml_reshape_4d(ctx0, ggml_cont(ctx0, z), head_v_dim, num_v_heads, n_tokens, n_seqs);
  15173. GGML_ASSERT(ggml_nelements(query) + ggml_nelements(key) + ggml_nelements(value_reshaped) +
  15174. ggml_nelements(z_reshaped) ==
  15175. ggml_nelements(mixed_qkvz));
  15176. // Split mixed_ba into b and a (beta and alpha parameters)
  15177. int64_t split_sizes_ba[2] = {
  15178. num_v_heads / num_k_heads, // beta size
  15179. num_v_heads / num_k_heads // alpha size
  15180. };
  15181. ggml_tensor * b =
  15182. ggml_view_4d(ctx0, mixed_ba_reshaped, split_sizes_ba[0], num_k_heads, n_tokens, n_seqs,
  15183. split_sizes_ba[0] * sizeof(float), mixed_ba_reshaped->nb[1], mixed_ba_reshaped->nb[2], 0);
  15184. ggml_tensor * a = ggml_view_4d(ctx0, mixed_ba_reshaped, split_sizes_ba[1], num_k_heads, n_tokens, n_seqs,
  15185. split_sizes_ba[1] * sizeof(float), mixed_ba_reshaped->nb[1],
  15186. mixed_ba_reshaped->nb[2], split_sizes_ba[0] * sizeof(float));
  15187. // Reshape b and a to merge head dimensions: [batch, seq_len, num_k_heads, num_v_heads/num_k_heads] -> [batch, seq_len, num_v_heads]
  15188. ggml_tensor * beta = ggml_reshape_3d(ctx0, ggml_cont(ctx0, b), num_v_heads, n_tokens, n_seqs);
  15189. ggml_tensor * alpha = ggml_reshape_3d(ctx0, ggml_cont(ctx0, a), num_v_heads, n_tokens, n_seqs);
  15190. GGML_ASSERT(ggml_nelements(beta) + ggml_nelements(alpha) == ggml_nelements(mixed_ba));
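// decay gate of the gated delta rule: gate = -(exp(A_log) * softplus(alpha + dt_bias))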
  15191. ggml_tensor * alpha_softplus = softplus(alpha, model.layers[il].ssm_dt);
  15192. ggml_tensor * A_log_exp = ggml_exp(ctx0, model.layers[il].ssm_a); // A_log.exp()
  15193. ggml_tensor * gate_scaled = ggml_mul(ctx0, alpha_softplus, A_log_exp); // A_log.exp() * softplus
  15194. ggml_tensor * gate = ggml_scale(ctx0, gate_scaled, -1.0f); // - (A_log.exp() * softplus)
  15195. // Get convolution states from cache
  15196. ggml_tensor * conv_states_all = mctx_cur->get_r_l(il);
  15197. ggml_tensor * ssm_states_all = mctx_cur->get_s_l(il);
  15198. // Build the convolution states tensor
  15199. ggml_tensor * conv_states = build_rs(inp, conv_states_all, hparams.n_embd_r(), n_seqs);
  15200. // Calculate convolution kernel size
  15201. const int64_t conv_kernel_size = model.layers[il].ssm_conv1d->ne[0];
  15202. // Calculate input dimensions for Qwen3Next
  15203. const int64_t input_dim = (head_k_dim * num_k_heads * 2) + (head_v_dim * num_v_heads);
  15204. // Reshape conv_states to [conv_kernel_size - 1, input_dim, n_seqs]
  15205. conv_states = ggml_reshape_3d(ctx0, conv_states, conv_kernel_size - 1, input_dim, n_seqs);
  15206. cb(conv_states, "conv_states_reshaped", il);
  15207. // Combine query, key, value for convolution input
  15208. ggml_tensor * qkv_mixed = ggml_concat(ctx0, query, key, 1);
  15209. qkv_mixed = ggml_concat(ctx0, qkv_mixed, value_reshaped, 1);
  15210. // Reshape to [input_dim, n_seq_tokens, n_seqs] for concatenation
  15211. qkv_mixed = ggml_reshape_3d(ctx0, qkv_mixed, input_dim, n_seq_tokens, n_seqs);
  15212. cb(qkv_mixed, "qkv_mixed_for_conv", il);
  15213. // Concatenate cached conv states with current input
  15214. // conv_states: [conv_kernel_size - 1, input_dim, n_seqs]
  15215. // qkv_mixed: [input_dim, n_seq_tokens, n_seqs]
  15216. // After transpose: [n_seq_tokens, input_dim, n_seqs]
  15217. ggml_tensor * conv_input = ggml_concat(ctx0, conv_states, ggml_transpose(ctx0, qkv_mixed), 0);
  15218. cb(conv_input, "conv_input", il);
  15219. // Apply convolution
  15220. ggml_tensor * conv_output = ggml_ssm_conv(ctx0, conv_input, model.layers[il].ssm_conv1d);
  15221. cb(conv_output, "conv_output_raw", il);
  15222. if (model.layers[il].ssm_conv1d_b) {
  15223. conv_output = ggml_add(ctx0, conv_output, model.layers[il].ssm_conv1d_b);
  15224. cb(conv_output, "conv_output_bias", il);
  15225. }
  15226. conv_output = ggml_silu(ctx0, conv_output);
  15227. cb(conv_output, "conv_output_silu", il);
  15228. // Update convolution state cache
  15229. // Extract the last (conv_kernel_size - 1) states from conv_input
  15230. ggml_tensor * last_conv_states =
  15231. ggml_view_3d(ctx0, conv_input, conv_kernel_size - 1, input_dim, n_seqs, conv_input->nb[1],
  15232. conv_input->nb[2], n_seq_tokens * conv_input->nb[0]);
  15233. ggml_build_forward_expand(
  15234. gf, ggml_cpy(ctx0, last_conv_states,
  15235. ggml_view_1d(ctx0, conv_states_all, (conv_kernel_size - 1) * input_dim * n_seqs,
  15236. mctx_cur->get_head() * (conv_kernel_size - 1) * input_dim *
  15237. ggml_element_size(conv_states_all))));
  15238. cb(conv_states_all, "conv_states_updated", il);
  15239. // Reshape conv_output back to proper dimensions
  15240. conv_output = ggml_reshape_4d(ctx0, conv_output, input_dim, n_seqs, n_seq_tokens, 1);
  15241. cb(conv_output, "conv_output_reshaped", il);
  15242. conv_output = ggml_permute(ctx0, conv_output, 0, 2, 1, 3);
  15243. cb(conv_output, "conv_output_final", il);
  15244. // Extract the convolved Q, K, V from conv_output
  15245. ggml_tensor * q_conv = ggml_cont(ctx0, ggml_view_4d(ctx0, conv_output, head_k_dim, num_k_heads, n_tokens, n_seqs,
  15246. head_k_dim, conv_output->nb[1], conv_output->nb[2], 0));
  15247. cb(q_conv, "q_conv", il);
  15248. ggml_tensor * k_conv =
  15249. ggml_cont(ctx0, ggml_view_4d(ctx0, conv_output, head_k_dim, num_k_heads, n_tokens, n_seqs, head_k_dim,
  15250. conv_output->nb[1], conv_output->nb[2], head_k_dim * num_k_heads * ggml_element_size(conv_output)));
  15251. cb(q_conv, "k_conv", il);
  15252. ggml_tensor * v_conv =
  15253. ggml_cont(ctx0, ggml_view_4d(ctx0, conv_output, head_v_dim, num_v_heads, n_tokens, n_seqs, head_v_dim,
  15254. conv_output->nb[1], conv_output->nb[2], 2 * head_k_dim * num_k_heads * ggml_element_size(conv_output)));
  15255. cb(q_conv, "v_conv", il);
  15256. ggml_build_forward_expand(gf, ssm_states_all);
  15257. // Beta tensor
  15258. beta = ggml_reshape_3d(ctx0, beta, n_heads, n_tokens, n_seqs);
  15259. ggml_tensor * state = ggml_reshape_4d(ctx0, ssm_states_all, head_dim, head_dim * n_heads, 1, 1);
  15260. ggml_tensor * state_broadcast = ggml_repeat_4d(ctx0, state, head_dim, head_dim * n_heads, n_seqs, n_tokens);
  15261. ggml_tensor * target_gate = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, head_dim, n_heads, n_tokens, n_seqs);
  15262. ggml_tensor * gate_broadcast = ggml_reshape_4d(ctx0, gate, 1, n_heads, n_tokens, n_seqs);
  15263. gate = ggml_repeat(ctx0, gate_broadcast, target_gate);
  15264. // Call the new ggml_delta_net function with the corrected flow
  15265. ggml_tensor * output = ggml_delta_net(ctx0,
  15266. k_conv, // k tensor (already convolved)
  15267. v_conv, // v tensor (already convolved)
  15268. q_conv, // q tensor (already convolved)
  15269. gate, // g tensor
  15270. beta, // beta tensor
  15271. state_broadcast, // state tensor
  15272. true, // use_qk_l2norm
  15273. 1.0f // scale
  15274. );
  15275. cb(output, "delta_net_output", il);
  15276. // Extract the output part
  15277. ggml_tensor * attn_out = ggml_view_4d(ctx0, output, head_dim, n_heads, n_tokens, n_seqs, output->nb[0],
  15278. output->nb[1], output->nb[2], 0);
  15279. // Extract the new state
  15280. ggml_tensor * new_state =
  15281. ggml_view_4d(ctx0, output, head_dim, head_dim * n_heads, n_tokens, n_seqs, output->nb[0], output->nb[1],
  15282. output->nb[2], n_tokens * n_seqs * head_dim * n_heads * ggml_element_size(output));
  15283. // Only return the last recurrent state
  15284. struct ggml_tensor * state_reshaped =
  15285. ggml_cont_4d(ctx0, new_state, head_dim, head_dim, n_heads, n_tokens * n_seqs);
  15286. struct ggml_tensor * state_last = ggml_view_4d(
  15287. ctx0, state_reshaped, head_dim, head_dim, n_heads, 1, state_reshaped->nb[1], state_reshaped->nb[2],
  15288. state_reshaped->nb[3], head_dim * head_dim * n_heads * ((n_seqs * n_tokens) - 1));
  15289. // Update the recurrent states
  15290. ggml_build_forward_expand(gf, ggml_cpy(ctx0, state_last, ssm_states_all));
  15291. // Reshape both attn_out and z to 2D tensors for normalization
  15292. // attn_out: [head_dim, n_heads, n_tokens, n_seqs] -> [n_heads * n_tokens * n_seqs, head_dim]
  15293. ggml_tensor * attn_out_2d =
  15294. ggml_reshape_2d(ctx0, ggml_cont(ctx0, attn_out), head_dim, n_heads * n_tokens * n_seqs);
  15295. // z: [head_dim, n_heads, n_tokens, n_seqs] -> [n_heads * n_tokens * n_seqs, head_dim]
  15296. ggml_tensor * z_2d = ggml_reshape_2d(ctx0, z_reshaped, head_dim, n_heads * n_tokens * n_seqs);
  15297. // Apply gated normalization: self.norm(core_attn_out, z)
  15298. // This is Qwen3NextRMSNormGated which applies: RMSNorm(x) * silu(gate)
  15299. ggml_tensor * attn_out_norm = build_norm(attn_out_2d, model.layers[il].ssm_norm, NULL, LLM_NORM_RMS, il);
  15300. // Apply silu gate: attn_out_norm * silu(z_2d)
  15301. ggml_tensor * z_silu = ggml_silu(ctx0, z_2d);
  15302. ggml_tensor * gated_output = ggml_mul(ctx0, attn_out_norm, z_silu);
  15303. // Reshape back to original dimensions: [n_heads * n_tokens * n_seqs, head_dim] -> [head_dim, n_heads, n_tokens, n_seqs]
  15304. ggml_tensor * gated_output_4d = ggml_reshape_4d(ctx0, gated_output, head_dim, n_heads, n_tokens, n_seqs);
  15305. // Final reshape: [head_dim, n_heads, n_tokens, n_seqs] -> [n_tokens, n_seqs, n_heads * head_dim]
  15306. ggml_tensor * final_output = ggml_reshape_3d(ctx0, gated_output_4d, n_heads * head_dim, n_tokens, n_seqs);
  15307. // Output projection
  15308. cur = build_lora_mm(model.layers[il].ssm_out, final_output);
  15309. cb(cur, "linear_attn_out", il);
  15310. // Reshape back to original dimensions
  15311. cur = ggml_cont(ctx0, ggml_reshape_2d(ctx0, cur, n_embd, n_tokens));
  15312. return cur;
  15313. }
ggml_tensor * build_layer_ffn(ggml_tensor * cur, const llama_model & model, const int il) {
// keep the layer input around for the residual connection applied after the FFN
ggml_tensor * ffn_inp = cur;
// Check if this is an MoE layer
  15316. if (model.layers[il].ffn_gate_inp != nullptr) {
  15317. // MoE branch
  15318. ggml_tensor * moe_out = build_moe_ffn(cur,
  15319. model.layers[il].ffn_gate_inp,
  15320. model.layers[il].ffn_up_exps,
  15321. model.layers[il].ffn_gate_exps,
  15322. model.layers[il].ffn_down_exps,
  15323. nullptr,
  15324. n_expert, n_expert_used,
  15325. LLM_FFN_SILU, true,
  15326. false, 0.0,
  15327. LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
  15328. il);
  15329. cb(moe_out, "ffn_moe_out", il);
  15330. // Add shared experts if present
  15331. if (model.layers[il].ffn_up_shexp != nullptr) {
  15332. ggml_tensor * ffn_shexp = build_ffn(cur,
  15333. model.layers[il].ffn_up_shexp, NULL, NULL,
  15334. model.layers[il].ffn_gate_shexp, NULL, NULL,
  15335. model.layers[il].ffn_down_shexp, NULL, NULL,
  15336. NULL,
  15337. LLM_FFN_SILU, LLM_FFN_PAR, il);
  15338. cb(ffn_shexp, "ffn_shexp", il);
  15339. cur = ggml_add(ctx0, moe_out, ffn_shexp);
  15340. cb(cur, "ffn_out", il);
  15341. } else {
  15342. cur = moe_out;
  15343. }
  15344. } else {
  15345. // Dense FFN branch
  15346. cur = build_ffn(cur,
  15347. model.layers[il].ffn_up, NULL, NULL,
  15348. model.layers[il].ffn_gate, NULL, NULL,
  15349. model.layers[il].ffn_down, NULL, NULL,
  15350. NULL,
  15351. LLM_FFN_SILU, LLM_FFN_PAR, il);
  15352. cb(cur, "ffn_out", il);
  15353. }
// Residual connection around the FFN, using the input saved at the top of this function
cur = ggml_add(ctx0, cur, ffn_inp);
cb(cur, "ffn_residual", il);
  15357. return cur;
  15358. }
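// helper: softplus(alpha + dt_bias) = log(1 + exp(alpha + dt_bias)), composed from basic ggml ops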
  15359. ggml_tensor * softplus(ggml_tensor * alpha, ggml_tensor * dt_bias) {
  15360. ggml_tensor * alpha_biased = ggml_add(ctx0, alpha, dt_bias); // a + dt_bias
  15361. ggml_tensor * alpha_exp = ggml_exp(ctx0, alpha_biased); // exp(a + dt_bias)
  15362. ggml_tensor * one_plus_exp = ggml_scale_bias(ctx0, alpha_exp, 1.0f, 1.0f); // 1 + exp(a + dt_bias)
  15363. ggml_tensor * alpha_softplus = ggml_log(ctx0, one_plus_exp); // log(1 + exp(...))
  15364. return alpha_softplus;
  15365. }
  15366. };
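// create_memory selects the memory implementation for the architecture: no cache for the cacheless
// archs listed below, a recurrent cache for SSM-style archs, a hybrid cache for mixed archs, and a
// (possibly SWA) KV cache for everything else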
  15367. llama_memory_i * llama_model::create_memory(const llama_memory_params & params, llama_cparams & cparams) const {
  15368. llama_memory_i * res;
  15369. switch (arch) {
  15370. // Models that need specific instantiation should be handled in the
  15371. // switch statement
  15372. case LLM_ARCH_BERT:
  15373. case LLM_ARCH_JINA_BERT_V2:
  15374. case LLM_ARCH_JINA_BERT_V3:
  15375. case LLM_ARCH_NOMIC_BERT:
  15376. case LLM_ARCH_NOMIC_BERT_MOE:
  15377. case LLM_ARCH_NEO_BERT:
  15378. case LLM_ARCH_WAVTOKENIZER_DEC:
  15379. //case LLM_ARCH_GEMMA_EMBEDDING: // TODO: disabled until the cacheless SWA logic is fixed [TAG_NO_CACHE_ISWA]
  15380. case LLM_ARCH_DREAM:
  15381. case LLM_ARCH_LLADA:
  15382. case LLM_ARCH_LLADA_MOE:
  15383. {
  15384. res = nullptr;
  15385. } break;
  15386. // Models that need standard caching should rely on recurrent/hybrid
  15387. // checks
  15388. default:
  15389. {
  15390. if (llm_arch_is_recurrent(arch)) {
  15391. res = new llama_memory_recurrent(
  15392. *this,
  15393. GGML_TYPE_F32,
  15394. GGML_TYPE_F32,
  15395. cparams.offload_kqv,
  15396. std::max((uint32_t) 1, cparams.n_seq_max),
  15397. cparams.n_seq_max,
  15398. nullptr);
  15399. } else if (llm_arch_is_hybrid(arch)) {
  15400. // The main difference between hybrid architectures is the
  15401. // layer filters, so pick the right one here
  15402. llama_memory_hybrid::layer_filter_cb filter_attn = nullptr;
  15403. llama_memory_hybrid::layer_filter_cb filter_recr = nullptr;
  15404. if (arch == LLM_ARCH_FALCON_H1) {
  15405. filter_attn = [&](int32_t) { return true; };
  15406. filter_recr = [&](int32_t) { return true; };
  15407. } else if (arch == LLM_ARCH_NEMOTRON_H) {
  15408. filter_attn = [&](int32_t il) {
  15409. return !hparams.is_recurrent(il) && hparams.n_ff(il) == 0;
  15410. };
  15411. filter_recr = [&](int32_t il) {
  15412. return hparams.is_recurrent(il) && hparams.n_ff(il) == 0;
  15413. };
  15414. }
  15415. const auto padding = llama_kv_cache::get_padding(cparams);
  15416. cparams.n_ctx = GGML_PAD(cparams.n_ctx, padding);
  15417. res = new llama_memory_hybrid(
  15418. /* model */ *this,
  15419. /* attn_type_k */ params.type_k,
  15420. /* attn_type_v */ params.type_v,
  15421. /* attn_v_trans */ !cparams.flash_attn,
  15422. /* attn_kv_size */ cparams.n_ctx,
  15423. /* attn_n_pad */ padding,
  15424. /* attn_n_swa */ hparams.n_swa,
  15425. /* attn_swa_type */ hparams.swa_type,
  15426. /* recurrent_type_k */ GGML_TYPE_F32,
  15427. /* recurrent_type_v */ GGML_TYPE_F32,
  15428. /* recurrent_kv_size */ std::max((uint32_t) 1, cparams.n_seq_max),
  15429. /* n_seq_max */ cparams.n_seq_max,
  15430. /* offload */ cparams.offload_kqv,
  15431. /* unified */ cparams.kv_unified,
  15432. /* filter_attn */ std::move(filter_attn),
  15433. /* filter_recr */ std::move(filter_recr));
  15434. } else {
  15435. const auto padding = llama_kv_cache::get_padding(cparams);
  15436. uint32_t n_ctx_per_stream = cparams.n_ctx;
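// with a split (non-unified) cache each sequence gets its own KV stream, so the total context is
// divided evenly across n_seq_max and padded; a unified cache keeps a single padded stream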
  15437. if (!cparams.kv_unified) {
  15438. n_ctx_per_stream = (cparams.n_ctx + cparams.n_seq_max - 1)/cparams.n_seq_max;
  15439. n_ctx_per_stream = GGML_PAD(n_ctx_per_stream, padding);
  15440. cparams.n_ctx = n_ctx_per_stream*cparams.n_seq_max;
  15441. } else {
  15442. n_ctx_per_stream = GGML_PAD(n_ctx_per_stream, padding);
  15443. cparams.n_ctx = n_ctx_per_stream;
  15444. }
  15445. LLAMA_LOG_DEBUG("%s: n_ctx = %u (padded)\n", __func__, cparams.n_ctx);
  15446. llama_memory_i::layer_reuse_cb reuse = nullptr;
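// Gemma3n: layers at or past n_layer_kv_from_start keep no KV cache of their own and reuse the
// cache of an earlier layer (offset by 2 for SWA layers, by 1 otherwise); -1 keeps a dedicated cache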
  15447. if (arch == LLM_ARCH_GEMMA3N) {
  15448. reuse = [&](int32_t il) {
  15449. if (il >= (int32_t) hparams.n_layer_kv_from_start) {
  15450. return (int32_t) hparams.n_layer_kv_from_start - (hparams.is_swa(il) ? 2 : 1);
  15451. }
  15452. return -1;
  15453. };
  15454. }
  15455. if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
  15456. GGML_ASSERT(hparams.is_swa_any());
  15457. res = new llama_kv_cache_iswa(
  15458. *this,
  15459. params.type_k,
  15460. params.type_v,
  15461. !cparams.flash_attn,
  15462. cparams.offload_kqv,
  15463. params.swa_full,
  15464. cparams.kv_unified,
  15465. n_ctx_per_stream,
  15466. cparams.n_seq_max,
  15467. cparams.n_ubatch,
  15468. padding,
  15469. nullptr,
  15470. reuse);
  15471. } else {
  15472. GGML_ASSERT(!hparams.is_swa_any());
  15473. res = new llama_kv_cache(
  15474. *this,
  15475. params.type_k,
  15476. params.type_v,
  15477. !cparams.flash_attn,
  15478. cparams.offload_kqv,
  15479. cparams.kv_unified,
  15480. n_ctx_per_stream,
  15481. cparams.n_seq_max,
  15482. padding,
  15483. hparams.n_swa,
  15484. hparams.swa_type,
  15485. nullptr,
  15486. nullptr);
  15487. }
  15488. }
  15489. }
  15490. }
  15491. return res;
  15492. }
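// build_graph dispatches on the architecture, instantiates the matching llm_build_* graph builder
// and finally appends the optional pooling head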
  15493. ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
  15494. std::unique_ptr<llm_graph_context> llm;
  15495. switch (arch) {
  15496. case LLM_ARCH_LLAMA:
  15497. {
  15498. llm = std::make_unique<llm_build_llama>(*this, params);
  15499. } break;
  15500. case LLM_ARCH_LLAMA4:
  15501. {
  15502. if (hparams.swa_type == LLAMA_SWA_TYPE_NONE) {
  15503. llm = std::make_unique<llm_build_llama>(*this, params);
  15504. } else {
  15505. llm = std::make_unique<llm_build_llama_iswa>(*this, params);
  15506. }
  15507. } break;
  15508. case LLM_ARCH_DECI:
  15509. {
  15510. llm = std::make_unique<llm_build_deci>(*this, params);
  15511. } break;
  15512. case LLM_ARCH_BAICHUAN:
  15513. {
  15514. llm = std::make_unique<llm_build_baichuan>(*this, params);
  15515. } break;
  15516. case LLM_ARCH_FALCON:
  15517. {
  15518. llm = std::make_unique<llm_build_falcon>(*this, params);
  15519. } break;
  15520. case LLM_ARCH_GROK:
  15521. {
  15522. llm = std::make_unique<llm_build_grok>(*this, params);
  15523. } break;
  15524. case LLM_ARCH_STARCODER:
  15525. {
  15526. llm = std::make_unique<llm_build_starcoder>(*this, params);
  15527. } break;
  15528. case LLM_ARCH_REFACT:
  15529. {
  15530. llm = std::make_unique<llm_build_refact>(*this, params);
  15531. } break;
  15532. case LLM_ARCH_BERT:
  15533. case LLM_ARCH_JINA_BERT_V2:
  15534. case LLM_ARCH_JINA_BERT_V3:
  15535. case LLM_ARCH_NOMIC_BERT:
  15536. case LLM_ARCH_NOMIC_BERT_MOE:
  15537. {
  15538. llm = std::make_unique<llm_build_bert>(*this, params);
  15539. } break;
  15540. case LLM_ARCH_NEO_BERT:
  15541. {
  15542. llm = std::make_unique<llm_build_neo_bert>(*this, params);
  15543. } break;
  15544. case LLM_ARCH_BLOOM:
  15545. {
  15546. llm = std::make_unique<llm_build_bloom>(*this, params);
  15547. } break;
  15548. case LLM_ARCH_MPT:
  15549. {
  15550. llm = std::make_unique<llm_build_mpt>(*this, params);
  15551. } break;
  15552. case LLM_ARCH_STABLELM:
  15553. {
  15554. llm = std::make_unique<llm_build_stablelm>(*this, params);
  15555. } break;
  15556. case LLM_ARCH_QWEN:
  15557. {
  15558. llm = std::make_unique<llm_build_qwen>(*this, params);
  15559. } break;
  15560. case LLM_ARCH_QWEN2:
  15561. {
  15562. llm = std::make_unique<llm_build_qwen2>(*this, params);
  15563. } break;
  15564. case LLM_ARCH_DREAM:
  15565. {
  15566. llm = std::make_unique<llm_build_dream>(*this, params);
  15567. }
  15568. break;
  15569. case LLM_ARCH_LLADA:
  15570. {
  15571. llm = std::make_unique<llm_build_llada>(*this, params);
  15572. }
  15573. break;
  15574. case LLM_ARCH_LLADA_MOE:
  15575. {
  15576. llm = std::make_unique<llm_build_llada_moe>(*this, params);
  15577. }
  15578. break;
  15579. case LLM_ARCH_QWEN2VL:
  15580. {
  15581. llm = std::make_unique<llm_build_qwen2vl>(*this, params);
  15582. } break;
  15583. case LLM_ARCH_QWEN2MOE:
  15584. {
  15585. llm = std::make_unique<llm_build_qwen2moe>(*this, params);
  15586. } break;
  15587. case LLM_ARCH_QWEN3:
  15588. {
  15589. llm = std::make_unique<llm_build_qwen3>(*this, params);
  15590. } break;
  15591. case LLM_ARCH_QWEN3MOE:
  15592. {
  15593. llm = std::make_unique<llm_build_qwen3moe>(*this, params);
  15594. } break;
  15595. case LLM_ARCH_PHI2:
  15596. {
  15597. llm = std::make_unique<llm_build_phi2>(*this, params);
  15598. } break;
  15599. case LLM_ARCH_PHI3:
  15600. case LLM_ARCH_PHIMOE:
  15601. {
  15602. if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
  15603. llm = std::make_unique<llm_build_phi3<true>> (*this, params);
  15604. } else {
  15605. llm = std::make_unique<llm_build_phi3<false>>(*this, params);
  15606. }
  15607. } break;
  15608. case LLM_ARCH_PLAMO:
  15609. {
  15610. llm = std::make_unique<llm_build_plamo>(*this, params);
  15611. } break;
  15612. case LLM_ARCH_PLAMO2:
  15613. {
  15614. llm = std::make_unique<llm_build_plamo2>(*this, params);
  15615. } break;
  15616. case LLM_ARCH_GPT2:
  15617. {
  15618. llm = std::make_unique<llm_build_gpt2>(*this, params);
  15619. } break;
  15620. case LLM_ARCH_CODESHELL:
  15621. {
  15622. llm = std::make_unique<llm_build_codeshell>(*this, params);
  15623. } break;
  15624. case LLM_ARCH_ORION:
  15625. {
  15626. llm = std::make_unique<llm_build_orion>(*this, params);
  15627. } break;
  15628. case LLM_ARCH_INTERNLM2:
  15629. {
  15630. llm = std::make_unique<llm_build_internlm2>(*this, params);
  15631. } break;
  15632. case LLM_ARCH_MINICPM3:
  15633. {
  15634. llm = std::make_unique<llm_build_minicpm3>(*this, params);
  15635. } break;
  15636. case LLM_ARCH_GEMMA:
  15637. {
  15638. llm = std::make_unique<llm_build_gemma>(*this, params);
  15639. } break;
  15640. case LLM_ARCH_GEMMA2:
  15641. {
  15642. llm = std::make_unique<llm_build_gemma2_iswa>(*this, params);
  15643. } break;
  15644. case LLM_ARCH_GEMMA3:
  15645. {
  15646. llm = std::make_unique<llm_build_gemma3_iswa>(*this, params);
  15647. } break;
  15648. case LLM_ARCH_GEMMA3N:
  15649. {
  15650. llm = std::make_unique<llm_build_gemma3n_iswa>(*this, params);
  15651. } break;
  15652. case LLM_ARCH_GEMMA_EMBEDDING:
  15653. {
  15654. llm = std::make_unique<llm_build_gemma_embedding_iswa>(*this, params);
  15655. } break;
  15656. case LLM_ARCH_STARCODER2:
  15657. {
  15658. llm = std::make_unique<llm_build_starcoder2>(*this, params);
  15659. } break;
  15660. case LLM_ARCH_MAMBA:
  15661. case LLM_ARCH_MAMBA2:
  15662. {
  15663. llm = std::make_unique<llm_build_mamba>(*this, params);
  15664. } break;
  15665. case LLM_ARCH_JAMBA:
            {
                llm = std::make_unique<llm_build_jamba>(*this, params);
            } break;
        case LLM_ARCH_XVERSE:
            {
                llm = std::make_unique<llm_build_xverse>(*this, params);
            } break;
        case LLM_ARCH_COMMAND_R:
            {
                llm = std::make_unique<llm_build_command_r>(*this, params);
            } break;
        case LLM_ARCH_COHERE2:
            {
                llm = std::make_unique<llm_build_cohere2_iswa>(*this, params);
            } break;
        case LLM_ARCH_DBRX:
            {
                llm = std::make_unique<llm_build_dbrx>(*this, params);
            } break;
        case LLM_ARCH_OLMO:
            {
                llm = std::make_unique<llm_build_olmo>(*this, params);
            } break;
        case LLM_ARCH_OLMO2:
            {
                if (hparams.swa_type == LLAMA_SWA_TYPE_STANDARD) {
                    llm = std::make_unique<llm_build_olmo2<true>>(*this, params);
                } else {
                    llm = std::make_unique<llm_build_olmo2<false>>(*this, params);
                }
            } break;
        case LLM_ARCH_OLMOE:
            {
                llm = std::make_unique<llm_build_olmoe>(*this, params);
            } break;
        case LLM_ARCH_OPENELM:
            {
                llm = std::make_unique<llm_build_openelm>(*this, params);
            } break;
        case LLM_ARCH_GPTNEOX:
            {
                llm = std::make_unique<llm_build_gptneox>(*this, params);
            } break;
        case LLM_ARCH_ARCTIC:
            {
                llm = std::make_unique<llm_build_arctic>(*this, params);
            } break;
        case LLM_ARCH_DEEPSEEK:
            {
                llm = std::make_unique<llm_build_deepseek>(*this, params);
            } break;
        case LLM_ARCH_DEEPSEEK2:
            {
                llm = std::make_unique<llm_build_deepseek2>(*this, params);
            } break;
        case LLM_ARCH_CHATGLM:
            {
                llm = std::make_unique<llm_build_chatglm>(*this, params);
            } break;
        case LLM_ARCH_GLM4:
            {
                llm = std::make_unique<llm_build_glm4>(*this, params);
            } break;
        case LLM_ARCH_GLM4_MOE:
            {
                llm = std::make_unique<llm_build_glm4_moe>(*this, params);
            } break;
        case LLM_ARCH_BITNET:
            {
                llm = std::make_unique<llm_build_bitnet>(*this, params);
            } break;
        case LLM_ARCH_T5:
            {
                switch (params.gtype) {
                    case LLM_GRAPH_TYPE_ENCODER:
                        llm = std::make_unique<llm_build_t5_enc>(*this, params);
                        break;
                    case LLM_GRAPH_TYPE_DEFAULT:
                    case LLM_GRAPH_TYPE_DECODER:
                        llm = std::make_unique<llm_build_t5_dec>(*this, params);
                        break;
                    default:
                        GGML_ABORT("invalid graph type");
                };
            } break;
        case LLM_ARCH_T5ENCODER:
            {
                llm = std::make_unique<llm_build_t5_enc>(*this, params);
            } break;
        case LLM_ARCH_JAIS:
            {
                llm = std::make_unique<llm_build_jais>(*this, params);
            } break;
        case LLM_ARCH_NEMOTRON:
            {
                llm = std::make_unique<llm_build_nemotron>(*this, params);
            } break;
        case LLM_ARCH_NEMOTRON_H:
            {
                llm = std::make_unique<llm_build_nemotron_h>(*this, params);
            } break;
        case LLM_ARCH_EXAONE:
            {
                llm = std::make_unique<llm_build_exaone>(*this, params);
            } break;
        case LLM_ARCH_EXAONE4:
            {
                if (hparams.swa_type == LLAMA_SWA_TYPE_STANDARD) {
                    llm = std::make_unique<llm_build_exaone4<true>>(*this, params);
                } else {
                    llm = std::make_unique<llm_build_exaone4<false>>(*this, params);
                }
            } break;
        case LLM_ARCH_RWKV6:
            {
                llm = std::make_unique<llm_build_rwkv6>(*this, params);
            } break;
        case LLM_ARCH_RWKV6QWEN2:
            {
                llm = std::make_unique<llm_build_rwkv6qwen2>(*this, params);
            } break;
        case LLM_ARCH_RWKV7:
            {
                llm = std::make_unique<llm_build_rwkv7>(*this, params);
            } break;
        case LLM_ARCH_ARWKV7:
            {
                llm = std::make_unique<llm_build_arwkv7>(*this, params);
            } break;
        case LLM_ARCH_GRANITE:
        case LLM_ARCH_GRANITE_MOE:
        case LLM_ARCH_MINICPM:
            {
                llm = std::make_unique<llm_build_granite>(*this, params);
            } break;
        case LLM_ARCH_GRANITE_HYBRID:
            {
                llm = std::make_unique<llm_build_granite_hybrid>(*this, params);
            } break;
        case LLM_ARCH_CHAMELEON:
            {
                llm = std::make_unique<llm_build_chameleon>(*this, params);
            } break;
        case LLM_ARCH_WAVTOKENIZER_DEC:
            {
                llm = std::make_unique<llm_build_wavtokenizer_dec>(*this, params);
            } break;
        case LLM_ARCH_PLM:
            {
                llm = std::make_unique<llm_build_plm>(*this, params);
            } break;
        case LLM_ARCH_BAILINGMOE:
            {
                llm = std::make_unique<llm_build_bailingmoe>(*this, params);
            } break;
        case LLM_ARCH_SEED_OSS:
            {
                llm = std::make_unique<llm_build_seed_oss>(*this, params);
            } break;
        case LLM_ARCH_DOTS1:
            {
                llm = std::make_unique<llm_build_dots1>(*this, params);
            } break;
        case LLM_ARCH_ARCEE:
            {
                llm = std::make_unique<llm_build_arcee>(*this, params);
            } break;
        case LLM_ARCH_ERNIE4_5:
            {
                llm = std::make_unique<llm_build_ernie4_5>(*this, params);
            } break;
        case LLM_ARCH_ERNIE4_5_MOE:
            {
                llm = std::make_unique<llm_build_ernie4_5_moe>(*this, params);
            } break;
        case LLM_ARCH_HUNYUAN_MOE:
            {
                llm = std::make_unique<llm_build_hunyuan_moe>(*this, params);
            } break;
        case LLM_ARCH_HUNYUAN_DENSE:
            {
                llm = std::make_unique<llm_build_hunyuan_dense>(*this, params);
            } break;
        case LLM_ARCH_SMOLLM3:
            {
                llm = std::make_unique<llm_build_smollm3>(*this, params);
            } break;
        case LLM_ARCH_OPENAI_MOE:
            {
                llm = std::make_unique<llm_build_openai_moe_iswa>(*this, params);
            } break;
        case LLM_ARCH_FALCON_H1:
            {
                llm = std::make_unique<llm_build_falcon_h1>(*this, params);
            } break;
        case LLM_ARCH_LFM2:
            {
                llm = std::make_unique<llm_build_lfm2>(*this, params);
            } break;
        case LLM_ARCH_SMALLTHINKER:
            {
                if (hparams.swa_type == LLAMA_SWA_TYPE_STANDARD) {
                    llm = std::make_unique<llm_build_smallthinker<true>> (*this, params);
                } else {
                    llm = std::make_unique<llm_build_smallthinker<false>>(*this, params);
                }
            } break;
        case LLM_ARCH_QWEN3NEXT:
            {
                llm = std::make_unique<llm_build_qwen3next>(*this, params);
            } break;
        default:
            GGML_ABORT("fatal error");
    }

    // add on pooling layer
    llm->build_pooling(cls, cls_b, cls_out, cls_out_b);

    return llm->res->get_gf();
}
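// NOTE: illustrative sketch only, not part of the implementation. Supporting a new architecture
// typically means adding one more case to the dispatch above; the enum value LLM_ARCH_MYARCH and
// the builder class llm_build_myarch below are hypothetical placeholders:
//
//         case LLM_ARCH_MYARCH:
//             {
//                 llm = std::make_unique<llm_build_myarch>(*this, params);
//             } break;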
//
// interface implementation
//

llama_model_params llama_model_default_params() {
    llama_model_params result = {
        /*.devices                     =*/ nullptr,
        /*.tensor_buft_overrides       =*/ nullptr,
        /*.n_gpu_layers                =*/ 999,
        /*.split_mode                  =*/ LLAMA_SPLIT_MODE_LAYER,
        /*.main_gpu                    =*/ 0,
        /*.tensor_split                =*/ nullptr,
        /*.progress_callback           =*/ nullptr,
        /*.progress_callback_user_data =*/ nullptr,
        /*.kv_overrides                =*/ nullptr,
        /*.vocab_only                  =*/ false,
        /*.use_mmap                    =*/ true,
        /*.use_mlock                   =*/ false,
        /*.check_tensors               =*/ false,
        /*.use_extra_bufts             =*/ true,
    };

    return result;
}
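// Illustrative usage sketch (compiled out, not part of the implementation): a caller normally
// starts from the defaults above and overrides individual fields before loading a model.
// llama_model_load_from_file() is the public loading entry point declared in llama.h; the
// function name and path below are placeholders.
#if 0
static llama_model * example_load_cpu_only(const char * path_gguf) {
    llama_model_params mparams = llama_model_default_params();
    mparams.n_gpu_layers = 0;    // keep all layers on the CPU
    mparams.use_mmap     = true; // map the file instead of copying it into memory

    // returns nullptr on failure
    return llama_model_load_from_file(path_gguf, mparams);
}
#endif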
const llama_vocab * llama_model_get_vocab(const llama_model * model) {
    return &model->vocab;
}

void llama_free_model(llama_model * model) {
    llama_model_free(model);
}

void llama_model_free(llama_model * model) {
    delete model;
}

int32_t llama_model_n_ctx_train(const llama_model * model) {
    return model->hparams.n_ctx_train;
}

int32_t llama_model_n_embd(const llama_model * model) {
    return model->hparams.n_embd;
}

int32_t llama_model_n_layer(const llama_model * model) {
    return model->hparams.n_layer;
}

int32_t llama_model_n_head(const llama_model * model) {
    return model->hparams.n_head();
}

int32_t llama_model_n_head_kv(const llama_model * model) {
    return model->hparams.n_head_kv();
}

int32_t llama_model_n_swa(const llama_model * model) {
    return model->hparams.n_swa;
}

uint32_t llama_model_n_cls_out(const struct llama_model * model) {
    return model->hparams.n_cls_out;
}

const char * llama_model_cls_label(const struct llama_model * model, uint32_t i) {
    if (i < model->classifier_labels.size()) {
        return model->classifier_labels[i].c_str();
    }

    return nullptr;
}
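// Illustrative usage sketch (compiled out, not part of the implementation): for classifier
// models the two functions above are used together - query the number of output classes, then
// fetch each label by index (out-of-range indices return nullptr). Assumes <cstdio> for printf.
#if 0
static void example_print_cls_labels(const llama_model * model) {
    const uint32_t n_cls = llama_model_n_cls_out(model);
    for (uint32_t i = 0; i < n_cls; ++i) {
        const char * label = llama_model_cls_label(model, i);
        printf("class %u: %s\n", i, label ? label : "(unnamed)");
    }
}
#endif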
// deprecated
int32_t llama_n_ctx_train(const llama_model * model) {
    return llama_model_n_ctx_train(model);
}

// deprecated
int32_t llama_n_embd(const llama_model * model) {
    return llama_model_n_embd(model);
}

// deprecated
int32_t llama_n_layer(const llama_model * model) {
    return llama_model_n_layer(model);
}

// deprecated
int32_t llama_n_head(const llama_model * model) {
    return llama_model_n_head(model);
}

llama_rope_type llama_model_rope_type(const llama_model * model) {
    switch (model->arch) {
        // these models do not use RoPE
        case LLM_ARCH_GPT2:
        case LLM_ARCH_GPTJ:
        case LLM_ARCH_MPT:
        case LLM_ARCH_REFACT:
        case LLM_ARCH_BLOOM:
        case LLM_ARCH_MAMBA:
        case LLM_ARCH_MAMBA2:
        case LLM_ARCH_JAMBA:
        case LLM_ARCH_JINA_BERT_V2:
        case LLM_ARCH_T5:
        case LLM_ARCH_T5ENCODER:
        case LLM_ARCH_JAIS:
        case LLM_ARCH_RWKV6:
        case LLM_ARCH_RWKV6QWEN2:
        case LLM_ARCH_RWKV7:
        case LLM_ARCH_ARWKV7:
        case LLM_ARCH_WAVTOKENIZER_DEC:
        case LLM_ARCH_NEMOTRON_H:
            return LLAMA_ROPE_TYPE_NONE;

        // use what we call a normal RoPE, operating on pairs of consecutive head values
        case LLM_ARCH_LLAMA:
        case LLM_ARCH_LLADA:
        case LLM_ARCH_LLAMA4:
        case LLM_ARCH_DECI:
        case LLM_ARCH_BAICHUAN:
        case LLM_ARCH_STARCODER:
        case LLM_ARCH_INTERNLM2:
        case LLM_ARCH_MINICPM:
        case LLM_ARCH_XVERSE:
        case LLM_ARCH_COMMAND_R:
        case LLM_ARCH_COHERE2:
        case LLM_ARCH_OLMO:
        case LLM_ARCH_ARCTIC:
        case LLM_ARCH_DEEPSEEK:
        case LLM_ARCH_DEEPSEEK2:
        case LLM_ARCH_PLM:
        case LLM_ARCH_CHATGLM:
        case LLM_ARCH_GLM4:
        case LLM_ARCH_GRANITE:
        case LLM_ARCH_GRANITE_MOE:
        case LLM_ARCH_GRANITE_HYBRID:
        case LLM_ARCH_CHAMELEON:
        case LLM_ARCH_BAILINGMOE:
        case LLM_ARCH_NEO_BERT:
        case LLM_ARCH_SMOLLM3:
        case LLM_ARCH_ARCEE:
        case LLM_ARCH_ERNIE4_5:
        case LLM_ARCH_ERNIE4_5_MOE:
            return LLAMA_ROPE_TYPE_NORM;

        // the pairs of head values are offset by n_rot/2
        case LLM_ARCH_FALCON:
        case LLM_ARCH_FALCON_H1:
        case LLM_ARCH_GROK:
        case LLM_ARCH_DBRX:
        case LLM_ARCH_BERT:
        case LLM_ARCH_JINA_BERT_V3:
        case LLM_ARCH_NOMIC_BERT:
        case LLM_ARCH_NOMIC_BERT_MOE:
        case LLM_ARCH_STABLELM:
        case LLM_ARCH_BITNET:
        case LLM_ARCH_QWEN:
        case LLM_ARCH_QWEN2:
        case LLM_ARCH_DREAM:
        case LLM_ARCH_QWEN2MOE:
        case LLM_ARCH_QWEN3:
        case LLM_ARCH_QWEN3MOE:
        case LLM_ARCH_QWEN3NEXT:
        case LLM_ARCH_LLADA_MOE:
        case LLM_ARCH_OLMO2:
        case LLM_ARCH_OLMOE:
        case LLM_ARCH_PHI2:
        case LLM_ARCH_PHI3:
        case LLM_ARCH_PHIMOE:
        case LLM_ARCH_PLAMO:
        case LLM_ARCH_PLAMO2:
        case LLM_ARCH_GEMMA:
        case LLM_ARCH_GEMMA2:
        case LLM_ARCH_GEMMA3:
        case LLM_ARCH_GEMMA3N:
        case LLM_ARCH_GEMMA_EMBEDDING:
        case LLM_ARCH_STARCODER2:
        case LLM_ARCH_OPENELM:
        case LLM_ARCH_GPTNEOX:
        case LLM_ARCH_CODESHELL:
        case LLM_ARCH_ORION:
        case LLM_ARCH_NEMOTRON:
        case LLM_ARCH_EXAONE:
        case LLM_ARCH_EXAONE4:
        case LLM_ARCH_MINICPM3:
        case LLM_ARCH_DOTS1:
        case LLM_ARCH_HUNYUAN_MOE:
        case LLM_ARCH_OPENAI_MOE:
        case LLM_ARCH_HUNYUAN_DENSE:
        case LLM_ARCH_LFM2:
        case LLM_ARCH_SMALLTHINKER:
        case LLM_ARCH_GLM4_MOE:
        case LLM_ARCH_SEED_OSS:
            return LLAMA_ROPE_TYPE_NEOX;

        case LLM_ARCH_QWEN2VL:
            return LLAMA_ROPE_TYPE_MROPE;

        // all model arches should be listed explicitly here
        case LLM_ARCH_UNKNOWN:
            GGML_ABORT("unknown architecture");
    }

    return LLAMA_ROPE_TYPE_NONE;
}

float llama_model_rope_freq_scale_train(const llama_model * model) {
    return model->hparams.rope_freq_scale_train;
}

int32_t llama_model_meta_val_str(const llama_model * model, const char * key, char * buf, size_t buf_size) {
    const auto & it = model->gguf_kv.find(key);
    if (it == model->gguf_kv.end()) {
        if (buf_size > 0) {
            buf[0] = '\0';
        }
        return -1;
    }

    return snprintf(buf, buf_size, "%s", it->second.c_str());
}

int32_t llama_model_meta_count(const llama_model * model) {
    return (int)model->gguf_kv.size();
}

int32_t llama_model_meta_key_by_index(const llama_model * model, int i, char * buf, size_t buf_size) {
    if (i < 0 || i >= (int)model->gguf_kv.size()) {
        if (buf_size > 0) {
            buf[0] = '\0';
        }
        return -1;
    }

    auto it = model->gguf_kv.begin();
    std::advance(it, i);

    return snprintf(buf, buf_size, "%s", it->first.c_str());
}

int32_t llama_model_meta_val_str_by_index(const llama_model * model, int32_t i, char * buf, size_t buf_size) {
    if (i < 0 || i >= (int)model->gguf_kv.size()) {
        if (buf_size > 0) {
            buf[0] = '\0';
        }
        return -1;
    }

    auto it = model->gguf_kv.begin();
    std::advance(it, i);

    return snprintf(buf, buf_size, "%s", it->second.c_str());
}
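// Illustrative usage sketch (compiled out, not part of the implementation): the metadata
// accessors above follow the snprintf convention - they write a (possibly truncated) copy into
// the caller's buffer and return the untruncated length, or -1 for a missing key / out-of-range
// index. A typical caller enumerates all key/value pairs like this (assumes <cstdio>):
#if 0
static void example_dump_metadata(const llama_model * model) {
    char key[256];
    char val[256];

    const int32_t n = llama_model_meta_count(model);
    for (int32_t i = 0; i < n; ++i) {
        if (llama_model_meta_key_by_index    (model, i, key, sizeof(key)) >= 0 &&
            llama_model_meta_val_str_by_index(model, i, val, sizeof(val)) >= 0) {
            printf("%s = %s\n", key, val);
        }
    }
}
#endif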
int32_t llama_model_desc(const llama_model * model, char * buf, size_t buf_size) {
    return snprintf(buf, buf_size, "%s", model->desc().c_str());
}

uint64_t llama_model_size(const llama_model * model) {
    return model->size();
}

const char * llama_model_chat_template(const llama_model * model, const char * name) {
    const auto key = name ? LLM_KV(model->arch, name)(LLM_KV_TOKENIZER_CHAT_TEMPLATE)
        : LLM_KV(model->arch)(LLM_KV_TOKENIZER_CHAT_TEMPLATE);
    const auto & it = model->gguf_kv.find(key);
    if (it == model->gguf_kv.end()) {
        // one-off fix for very popular models (so we are not flooded with issues)
        // do not extend this list unless absolutely necessary
        // Mistral-Small-2503 does not have built-in chat template
        llama_vocab_pre_type pre_type = model->vocab.get_pre_type();
        if (!name && pre_type == LLAMA_VOCAB_PRE_TYPE_TEKKEN && model->layers.size() == 40) {
            return "mistral-v7-tekken";
        }

        return nullptr;
    }

    return it->second.c_str();
}
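// Illustrative usage sketch (compiled out, not part of the implementation): passing nullptr for
// `name` requests the model's default chat template; a nullptr result means the GGUF carries no
// template (apart from the Tekken one-off above) and the caller must supply its own fallback.
// The "chatml" fallback below is a caller-side choice, purely illustrative.
#if 0
static const char * example_get_template(const llama_model * model) {
    const char * tmpl = llama_model_chat_template(model, /*name =*/ nullptr);
    return tmpl ? tmpl : "chatml";
}
#endif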
uint64_t llama_model_n_params(const llama_model * model) {
    return model->n_elements();
}

bool llama_model_has_encoder(const llama_model * model) {
    switch (model->arch) {
        case LLM_ARCH_T5:        return true;
        case LLM_ARCH_T5ENCODER: return true;
        default:                 return false;
    }
}

bool llama_model_has_decoder(const llama_model * model) {
    switch (model->arch) {
        case LLM_ARCH_T5ENCODER: return false;
        default:                 return true;
    }
}

llama_token llama_model_decoder_start_token(const llama_model * model) {
    return model->hparams.dec_start_token_id;
}
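// Illustrative usage sketch (compiled out, not part of the implementation): encoder-decoder
// models such as T5 report true from llama_model_has_encoder(); a caller runs the encoder first
// and then seeds decoding with the decoder start token, falling back to BOS when it is unset
// (LLAMA_TOKEN_NULL). Assumes the llama_vocab_bos() accessor declared in llama.h.
#if 0
static llama_token example_decoder_seed(const llama_model * model) {
    llama_token tok = LLAMA_TOKEN_NULL;
    if (llama_model_has_encoder(model) && llama_model_has_decoder(model)) {
        tok = llama_model_decoder_start_token(model);
    }
    if (tok == LLAMA_TOKEN_NULL) {
        tok = llama_vocab_bos(llama_model_get_vocab(model));
    }
    return tok;
}
#endif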
bool llama_model_is_recurrent(const llama_model * model) {
    return llm_arch_is_recurrent(model->arch);
}

bool llama_model_is_diffusion(const llama_model * model) {
    return llm_arch_is_diffusion(model->arch);
}

const std::vector<std::pair<std::string, ggml_tensor *>> & llama_internal_get_tensor_map(const llama_model * model) {
    return model->tensors_by_name;
}