llama-model.cpp 841 KB

430914310143111431214313143141431514316143171431814319143201432114322143231432414325143261432714328143291433014331143321433314334143351433614337143381433914340143411434214343143441434514346143471434814349143501435114352143531435414355143561435714358143591436014361143621436314364143651436614367143681436914370143711437214373143741437514376143771437814379143801438114382143831438414385143861438714388143891439014391143921439314394143951439614397143981439914400144011440214403144041440514406144071440814409144101441114412144131441414415144161441714418144191442014421144221442314424144251442614427144281442914430144311443214433144341443514436144371443814439144401444114442144431444414445144461444714448144491445014451144521445314454144551445614457144581445914460144611446214463144641446514466144671446814469144701447114472144731447414475144761447714478144791448014481144821448314484144851448614487144881448914490144911449214493144941449514496144971449814499145001450114502145031450414505145061450714508145091451014511145121451314514145151451614517145181451914520145211452214523145241452514526145271452814529145301453114532145331453414535145361453714538145391454014541145421454314544145451454614547145481454914550145511455214553145541455514556145571455814559145601456114562145631456414565145661456714568145691457014571145721457314574145751457614577145781457914580145811458214583145841458514586145871458814589145901459114592145931459414595145961459714598145991460014601146021460314604146051460614607146081460914610146111461214613146141461514616146171461814619146201462114622146231462414625146261462714628146291463014631146321463314634146351463614637146381463914640146411464214643146441464514646146471464814649146501465114652146531465414655146561465714658146591466014661146621466314664146651466614667146681466914670146711467214673146741467514676146771467814679146801468114682146831468414685146861468714688146891469014691146921469314694146951469614697146981469914700147011470214703147041470514706147071470814709147101471114712147131471414715147161471714718147191472014721147221472314724147251472614727147281472914730147311473214733147341473514736147371473814739147401474114742147431474414745147461474714748147491475014751147521475314754147551475614757147581475914760147611476214763147641476514766147671476814769147701477114772147731477414775147761477714778147791478014781147821478314784147851478614787147881478914790147911479214793147941479514796147971479814799148001480114802148031480414805148061480714808148091481014811148121481314814148151481614817148181481914820148211482214823148241482514826148271482814829148301483114832148331483414835148361483714838148391484014841148421484314844148451484614847148481484914850148511485214853148541485514856148571485814859148601486114862148631486414865148661486714868148691487014871148721487314874148751487614877148781487914880148811488214883148841488514886148871488814889148901489114892148931489414895148961489714898148991490014901149021490314904149051490614907149081490914910149111491214913149141491514916149171491814919149201492114922149231492414925149261492714928149291493014931149321493314934149351493614937149381493914940149411494214943149441494514946149471494814949149501495114952149531495414955149561495714958149591496014961149621496314964149651496614967149681496914970149711497214973149741497514976149771497814979149801498114982149831498414985149861498714988149891499014991149921499314994149951499614997149981499915000150011500215003150041500515006150071500815009150101501115012150131501415015150161501715018150191
502015021150221502315024150251502615027150281502915030150311503215033150341503515036150371503815039150401504115042150431504415045150461504715048150491505015051150521505315054150551505615057150581505915060150611506215063150641506515066150671506815069150701507115072150731507415075150761507715078150791508015081150821508315084150851508615087150881508915090150911509215093150941509515096150971509815099151001510115102151031510415105151061510715108151091511015111151121511315114151151511615117151181511915120151211512215123151241512515126151271512815129151301513115132151331513415135151361513715138151391514015141151421514315144151451514615147151481514915150151511515215153151541515515156151571515815159151601516115162151631516415165151661516715168151691517015171151721517315174151751517615177151781517915180151811518215183151841518515186151871518815189151901519115192151931519415195151961519715198151991520015201152021520315204152051520615207152081520915210152111521215213152141521515216152171521815219152201522115222152231522415225152261522715228152291523015231152321523315234152351523615237152381523915240152411524215243152441524515246152471524815249152501525115252152531525415255152561525715258152591526015261152621526315264152651526615267152681526915270152711527215273152741527515276152771527815279152801528115282152831528415285152861528715288152891529015291152921529315294152951529615297152981529915300153011530215303153041530515306153071530815309153101531115312153131531415315153161531715318153191532015321153221532315324153251532615327153281532915330153311533215333153341533515336153371533815339153401534115342153431534415345153461534715348153491535015351153521535315354153551535615357153581535915360153611536215363153641536515366153671536815369153701537115372153731537415375153761537715378153791538015381153821538315384153851538615387153881538915390153911539215393153941539515396153971539815399154001540115402154031540415405154061540715408154091541015411154121541315414154151541615417154181541915420154211542215423154241542515426154271542815429154301543115432154331543415435154361543715438154391544015441154421544315444154451544615447154481544915450154511545215453154541545515456154571545815459154601546115462154631546415465154661546715468154691547015471154721547315474154751547615477154781547915480154811548215483154841548515486154871548815489154901549115492154931549415495154961549715498154991550015501155021550315504155051550615507155081550915510155111551215513155141551515516155171551815519155201552115522155231552415525155261552715528155291553015531155321553315534155351553615537155381553915540155411554215543155441554515546155471554815549155501555115552155531555415555155561555715558155591556015561155621556315564155651556615567155681556915570155711557215573155741557515576155771557815579155801558115582155831558415585155861558715588155891559015591155921559315594155951559615597155981559915600156011560215603156041560515606156071560815609156101561115612156131561415615156161561715618156191562015621156221562315624156251562615627156281562915630156311563215633156341563515636156371563815639156401564115642156431564415645156461564715648156491565015651156521565315654156551565615657156581565915660156611566215663156641566515666156671566815669156701567115672156731567415675156761567715678156791568015681156821568315684156851568615687156881568915690156911569215693156941569515696156971569815699157001570115702157031570415705157061570715708157091571015711157121571315714157151571615717157181571915720157211572215723157241572515726157271572815729157301
573115732157331573415735157361573715738157391574015741157421574315744157451574615747157481574915750157511575215753157541575515756157571575815759157601576115762157631576415765157661576715768157691577015771157721577315774157751577615777157781577915780157811578215783157841578515786157871578815789157901579115792157931579415795157961579715798157991580015801158021580315804158051580615807158081580915810158111581215813158141581515816158171581815819158201582115822158231582415825158261582715828158291583015831158321583315834158351583615837158381583915840158411584215843158441584515846158471584815849158501585115852158531585415855158561585715858158591586015861158621586315864158651586615867158681586915870158711587215873158741587515876158771587815879158801588115882158831588415885158861588715888158891589015891158921589315894158951589615897158981589915900159011590215903159041590515906159071590815909159101591115912159131591415915159161591715918159191592015921159221592315924159251592615927159281592915930159311593215933159341593515936159371593815939159401594115942159431594415945159461594715948159491595015951159521595315954159551595615957159581595915960159611596215963159641596515966159671596815969159701597115972159731597415975159761597715978159791598015981159821598315984159851598615987159881598915990159911599215993159941599515996159971599815999160001600116002160031600416005160061600716008160091601016011160121601316014160151601616017160181601916020160211602216023160241602516026160271602816029160301603116032160331603416035160361603716038160391604016041160421604316044160451604616047160481604916050160511605216053160541605516056160571605816059160601606116062160631606416065160661606716068160691607016071160721607316074160751607616077160781607916080160811608216083160841608516086160871608816089160901609116092160931609416095160961609716098160991610016101161021610316104161051610616107161081610916110161111611216113161141611516116161171611816119161201612116122161231612416125161261612716128161291613016131161321613316134161351613616137161381613916140161411614216143161441614516146161471614816149161501615116152161531615416155161561615716158161591616016161161621616316164161651616616167161681616916170161711617216173161741617516176161771617816179161801618116182161831618416185161861618716188161891619016191161921619316194161951619616197161981619916200162011620216203162041620516206162071620816209162101621116212162131621416215162161621716218162191622016221162221622316224162251622616227162281622916230162311623216233162341623516236162371623816239162401624116242162431624416245162461624716248162491625016251162521625316254162551625616257162581625916260162611626216263162641626516266162671626816269162701627116272162731627416275162761627716278162791628016281162821628316284162851628616287162881628916290162911629216293162941629516296162971629816299163001630116302163031630416305163061630716308163091631016311163121631316314163151631616317163181631916320163211632216323163241632516326163271632816329163301633116332163331633416335163361633716338163391634016341163421634316344163451634616347163481634916350163511635216353163541635516356163571635816359163601636116362163631636416365163661636716368163691637016371163721637316374163751637616377163781637916380163811638216383163841638516386163871638816389163901639116392163931639416395163961639716398163991640016401164021640316404164051640616407164081640916410164111641216413164141641516416164171641816419164201642116422164231642416425164261642716428164291643016431164321643316434164351643616437164381643916440164411
644216443164441644516446164471644816449164501645116452164531645416455164561645716458164591646016461164621646316464164651646616467164681646916470164711647216473164741647516476164771647816479164801648116482164831648416485164861648716488164891649016491164921649316494164951649616497164981649916500165011650216503165041650516506165071650816509165101651116512165131651416515165161651716518165191652016521165221652316524165251652616527165281652916530165311653216533165341653516536165371653816539165401654116542165431654416545165461654716548165491655016551165521655316554165551655616557165581655916560165611656216563165641656516566165671656816569165701657116572165731657416575165761657716578165791658016581165821658316584165851658616587165881658916590165911659216593165941659516596165971659816599166001660116602166031660416605166061660716608166091661016611166121661316614166151661616617166181661916620166211662216623166241662516626166271662816629166301663116632166331663416635166361663716638166391664016641166421664316644166451664616647166481664916650166511665216653166541665516656166571665816659166601666116662166631666416665166661666716668166691667016671166721667316674166751667616677166781667916680166811668216683166841668516686166871668816689166901669116692166931669416695166961669716698166991670016701167021670316704167051670616707167081670916710167111671216713167141671516716167171671816719167201672116722167231672416725167261672716728167291673016731167321673316734167351673616737167381673916740167411674216743167441674516746167471674816749167501675116752167531675416755167561675716758167591676016761167621676316764167651676616767167681676916770167711677216773167741677516776167771677816779167801678116782167831678416785167861678716788167891679016791167921679316794167951679616797167981679916800168011680216803168041680516806168071680816809168101681116812168131681416815168161681716818168191682016821168221682316824168251682616827168281682916830168311683216833168341683516836168371683816839168401684116842168431684416845168461684716848168491685016851168521685316854168551685616857168581685916860168611686216863168641686516866168671686816869168701687116872168731687416875168761687716878168791688016881168821688316884168851688616887168881688916890168911689216893168941689516896168971689816899169001690116902169031690416905169061690716908169091691016911169121691316914169151691616917169181691916920169211692216923169241692516926169271692816929169301693116932169331693416935169361693716938169391694016941169421694316944169451694616947169481694916950169511695216953169541695516956169571695816959169601696116962169631696416965169661696716968169691697016971169721697316974169751697616977169781697916980169811698216983169841698516986169871698816989169901699116992169931699416995169961699716998169991700017001170021700317004170051700617007170081700917010170111701217013170141701517016170171701817019170201702117022170231702417025170261702717028170291703017031170321703317034170351703617037170381703917040170411704217043170441704517046170471704817049170501705117052170531705417055170561705717058170591706017061170621706317064170651706617067170681706917070170711707217073170741707517076170771707817079170801708117082170831708417085170861708717088170891709017091170921709317094170951709617097170981709917100171011710217103171041710517106171071710817109171101711117112171131711417115171161711717118171191712017121171221712317124171251712617127171281712917130171311713217133171341713517136171371713817139171401714117142171431714417145171461714717148171491715017151171521
715317154171551715617157171581715917160171611716217163171641716517166171671716817169171701717117172171731717417175171761717717178171791718017181171821718317184171851718617187171881718917190171911719217193171941719517196171971719817199172001720117202172031720417205172061720717208172091721017211172121721317214172151721617217172181721917220172211722217223172241722517226172271722817229172301723117232172331723417235172361723717238172391724017241172421724317244172451724617247172481724917250172511725217253172541725517256172571725817259172601726117262172631726417265172661726717268172691727017271172721727317274172751727617277172781727917280172811728217283172841728517286172871728817289172901729117292172931729417295172961729717298172991730017301173021730317304173051730617307173081730917310173111731217313173141731517316173171731817319173201732117322173231732417325173261732717328173291733017331173321733317334173351733617337173381733917340173411734217343173441734517346173471734817349173501735117352173531735417355173561735717358173591736017361173621736317364173651736617367173681736917370173711737217373173741737517376173771737817379173801738117382173831738417385173861738717388173891739017391173921739317394173951739617397173981739917400174011740217403174041740517406174071740817409174101741117412174131741417415174161741717418174191742017421174221742317424174251742617427174281742917430174311743217433174341743517436174371743817439174401744117442174431744417445174461744717448174491745017451174521745317454174551745617457174581745917460174611746217463174641746517466174671746817469174701747117472174731747417475174761747717478174791748017481174821748317484174851748617487174881748917490174911749217493174941749517496174971749817499175001750117502175031750417505175061750717508175091751017511175121751317514175151751617517175181751917520175211752217523175241752517526175271752817529175301753117532175331753417535175361753717538175391754017541175421754317544175451754617547175481754917550175511755217553175541755517556175571755817559175601756117562175631756417565175661756717568175691757017571175721757317574175751757617577175781757917580175811758217583175841758517586175871758817589175901759117592175931759417595175961759717598175991760017601176021760317604176051760617607176081760917610176111761217613176141761517616176171761817619176201762117622176231762417625176261762717628176291763017631176321763317634176351763617637176381763917640176411764217643176441764517646176471764817649176501765117652176531765417655176561765717658176591766017661176621766317664176651766617667176681766917670176711767217673176741767517676176771767817679176801768117682176831768417685176861768717688176891769017691176921769317694176951769617697176981769917700177011770217703177041770517706177071770817709177101771117712177131771417715177161771717718177191772017721177221772317724177251772617727177281772917730177311773217733177341773517736177371773817739177401774117742177431774417745177461774717748177491775017751177521775317754177551775617757177581775917760177611776217763177641776517766177671776817769177701777117772177731777417775177761777717778177791778017781177821778317784177851778617787177881778917790177911779217793177941779517796177971779817799178001780117802178031780417805178061780717808178091781017811178121781317814178151781617817178181781917820178211782217823178241782517826178271782817829178301783117832178331783417835178361783717838178391784017841178421784317844178451784617847178481784917850178511785217853178541785517856178571785817859178601786117862178631
786417865178661786717868178691787017871178721787317874178751787617877178781787917880178811788217883178841788517886178871788817889178901789117892178931789417895178961789717898178991790017901179021790317904179051790617907179081790917910179111791217913179141791517916179171791817919179201792117922179231792417925179261792717928179291793017931179321793317934179351793617937179381793917940179411794217943179441794517946179471794817949179501795117952179531795417955179561795717958179591796017961179621796317964179651796617967179681796917970179711797217973179741797517976179771797817979179801798117982179831798417985179861798717988179891799017991179921799317994179951799617997179981799918000180011800218003180041800518006180071800818009180101801118012180131801418015180161801718018180191802018021180221802318024180251802618027180281802918030180311803218033180341803518036180371803818039180401804118042180431804418045180461804718048180491805018051180521805318054180551805618057180581805918060180611806218063180641806518066180671806818069180701807118072180731807418075180761807718078180791808018081180821808318084180851808618087180881808918090180911809218093180941809518096180971809818099181001810118102181031810418105181061810718108181091811018111181121811318114181151811618117181181811918120181211812218123181241812518126181271812818129181301813118132181331813418135181361813718138181391814018141181421814318144181451814618147181481814918150181511815218153181541815518156181571815818159181601816118162181631816418165181661816718168181691817018171181721817318174181751817618177181781817918180181811818218183181841818518186181871818818189181901819118192181931819418195181961819718198181991820018201182021820318204182051820618207182081820918210182111821218213182141821518216182171821818219182201822118222182231822418225182261822718228182291823018231182321823318234182351823618237182381823918240182411824218243182441824518246182471824818249182501825118252182531825418255182561825718258182591826018261182621826318264182651826618267182681826918270182711827218273182741827518276182771827818279182801828118282182831828418285182861828718288182891829018291182921829318294182951829618297182981829918300183011830218303183041830518306183071830818309183101831118312183131831418315183161831718318183191832018321183221832318324183251832618327183281832918330183311833218333183341833518336183371833818339183401834118342183431834418345183461834718348183491835018351183521835318354183551835618357183581835918360183611836218363183641836518366183671836818369183701837118372183731837418375183761837718378183791838018381183821838318384183851838618387183881838918390183911839218393183941839518396183971839818399184001840118402184031840418405184061840718408184091841018411184121841318414184151841618417184181841918420184211842218423184241842518426184271842818429184301843118432184331843418435184361843718438184391844018441184421844318444184451844618447184481844918450184511845218453184541845518456184571845818459184601846118462184631846418465184661846718468184691847018471184721847318474184751847618477184781847918480184811848218483184841848518486184871848818489184901849118492184931849418495184961849718498184991850018501185021850318504185051850618507185081850918510185111851218513185141851518516185171851818519185201852118522185231852418525185261852718528185291853018531185321853318534185351853618537185381853918540185411854218543185441854518546185471854818549185501855118552185531855418555185561855718558185591856018561185621856318564185651856618567185681856918570185711857218573185741
857518576185771857818579185801858118582185831858418585185861858718588185891859018591185921859318594185951859618597185981859918600186011860218603186041860518606186071860818609186101861118612186131861418615186161861718618186191862018621186221862318624186251862618627186281862918630186311863218633186341863518636186371863818639186401864118642186431864418645186461864718648186491865018651186521865318654186551865618657186581865918660186611866218663186641866518666186671866818669186701867118672186731867418675186761867718678186791868018681186821868318684186851868618687186881868918690186911869218693186941869518696186971869818699187001870118702187031870418705187061870718708187091871018711187121871318714187151871618717187181871918720187211872218723187241872518726187271872818729187301873118732187331873418735187361873718738187391874018741187421874318744187451874618747187481874918750187511875218753187541875518756187571875818759187601876118762187631876418765187661876718768187691877018771187721877318774187751877618777187781877918780187811878218783187841878518786187871878818789187901879118792187931879418795187961879718798187991880018801188021880318804188051880618807188081880918810188111881218813188141881518816188171881818819188201882118822188231882418825188261882718828188291883018831188321883318834188351883618837188381883918840188411884218843188441884518846188471884818849188501885118852188531885418855188561885718858188591886018861188621886318864188651886618867188681886918870188711887218873188741887518876188771887818879188801888118882188831888418885188861888718888188891889018891188921889318894188951889618897188981889918900189011890218903189041890518906189071890818909189101891118912189131891418915189161891718918189191892018921189221892318924189251892618927189281892918930189311893218933189341893518936189371893818939189401894118942189431894418945189461894718948189491895018951189521895318954189551895618957189581895918960189611896218963189641896518966189671896818969189701897118972189731897418975189761897718978189791898018981189821898318984189851898618987189881898918990189911899218993189941899518996189971899818999190001900119002190031900419005190061900719008190091901019011190121901319014190151901619017190181901919020190211902219023190241902519026190271902819029190301903119032190331903419035190361903719038190391904019041190421904319044190451904619047190481904919050190511905219053190541905519056190571905819059190601906119062190631906419065190661906719068190691907019071190721907319074190751907619077190781907919080190811908219083190841908519086190871908819089190901909119092190931909419095190961909719098190991910019101191021910319104191051910619107191081910919110191111911219113191141911519116191171911819119191201912119122191231912419125191261912719128191291913019131191321913319134191351913619137191381913919140191411914219143191441914519146191471914819149191501915119152191531915419155191561915719158191591916019161191621916319164191651916619167191681916919170191711917219173191741917519176191771917819179191801918119182191831918419185191861918719188191891919019191191921919319194191951919619197191981919919200192011920219203192041920519206192071920819209192101921119212192131921419215192161921719218192191922019221192221922319224192251922619227192281922919230192311923219233192341923519236192371923819239192401924119242192431924419245192461924719248192491925019251192521925319254192551925619257192581925919260192611926219263192641926519266192671926819269192701927119272192731927419275192761927719278192791928019281192821928319284192851
92861928719288192891929019291192921929319294192951929619297192981929919300193011930219303193041930519306193071930819309193101931119312193131931419315193161931719318193191932019321193221932319324193251932619327193281932919330193311933219333193341933519336193371933819339193401934119342193431934419345193461934719348193491935019351193521935319354193551935619357193581935919360193611936219363193641936519366193671936819369193701937119372193731937419375193761937719378193791938019381193821938319384193851938619387193881938919390193911939219393193941939519396193971939819399
#include "llama-model.h"

#include "llama-impl.h"
#include "llama-mmap.h"
#include "llama-batch.h"
#include "llama-cparams.h"
#include "llama-model-loader.h"

#include "llama-kv-cache.h"
#include "llama-kv-cache-iswa.h"
#include "llama-memory-hybrid.h"
#include "llama-memory-recurrent.h"

#include "ggml-cpp.h"

#include <algorithm>
#include <cassert>
#include <cmath>
#include <cfloat>
#include <cstring>
#include <functional>
#include <map>
#include <regex>
#include <sstream>
#include <stdexcept>

const char * llm_type_name(llm_type type) {
    switch (type) {
        case LLM_TYPE_14M:           return "14M";
        case LLM_TYPE_17M:           return "17M";
        case LLM_TYPE_22M:           return "22M";
        case LLM_TYPE_33M:           return "33M";
        case LLM_TYPE_60M:           return "60M";
        case LLM_TYPE_70M:           return "70M";
        case LLM_TYPE_80M:           return "80M";
        case LLM_TYPE_109M:          return "109M";
        case LLM_TYPE_137M:          return "137M";
        case LLM_TYPE_160M:          return "160M";
        case LLM_TYPE_190M:          return "190M";
        case LLM_TYPE_220M:          return "220M";
        case LLM_TYPE_250M:          return "250M";
        case LLM_TYPE_256M:          return "256M";
        case LLM_TYPE_270M:          return "270M";
        case LLM_TYPE_335M:          return "335M";
        case LLM_TYPE_350M:          return "350M";
        case LLM_TYPE_410M:          return "410M";
        case LLM_TYPE_450M:          return "450M";
        case LLM_TYPE_475M:          return "475M";
        case LLM_TYPE_558M:          return "558M";
        case LLM_TYPE_700M:          return "700M";
        case LLM_TYPE_770M:          return "770M";
        case LLM_TYPE_780M:          return "780M";
        case LLM_TYPE_0_3B:          return "0.3B";
        case LLM_TYPE_0_5B:          return "0.5B";
        case LLM_TYPE_0_6B:          return "0.6B";
        case LLM_TYPE_1B:            return "1B";
        case LLM_TYPE_1_2B:          return "1.2B";
        case LLM_TYPE_1_3B:          return "1.3B";
        case LLM_TYPE_1_4B:          return "1.4B";
        case LLM_TYPE_1_5B:          return "1.5B";
        case LLM_TYPE_1_6B:          return "1.6B";
        case LLM_TYPE_1_7B:          return "1.7B";
        case LLM_TYPE_1_8B:          return "1.8B";
        case LLM_TYPE_2B:            return "2B";
        case LLM_TYPE_2_8B:          return "2.8B";
        case LLM_TYPE_2_9B:          return "2.9B";
        case LLM_TYPE_3B:            return "3B";
        case LLM_TYPE_4B:            return "4B";
        case LLM_TYPE_6B:            return "6B";
        case LLM_TYPE_6_9B:          return "6.9B";
        case LLM_TYPE_7B:            return "7B";
        case LLM_TYPE_8B:            return "8B";
        case LLM_TYPE_9B:            return "9B";
        case LLM_TYPE_11B:           return "11B";
        case LLM_TYPE_12B:           return "12B";
        case LLM_TYPE_13B:           return "13B";
        case LLM_TYPE_14B:           return "14B";
        case LLM_TYPE_15B:           return "15B";
        case LLM_TYPE_16B:           return "16B";
        case LLM_TYPE_20B:           return "20B";
        case LLM_TYPE_27B:           return "27B";
        case LLM_TYPE_30B:           return "30B";
        case LLM_TYPE_32B:           return "32B";
        case LLM_TYPE_34B:           return "34B";
        case LLM_TYPE_35B:           return "35B";
        case LLM_TYPE_36B:           return "36B";
        case LLM_TYPE_40B:           return "40B";
        case LLM_TYPE_65B:           return "65B";
        case LLM_TYPE_70B:           return "70B";
        case LLM_TYPE_120B:          return "120B";
        case LLM_TYPE_142B:          return "142B";
        case LLM_TYPE_236B:          return "236B";
        case LLM_TYPE_290B:          return "290B";
        case LLM_TYPE_314B:          return "314B";
        case LLM_TYPE_405B:          return "405B";
        case LLM_TYPE_671B:          return "671B";
        case LLM_TYPE_SMALL:         return "0.1B";
        case LLM_TYPE_MEDIUM:        return "0.4B";
        case LLM_TYPE_LARGE:         return "0.8B";
        case LLM_TYPE_XL:            return "1.5B";
        case LLM_TYPE_A1_7B:         return "A1.7B";
        case LLM_TYPE_A2_7B:         return "A2.7B";
        case LLM_TYPE_8x7B:          return "8x7B";
        case LLM_TYPE_8x22B:         return "8x22B";
        case LLM_TYPE_16x12B:        return "16x12B";
        case LLM_TYPE_16x3_8B:       return "16x3.8B";
        case LLM_TYPE_10B_128x3_66B: return "10B+128x3.66B";
        case LLM_TYPE_57B_A14B:      return "57B.A14B";
        case LLM_TYPE_17B_16E:       return "17Bx16E (Scout)";
        case LLM_TYPE_17B_128E:      return "17Bx128E (Maverick)";
        case LLM_TYPE_A13B:          return "A13B";
        case LLM_TYPE_21B_A3B:       return "21B.A3B";
        case LLM_TYPE_30B_A3B:       return "30B.A3B";
        case LLM_TYPE_106B_A12B:     return "106B.A12B";
        case LLM_TYPE_235B_A22B:     return "235B.A22B";
        case LLM_TYPE_300B_A47B:     return "300B.A47B";
        case LLM_TYPE_355B_A32B:     return "355B.A32B";
        case LLM_TYPE_E2B:           return "E2B";
        case LLM_TYPE_E4B:           return "E4B";
        default:                     return "?B";
    }
}

static const char * llama_expert_gating_func_name(llama_expert_gating_func_type type) {
    switch (type) {
        case LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX: return "softmax";
        case LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID: return "sigmoid";
        default:                                    return "unknown";
    }
}

static const std::map<llama_rope_scaling_type, const char *> LLAMA_ROPE_SCALING_TYPES = {
    { LLAMA_ROPE_SCALING_TYPE_NONE,     "none"     },
    { LLAMA_ROPE_SCALING_TYPE_LINEAR,   "linear"   },
    { LLAMA_ROPE_SCALING_TYPE_YARN,     "yarn"     },
    { LLAMA_ROPE_SCALING_TYPE_LONGROPE, "longrope" },
};

std::string llama_rope_scaling_type_name(llama_rope_scaling_type rope_scaling_type) {
    return LLAMA_ROPE_SCALING_TYPES.at(rope_scaling_type);
}

static llama_rope_scaling_type llama_rope_scaling_type_from_string(const std::string & name) {
    for (const auto & kv : LLAMA_ROPE_SCALING_TYPES) {
        if (kv.second == name) {
            return (llama_rope_scaling_type) kv.first;
        }
    }

    return LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
}

// checks if the weight tensor can be used with the specified buffer type and device
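// (the check builds a dummy op that consumes the weight in a no-alloc ggml context
//  and asks the target device whether it supports that op with the weight placed
//  in the candidate buffer type)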
static bool weight_buft_supported(const llama_hparams & hparams, ggml_tensor * w, ggml_op op, ggml_backend_buffer_type_t buft, ggml_backend_dev_t dev) {
    GGML_ASSERT(w != nullptr);

    if (op == GGML_OP_NONE) {
        return true;
    }

    ggml_init_params params = {
        /*.mem_size   =*/ ggml_tensor_overhead()*8,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ true,
    };

    ggml_context_ptr ctx_ptr { ggml_init(params) };
    if (!ctx_ptr) {
        throw std::runtime_error(format("failed to create ggml context"));
    }
    ggml_context * ctx = ctx_ptr.get();

    ggml_tensor * op_tensor = nullptr;

    switch (op) {
        case GGML_OP_GET_ROWS:
            {
                ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 512);
                op_tensor = ggml_get_rows(ctx, w, b);
            } break;
        case GGML_OP_MUL_MAT:
            {
                ggml_tensor * b = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], 512, w->ne[2], w->ne[3]);
                op_tensor = ggml_mul_mat(ctx, w, b);
            } break;
        case GGML_OP_MUL_MAT_ID:
            {
                int n_expert_used = hparams.n_expert_used;
                ggml_tensor * b = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, w->ne[0], n_expert_used, 512);
                ggml_tensor * ids = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, n_expert_used, 512);
                op_tensor = ggml_mul_mat_id(ctx, w, b, ids);
            } break;
        case GGML_OP_ADD:
            {
                ggml_tensor * a = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], w->ne[1], w->ne[2], w->ne[3]);
                op_tensor = ggml_add(ctx, a, w);
            } break;
        case GGML_OP_ADD_ID:
            {
                int n_expert_used = hparams.n_expert_used;
                ggml_tensor * a = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, w->ne[0], n_expert_used, 512);
                ggml_tensor * c = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, n_expert_used, 512);
                op_tensor = ggml_add_id(ctx, a, w, c);
            } break;
        case GGML_OP_MUL:
            {
                ggml_tensor * a = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], w->ne[1], w->ne[2], w->ne[3]);
                op_tensor = ggml_mul(ctx, a, w);
            } break;
        case GGML_OP_DIV:
            {
                ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, w->ne[0]);
                op_tensor = ggml_div(ctx, a, w);
            } break;
        case GGML_OP_ROPE:
            {
                int n_embd_head = hparams.n_embd_head_v;
                int n_head = hparams.n_head();
                ggml_tensor * a = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_embd_head, n_head, 512);
                ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 512);
                op_tensor = ggml_rope_ext(
                    ctx, a, b, w,
                    0, 0, 0, 0, 0,
                    0, 0, 0, 0
                );
            } break;
        case GGML_OP_SSM_CONV:
            {
                const int64_t n_seq_tokens = 512;
                const int64_t n_seqs       = 3;
                ggml_tensor * conv_x = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, w->ne[0] - 1 + n_seq_tokens, w->ne[1], n_seqs);
                op_tensor = ggml_ssm_conv(ctx, conv_x, w);
            } break;
        case GGML_OP_SSM_SCAN:
            {
                // w is ssm_a, which is used to distinguish Mamba-1 and Mamba-2
                const int64_t d_state      = w->ne[0] == 1 ? hparams.ssm_d_state : w->ne[0];
                const int64_t n_head       = w->ne[1];
                const int64_t head_dim     = hparams.ssm_d_inner / n_head;
                const int64_t n_group      = hparams.ssm_n_group ? hparams.ssm_n_group : 1;
                const int64_t n_seq_tokens = 512;
                const int64_t n_seqs       = 3;
                ggml_tensor * s   = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, d_state, head_dim, n_head, n_seqs);
                ggml_tensor * x   = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, head_dim, n_head, n_seq_tokens, n_seqs);
                ggml_tensor * dt  = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_head, n_seq_tokens, n_seqs);
                ggml_tensor * B   = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, d_state, n_group, n_seq_tokens, n_seqs);
                ggml_tensor * C   = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, d_state, n_group, n_seq_tokens, n_seqs);
                ggml_tensor * ids = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_seqs);
                op_tensor = ggml_ssm_scan(ctx, s, x, dt, w, B, C, ids);
            } break;
        case GGML_OP_RWKV_WKV6:
            {
                // FIXME
                const int64_t S = 123;
                const int64_t H = 123;
                const int64_t n_tokens = 123;
                const int64_t n_seqs   = 123;
                ggml_tensor * k  = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens);
                ggml_tensor * v  = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens);
                ggml_tensor * r  = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens);
                ggml_tensor * tf = w;
                ggml_tensor * td = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens);
                ggml_tensor * state = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, S, n_seqs, S, H);
                op_tensor = ggml_rwkv_wkv6(ctx, k, v, r, tf, td, state);
            } break;
        case GGML_OP_IM2COL:
            {
                const int n_embd = hparams.n_embd;
                ggml_tensor * b = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, n_embd, w->ne[1], 1, 1);
                op_tensor = ggml_im2col(ctx, w, b, 1, 0, 0, 0, 1, 0, false, GGML_TYPE_F16);
            } break;
        case GGML_OP_SCALE:
            {
                op_tensor = ggml_scale(ctx, w, 1.0f);
            } break;
        default:
            GGML_ABORT("%s: missing test for op %s for tensor %s", __func__, ggml_op_name(op), w->name);
    }

    // create a temporary dummy buffer for the weight so that supports_op can check the buffer type
    GGML_ASSERT(w->buffer == nullptr);
    w->buffer = ggml_backend_buft_alloc_buffer(buft, 0);
    bool op_supported = ggml_backend_dev_supports_op(dev, op_tensor);
    ggml_backend_buffer_free(w->buffer);
    w->buffer = nullptr;

    return op_supported;
}

// lists of buffer types used for each layer
using buft_list_t = std::vector<std::pair<ggml_backend_dev_t, ggml_backend_buffer_type_t>>;
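// entries are in priority order: the first buffer type whose device supports the
// op that consumes a given weight is the one that gets used (see select_weight_buft)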

// find the first buffer type in the list that can use the tensor
static ggml_backend_buffer_type_t select_weight_buft(const llama_hparams & hparams, ggml_tensor * tensor, ggml_op op, const buft_list_t & buft_list) {
    GGML_ASSERT(!buft_list.empty());
    for (const auto & cur : buft_list) {
        ggml_backend_dev_t cur_dev = cur.first;
        ggml_backend_buffer_type_t cur_buft = cur.second;
        if (weight_buft_supported(hparams, tensor, op, cur_buft, cur_dev)) {
            return cur_buft;
        }
    }

    return nullptr;
}

// CPU: ACCEL -> GPU host -> CPU extra -> CPU
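// i.e. CPU-resident weights prefer, in order: accelerator buffers, a host (typically
// pinned) buffer of the first GPU, any extra CPU buffer types (e.g. repacked formats),
// and finally the default CPU buffer type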
static buft_list_t make_cpu_buft_list(const std::vector<ggml_backend_dev_t> & devices, bool use_extra_bufts) {
    buft_list_t buft_list;

    // add ACCEL buffer types
    for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
        ggml_backend_dev_t dev = ggml_backend_dev_get(i);
        if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_ACCEL) {
            auto * buft = ggml_backend_dev_buffer_type(dev);
            // skip the default CPU buffer type - it is added last, as the final fallback
            if (buft != ggml_backend_cpu_buffer_type()) {
                buft_list.emplace_back(dev, buft);
            }
        }
    }

    // add a host buffer type
    // storing the tensors in a host buffer is useful when the processing of large batches
    // is offloaded to a GPU device, since it reduces the time spent on data transfers
    // generally, this will be done using the first device in the list
    // a better approach would be to handle this on a weight-by-weight basis using the offload_op
    // function of the device to determine if it would benefit from being stored in a host buffer
    for (auto * dev : devices) {
        ggml_backend_buffer_type_t buft = ggml_backend_dev_host_buffer_type(dev);
        if (buft) {
            buft_list.emplace_back(dev, buft);
            break;
        }
    }

    // add extra buffer types
    if (use_extra_bufts) {
        auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
        if (cpu_dev == nullptr) {
            throw std::runtime_error(format("%s: no CPU backend found", __func__));
        }

        auto * cpu_reg = ggml_backend_dev_backend_reg(cpu_dev);
        auto ggml_backend_dev_get_extra_bufts_fn = (ggml_backend_dev_get_extra_bufts_t)
            ggml_backend_reg_get_proc_address(cpu_reg, "ggml_backend_dev_get_extra_bufts");
        if (ggml_backend_dev_get_extra_bufts_fn) {
            ggml_backend_buffer_type_t * extra_bufts = ggml_backend_dev_get_extra_bufts_fn(cpu_dev);
            while (extra_bufts && *extra_bufts) {
                buft_list.emplace_back(cpu_dev, *extra_bufts);
                ++extra_bufts;
            }
        }
    }

    // add the CPU buffer type
    for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
        ggml_backend_dev_t dev = ggml_backend_dev_get(i);
        if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_CPU) {
            buft_list.emplace_back(dev, ggml_backend_dev_buffer_type(dev));
        }
    }

    return buft_list;
}

// GPU: split if LLAMA_SPLIT_MODE_ROW -> GPU
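// for row-split mode, the backend's split buffer type (which distributes matrix rows
// across the available devices according to tensor_split) takes priority over the
// device's default buffer type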
static buft_list_t make_gpu_buft_list(ggml_backend_dev_t dev, llama_split_mode split_mode, const float * tensor_split) {
    buft_list_t buft_list;

    // add the device split buffer type if requested and available
    if (split_mode == LLAMA_SPLIT_MODE_ROW) {
        ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev);
        auto ggml_backend_split_buffer_type_fn = (ggml_backend_split_buffer_type_t)
            ggml_backend_reg_get_proc_address(reg, "ggml_backend_split_buffer_type");
        if (ggml_backend_split_buffer_type_fn) {
            size_t dev_index = [&]() {
                auto * reg = ggml_backend_dev_backend_reg(dev);
                for (size_t i = 0; i < ggml_backend_reg_dev_count(reg); ++i) {
                    if (ggml_backend_reg_dev_get(reg, i) == dev) {
                        return i;
                    }
                }
                throw std::runtime_error(format("device %s not found in its backend reg", ggml_backend_dev_name(dev)));
            }();
            auto * buft = ggml_backend_split_buffer_type_fn(dev_index, tensor_split);
            if (buft != nullptr) {
                buft_list.emplace_back(dev, buft);
            }
        }
    }

    // add the device default buffer type
    buft_list.emplace_back(dev, ggml_backend_dev_buffer_type(dev));

    return buft_list;
}
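
// private implementation of llama_model: owns the tensor-metadata contexts, the backend
// buffers holding the weights, mmap/mlock state, and the per-layer device/buffer-type
// assignments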
struct llama_model::impl {
    impl() {}
    ~impl() {}

    uint64_t n_elements = 0;

    size_t n_bytes = 0;

    std::string desc_str;

    // model memory mapped files
    llama_mmaps mappings;

    // objects representing data potentially being locked in memory
    llama_mlocks mlock_bufs;
    llama_mlocks mlock_mmaps;

    // contexts where the model tensors metadata is stored
    std::vector<ggml_context_ptr> ctxs;

    // the model memory buffers for the tensor data
    std::vector<ggml_backend_buffer_ptr> bufs;

    buft_list_t cpu_buft_list;
    std::map<ggml_backend_dev_t, buft_list_t> gpu_buft_list;

    struct layer_dev {
        ggml_backend_dev_t dev;
        buft_list_t * buft_list;
    };

    layer_dev dev_input = {};
    layer_dev dev_output = {};
    std::vector<layer_dev> dev_layer;

    bool has_tensor_overrides;
};

llama_model::llama_model(const llama_model_params & params) : params(params), pimpl(std::make_unique<impl>()) {
    pimpl->has_tensor_overrides = params.tensor_buft_overrides && params.tensor_buft_overrides[0].pattern;
}

llama_model::~llama_model() {}

void llama_model::load_stats(llama_model_loader & ml) {
    pimpl->n_elements = ml.n_elements;
    pimpl->n_bytes = ml.n_bytes;
}

void llama_model::load_arch(llama_model_loader & ml) {
    arch = ml.get_arch();
    if (arch == LLM_ARCH_UNKNOWN) {
        throw std::runtime_error("unknown model architecture: '" + ml.get_arch_name() + "'");
    }
}

void llama_model::load_hparams(llama_model_loader & ml) {
    const gguf_context * ctx = ml.meta.get();

    // get metadata as string
    for (int i = 0; i < gguf_get_n_kv(ctx); i++) {
        gguf_type type = gguf_get_kv_type(ctx, i);
        if (type == GGUF_TYPE_ARRAY) {
            continue;
        }
        const char * name = gguf_get_key(ctx, i);
        const std::string value = gguf_kv_to_str(ctx, i);
        gguf_kv.emplace(name, value);
    }

    // get general kv
    ml.get_key(LLM_KV_GENERAL_NAME, name, false);

    // everything past this point is not vocab-related
    if (hparams.vocab_only) {
        return;
    }

    ml.get_key(LLM_KV_CONTEXT_LENGTH,    hparams.n_ctx_train);
    ml.get_key(LLM_KV_EMBEDDING_LENGTH,  hparams.n_embd);
    ml.get_key(LLM_KV_BLOCK_COUNT,       hparams.n_layer);
    ml.get_key(LLM_KV_EXPERT_COUNT,      hparams.n_expert,      false);
    ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used, false);

    if (arch == LLM_ARCH_WAVTOKENIZER_DEC) {
        ml.get_key(LLM_KV_FEATURES_LENGTH, hparams.n_embd_features);

        ml.get_key(LLM_KV_POSNET_EMBEDDING_LENGTH, hparams.posnet.n_embd);
        ml.get_key(LLM_KV_POSNET_BLOCK_COUNT,      hparams.posnet.n_layer);

        ml.get_key(LLM_KV_CONVNEXT_EMBEDDING_LENGTH, hparams.convnext.n_embd);
        ml.get_key(LLM_KV_CONVNEXT_BLOCK_COUNT,      hparams.convnext.n_layer);
    }

    GGML_ASSERT(hparams.n_expert <= LLAMA_MAX_EXPERTS);
    GGML_ASSERT(hparams.n_expert_used <= hparams.n_expert);
    if (hparams.n_expert > 0) {
        GGML_ASSERT(hparams.n_expert_used > 0);
    } else {
        GGML_ASSERT(hparams.n_expert_used == 0);
    }

    std::fill(hparams.n_head_arr.begin(),    hparams.n_head_arr.end(),    0);
    std::fill(hparams.n_head_kv_arr.begin(), hparams.n_head_kv_arr.end(), 0);
    std::fill(hparams.n_ff_arr.begin(),      hparams.n_ff_arr.end(),      0);
    std::fill(
        hparams.recurrent_layer_arr.begin(),
        hparams.recurrent_layer_arr.end(),
        llm_arch_is_recurrent(ml.get_arch()));

    std::fill(hparams.rope_sections.begin(), hparams.rope_sections.end(), 0);
    std::fill(hparams.swa_layers.begin(),    hparams.swa_layers.end(),    0);

    ml.get_key_or_arr(LLM_KV_FEED_FORWARD_LENGTH,  hparams.n_ff_arr,   hparams.n_layer, false);
    ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head_arr, hparams.n_layer, false);

    // n_head_kv is optional, default to n_head
    hparams.n_head_kv_arr = hparams.n_head_arr;

    ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT_KV, hparams.n_head_kv_arr, hparams.n_layer, false);

    bool rope_finetuned = false;
    ml.get_key(LLM_KV_ROPE_SCALING_FINETUNED, rope_finetuned, false);
    hparams.rope_finetuned = rope_finetuned;

    hparams.n_ctx_orig_yarn = hparams.n_ctx_train;
    ml.get_key(LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, hparams.n_ctx_orig_yarn, false);

    // rope_freq_base (optional)
    hparams.rope_freq_base_train = 10000.0f;
    ml.get_key(LLM_KV_ROPE_FREQ_BASE, hparams.rope_freq_base_train, false);

    std::string rope_scaling("linear");
    ml.get_key(LLM_KV_ROPE_SCALING_TYPE, rope_scaling, false);
    hparams.rope_scaling_type_train = llama_rope_scaling_type_from_string(rope_scaling);
    GGML_ASSERT(hparams.rope_scaling_type_train != LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED);

    // rope_freq_scale (inverse of the kv) is optional
    float ropescale = 0.0f;
    if (!ml.get_key(LLM_KV_ROPE_SCALING_FACTOR, ropescale, false)) {
        // try the old key name
        ml.get_key(LLM_KV_ROPE_SCALE_LINEAR, ropescale, false);
    }
    hparams.rope_freq_scale_train = ropescale == 0.0f ? 1.0f : 1.0f/ropescale;
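    // e.g. a rope scaling factor of 4.0 stored in the GGUF yields rope_freq_scale_train = 0.25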

    // by default assume that the sliding-window layers use the same scaling type as the non-sliding-window layers
    hparams.rope_freq_base_train_swa  = hparams.rope_freq_base_train;
    hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;

    ml.get_key(LLM_KV_ROPE_SCALING_ATTN_FACTOR, hparams.rope_attn_factor, false);

    // non-transformer models do not have attention heads
    if (hparams.n_head() > 0) {
        // gpt-neox n_rot = rotary_pct * (n_embd / n_head)
        // gpt-j n_rot = rotary_dim
        hparams.n_embd_head_k = hparams.n_embd / hparams.n_head();
        ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH, hparams.n_embd_head_k, false);

        hparams.n_embd_head_v = hparams.n_embd / hparams.n_head();
        ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH, hparams.n_embd_head_v, false);

        // sanity check for n_rot (optional)
        hparams.n_rot = hparams.n_embd_head_k;
        ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot, false);

        if (arch == LLM_ARCH_LLAMA || arch == LLM_ARCH_DECI || arch == LLM_ARCH_FALCON) {
            if (hparams.n_rot != hparams.n_embd_head_k) {
                throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd_head_k));
            }
        }
    } else {
        hparams.n_rot = 0;
        hparams.n_embd_head_k = 0;
        hparams.n_embd_head_v = 0;
    }

    // for differentiating model types
    uint32_t n_vocab = 0;
    ml.get_key(LLM_KV_VOCAB_SIZE, n_vocab, false) || ml.get_arr_n(LLM_KV_TOKENIZER_LIST, n_vocab, false);
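    // (short-circuit: fall back to the tokenizer token-list length when the vocab-size key is absent)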

    // for classifier models
    ml.get_arr(LLM_KV_CLASSIFIER_OUTPUT_LABELS, classifier_labels, false);
    if (!classifier_labels.empty()) {
        hparams.n_cls_out = classifier_labels.size();
    }

    // arch-specific KVs
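    // each architecture reads its own keys and maps layer/expert counts (and sometimes
    // the embedding width or vocab size) to an llm_type label used for reporting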
    switch (arch) {
        case LLM_ARCH_LLAMA:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);

                if (hparams.n_expert == 8) {
                    switch (hparams.n_layer) {
                        case 32: type = LLM_TYPE_8x7B; break;
                        case 56: type = LLM_TYPE_8x22B; break;
                        default: type = LLM_TYPE_UNKNOWN;
                    }
                } else {
                    switch (hparams.n_layer) {
                        case 16: type = LLM_TYPE_1B; break; // Llama 3.2 1B
                        case 22: type = LLM_TYPE_1B; break;
                        case 26: type = LLM_TYPE_3B; break;
                        case 28: type = LLM_TYPE_3B; break; // Llama 3.2 3B
                        case 30: type = LLM_TYPE_256M; break; // smoldocling 256M
                        // granite uses a vocab with len 49152
                        case 32: type = n_vocab == 49152 ? LLM_TYPE_3B : (n_vocab < 40000 ? LLM_TYPE_7B : LLM_TYPE_8B); break;
                        case 36: type = LLM_TYPE_8B; break; // granite
                        case 40: type = LLM_TYPE_13B; break;
                        case 48: type = LLM_TYPE_34B; break;
                        case 60: type = LLM_TYPE_30B; break;
                        case 80: type = hparams.n_head() == hparams.n_head_kv() ? LLM_TYPE_65B : LLM_TYPE_70B; break;
                        default: type = LLM_TYPE_UNKNOWN;
                    }
                }
            } break;
        case LLM_ARCH_LLAMA4:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,  hparams.n_ff_exp);
                ml.get_key(LLM_KV_INTERLEAVE_MOE_LAYER_STEP,   hparams.n_moe_layer_step);

                hparams.swa_type = LLAMA_SWA_TYPE_CHUNKED;
                hparams.n_swa = 8192; // should this be a gguf kv? currently it's the same for Scout and Maverick
                hparams.set_swa_pattern(4); // pattern: 3 chunked - 1 full

                switch (hparams.n_expert) {
                    case 16:  type = LLM_TYPE_17B_16E; break;
                    case 128: type = LLM_TYPE_17B_128E; break;
                    default:  type = LLM_TYPE_UNKNOWN;
                }

                if (type == LLM_TYPE_17B_128E) {
                    hparams.use_kq_norm = false;
                }
            } break;
        case LLM_ARCH_ARCEE:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);

                // Arcee uses the same structure as Llama
                switch (hparams.n_layer) {
                    case 36: type = LLM_TYPE_4B; break;
                    default: type = LLM_TYPE_UNKNOWN;
                }
            } break;
        case LLM_ARCH_DECI:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                switch (hparams.n_layer) {
                    case 32:  type = LLM_TYPE_7B; break;
                    case 80:  type = LLM_TYPE_70B; break;
                    case 162: type = LLM_TYPE_405B; break;
                    default:  type = LLM_TYPE_UNKNOWN;
                }
            } break;
        case LLM_ARCH_MINICPM:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                ml.get_key(LLM_KV_EMBEDDING_SCALE, hparams.f_embedding_scale);
                ml.get_key(LLM_KV_RESIDUAL_SCALE,  hparams.f_residual_scale);
                ml.get_key(LLM_KV_LOGIT_SCALE,     hparams.f_logit_scale);

                // MiniCPM uses rope by default, unlike Granite which uses it as a switch
                hparams.rope_finetuned = true;

                switch (hparams.n_layer) {
                    case 52: type = LLM_TYPE_1B; break;
                    case 40: type = LLM_TYPE_2B; break;
                    default: type = LLM_TYPE_UNKNOWN;
                }
            } break;
        case LLM_ARCH_MINICPM3:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                ml.get_key(LLM_KV_ATTENTION_Q_LORA_RANK,  hparams.n_lora_q);
                ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK, hparams.n_lora_kv);

                switch (hparams.n_layer) {
                    case 62: type = LLM_TYPE_4B; break;
                    default: type = LLM_TYPE_UNKNOWN;
                }
            } break;
        case LLM_ARCH_GROK:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);

                switch (hparams.n_layer) {
                    case 64: type = LLM_TYPE_314B; break;
                    default: type = LLM_TYPE_UNKNOWN;
                }
            } break;
        case LLM_ARCH_FALCON:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);

                switch (hparams.n_layer) {
                    case 32: type = LLM_TYPE_7B; break;
                    case 60: type = LLM_TYPE_40B; break;
                    default: type = LLM_TYPE_UNKNOWN;
                }
            } break;
        case LLM_ARCH_BAICHUAN:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                switch (hparams.n_layer) {
                    case 32: type = LLM_TYPE_7B; break;
                    case 40: type = LLM_TYPE_13B; break;
                    default: type = LLM_TYPE_UNKNOWN;
                }

                if (type == LLM_TYPE_13B) {
                    // TODO: become GGUF KV parameter
                    hparams.f_max_alibi_bias = 8.0f;
                }
            } break;
        case LLM_ARCH_STARCODER:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
                switch (hparams.n_layer) {
                    case 24: type = LLM_TYPE_1B; break;
                    case 36: type = LLM_TYPE_3B; break;
                    case 42: type = LLM_TYPE_7B; break;
                    case 40: type = LLM_TYPE_15B; break;
                    default: type = LLM_TYPE_UNKNOWN;
                }
            } break;
        case LLM_ARCH_REFACT:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                switch (hparams.n_layer) {
                    case 32: type = LLM_TYPE_1B; break;
                    default: type = LLM_TYPE_UNKNOWN;
                }

                // TODO: become GGUF KV parameter
                hparams.f_max_alibi_bias = 8.0f;
            } break;
  650. case LLM_ARCH_BERT:
  651. {
  652. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  653. ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
  654. ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
  655. switch (hparams.n_layer) {
  656. case 3:
  657. type = LLM_TYPE_17M; break; // bge-micro
  658. case 6:
  659. type = LLM_TYPE_22M; break; // MiniLM-L6
  660. case 12:
  661. switch (hparams.n_embd) {
  662. case 384: type = LLM_TYPE_33M; break; // MiniLM-L12, bge-small
  663. case 768: type = LLM_TYPE_109M; break; // bge-base
  664. default: type = LLM_TYPE_UNKNOWN;
  665. } break;
  666. case 24:
  667. type = LLM_TYPE_335M; break; // bge-large
  668. default: type = LLM_TYPE_UNKNOWN;
  669. }
  670. } break;
  671. case LLM_ARCH_JINA_BERT_V2:
  672. {
  673. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  674. ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
  675. ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
  676. hparams.f_max_alibi_bias = 8.0f;
  677. switch (hparams.n_layer) {
  678. case 4: type = LLM_TYPE_33M; break; // jina-embeddings-small
  679. case 12: type = LLM_TYPE_137M; break; // jina-embeddings-base
  680. default: type = LLM_TYPE_UNKNOWN;
  681. }
  682. } break;
  683. case LLM_ARCH_JINA_BERT_V3:
  684. {
  685. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  686. ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
  687. ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
  688. switch (hparams.n_layer) {
  689. case 24:
  690. type = LLM_TYPE_558M; break;
  691. default: type = LLM_TYPE_UNKNOWN;
  692. }
  693. } break;
  694. case LLM_ARCH_NOMIC_BERT:
  695. case LLM_ARCH_NOMIC_BERT_MOE:
  696. {
  697. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  698. ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
  699. ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
  700. ml.get_key(LLM_KV_MOE_EVERY_N_LAYERS, hparams.moe_every_n_layers, 0);
  701. if (hparams.n_layer == 12 && hparams.n_embd == 768) {
  702. if (arch == LLM_ARCH_NOMIC_BERT) {
  703. type = LLM_TYPE_137M;
  704. } else if (arch == LLM_ARCH_NOMIC_BERT_MOE && hparams.moe_every_n_layers == 2) {
  705. type = LLM_TYPE_475M;
  706. }
  707. }
  708. } break;
  709. case LLM_ARCH_NEO_BERT:
  710. {
  711. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  712. ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
  713. ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
  714. if (hparams.n_layer == 28) {
  715. type = LLM_TYPE_250M;
  716. }
  717. } break;
  718. case LLM_ARCH_BLOOM:
  719. {
  720. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  721. switch (hparams.n_layer) {
  722. case 24: type = LLM_TYPE_1B; break;
  723. case 30:
  724. switch (hparams.n_embd) {
  725. case 2560: type = LLM_TYPE_3B; break;
  726. case 4096: type = LLM_TYPE_7B; break;
  727. default: type = LLM_TYPE_UNKNOWN;
  728. } break;
  729. default: type = LLM_TYPE_UNKNOWN;
  730. }
  731. // TODO: become GGUF KV parameter
  732. hparams.f_max_alibi_bias = 8.0f;
  733. } break;
  734. case LLM_ARCH_MPT:
  735. {
  736. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  737. ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV, hparams.f_clamp_kqv, false);
  738. ml.get_key(LLM_KV_ATTENTION_MAX_ALIBI_BIAS, hparams.f_max_alibi_bias);
  739. switch (hparams.n_layer) {
  740. case 32: type = LLM_TYPE_7B; break;
  741. case 48: type = LLM_TYPE_30B; break;
  742. default: type = LLM_TYPE_UNKNOWN;
  743. }
  744. } break;
  745. case LLM_ARCH_STABLELM:
  746. {
  747. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  748. switch (hparams.n_layer) {
  749. case 24: type = LLM_TYPE_1B; break;
  750. case 32: type = LLM_TYPE_3B; break;
  751. case 40: type = LLM_TYPE_12B; break;
  752. default: type = LLM_TYPE_UNKNOWN;
  753. }
  754. } break;
  755. case LLM_ARCH_QWEN:
  756. {
  757. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  758. switch (hparams.n_layer) {
  759. case 32: type = LLM_TYPE_7B; break;
  760. case 40: type = LLM_TYPE_13B; break;
  761. default: type = LLM_TYPE_UNKNOWN;
  762. }
  763. } break;
  764. case LLM_ARCH_QWEN2VL:
  765. {
  766. ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, true);
  767. }
  768. // fall through
  769. case LLM_ARCH_QWEN2:
  770. {
  771. ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
  772. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  773. switch (hparams.n_layer) {
  774. case 24: type = hparams.n_embd == 1024 ? LLM_TYPE_0_5B : LLM_TYPE_1B; break;
  775. case 28: type = hparams.n_embd == 1536 ? LLM_TYPE_1_5B : LLM_TYPE_7B; break;
  776. case 32: type = LLM_TYPE_7B; break;
  777. case 36: type = LLM_TYPE_3B; break;
  778. case 40: type = hparams.n_head() == 20 ? LLM_TYPE_4B : LLM_TYPE_13B; break;
  779. case 48: type = LLM_TYPE_14B; break;
  780. case 64: type = LLM_TYPE_32B; break;
  781. case 80: type = LLM_TYPE_70B; break;
  782. default: type = LLM_TYPE_UNKNOWN;
  783. }
  784. } break;
  785. case LLM_ARCH_DREAM:
  786. {
  787. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  788. // Dream models are primarily 7B with 28 layers
  789. switch (hparams.n_layer) {
  790. case 28:
  791. type = LLM_TYPE_7B;
  792. break;
  793. default:
  794. type = LLM_TYPE_UNKNOWN;
  795. }
  796. // Set non-causal attention for diffusion models
  797. hparams.causal_attn = false;
  798. }
  799. break;
  800. case LLM_ARCH_LLADA:
  801. {
  802. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  803. // LLaDA-8B has 32 layers, similar to LLaMA but for diffusion
  804. switch (hparams.n_layer) {
  805. case 32:
  806. type = LLM_TYPE_8B;
  807. break;
  808. default:
  809. type = LLM_TYPE_UNKNOWN;
  810. }
  811. // Set non-causal attention for diffusion models
  812. hparams.causal_attn = false;
  813. }
  814. break;
  815. case LLM_ARCH_QWEN2MOE:
  816. {
  817. ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
  818. ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false);
  819. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  820. switch (hparams.n_layer) {
  821. case 24: type = LLM_TYPE_A2_7B; break;
  822. case 28: type = LLM_TYPE_57B_A14B; break;
  823. default: type = LLM_TYPE_UNKNOWN;
  824. }
  825. } break;
  826. case LLM_ARCH_QWEN3:
  827. {
  828. ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
  829. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  830. switch (hparams.n_layer) {
  831. case 28: type = hparams.n_embd == 1024 ? LLM_TYPE_0_6B : LLM_TYPE_1_7B; break;
  832. case 36: type = hparams.n_embd == 2560 ? LLM_TYPE_4B : LLM_TYPE_8B; break;
  833. case 40: type = LLM_TYPE_14B; break;
  834. case 64: type = LLM_TYPE_32B; break;
  835. default: type = LLM_TYPE_UNKNOWN;
  836. }
  837. } break;
  838. case LLM_ARCH_QWEN3MOE:
  839. {
  840. ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
  841. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  842. switch (hparams.n_layer) {
  843. case 48: type = LLM_TYPE_30B_A3B; break;
  844. case 94: type = LLM_TYPE_235B_A22B; break;
  845. default: type = LLM_TYPE_UNKNOWN;
  846. }
  847. } break;
  848. case LLM_ARCH_PHI2:
  849. {
  850. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  851. switch (hparams.n_layer) {
  852. case 24: type = LLM_TYPE_1B; break;
  853. case 32: type = LLM_TYPE_3B; break;
  854. default: type = LLM_TYPE_UNKNOWN;
  855. }
  856. } break;
  857. case LLM_ARCH_PHI3:
  858. {
  859. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  860. switch (hparams.n_layer) {
  861. case 24: type = LLM_TYPE_1B; break;
  862. case 32: type = LLM_TYPE_3B; break;
  863. case 40: type = LLM_TYPE_14B; break;
  864. default: type = LLM_TYPE_UNKNOWN;
  865. }
  866. const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
  867. if (found_swa && hparams.n_swa > 0) {
  868. LLAMA_LOG_WARN("%s: Phi SWA is currently disabled - results might be suboptimal for some models (see %s)\n",
  869. __func__, "https://github.com/ggml-org/llama.cpp/pull/13676");
  870. // TODO: fix conversion scripts to correctly populate `n_swa` and `n_swa_pattern`
  871. hparams.swa_type = LLAMA_SWA_TYPE_NONE;
  872. hparams.n_swa = 0;
  873. hparams.set_swa_pattern(1);
  874. }
  875. } break;
  876. case LLM_ARCH_PHIMOE:
  877. {
  878. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  879. switch (hparams.n_layer) {
  880. case 32: type = LLM_TYPE_16x3_8B; break;
  881. default: type = LLM_TYPE_UNKNOWN;
  882. }
  883. } break;
  884. case LLM_ARCH_PLAMO:
  885. {
  886. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  887. switch (hparams.n_layer) {
  888. case 40: type = LLM_TYPE_13B; break;
  889. default: type = LLM_TYPE_UNKNOWN;
  890. }
  891. } break;
  892. case LLM_ARCH_PLAMO2:
  893. {
  894. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  895. // Load Mamba SSM parameters
  896. ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv);
  897. ml.get_key(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner);
  898. ml.get_key(LLM_KV_SSM_STATE_SIZE, hparams.ssm_d_state);
  899. ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
  900. ml.get_key(LLM_KV_SSM_GROUP_COUNT, hparams.ssm_n_group);
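// as in the other hybrid architectures below (Jamba, NEMOTRON_H, GRANITE_HYBRID), a layer is
// treated as recurrent (SSM) when its n_head_kv value is 0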
  901. for (uint32_t i = 0; i < hparams.n_layer; ++i) {
  902. hparams.recurrent_layer_arr[i] = hparams.n_head_kv(i) == 0;
  903. }
  904. switch (hparams.n_layer) {
  905. case 16: type = LLM_TYPE_1B; break;
  906. case 32:
  907. if (hparams.n_embd == 2048) {
  908. type = LLM_TYPE_2B;
  909. } else if (hparams.n_embd == 4096) {
  910. type = LLM_TYPE_8B;
  911. }
  912. break;
  913. default: type = LLM_TYPE_UNKNOWN;
  914. }
  915. } break;
  916. case LLM_ARCH_GPT2:
  917. {
  918. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  919. switch (hparams.n_layer) {
  920. case 12: type = LLM_TYPE_SMALL; break;
  921. case 24: type = LLM_TYPE_MEDIUM; break;
  922. case 36: type = LLM_TYPE_LARGE; break;
  923. case 48: type = LLM_TYPE_XL; break;
  924. default: type = LLM_TYPE_UNKNOWN;
  925. }
  926. } break;
  927. case LLM_ARCH_CODESHELL:
  928. {
  929. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  930. switch (hparams.n_layer) {
  931. case 42: type = LLM_TYPE_7B; break;
  932. default: type = LLM_TYPE_UNKNOWN;
  933. }
  934. } break;
  935. case LLM_ARCH_ORION:
  936. {
  937. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  938. switch (hparams.n_layer) {
  939. case 40: type = LLM_TYPE_14B; break;
  940. default: type = LLM_TYPE_UNKNOWN;
  941. }
  942. } break;
  943. case LLM_ARCH_INTERNLM2:
  944. {
  945. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  946. switch (hparams.n_layer) {
  947. case 32: type = LLM_TYPE_7B; break;
  948. case 48: type = LLM_TYPE_20B; break;
  949. default: type = LLM_TYPE_UNKNOWN;
  950. }
  951. } break;
  952. case LLM_ARCH_GEMMA:
  953. {
  954. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  955. switch (hparams.n_layer) {
  956. case 18: type = LLM_TYPE_2B; break;
  957. case 28: type = LLM_TYPE_7B; break;
  958. default: type = LLM_TYPE_UNKNOWN;
  959. }
  960. } break;
  961. case LLM_ARCH_GEMMA2:
  962. {
  963. hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
  964. hparams.n_swa = 4096; // default value of gemma 2
  965. hparams.set_swa_pattern(2);
  966. hparams.attn_soft_cap = true;
  967. ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
  968. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  969. ml.get_key(LLM_KV_ATTN_LOGIT_SOFTCAPPING, hparams.f_attn_logit_softcapping, false);
  970. ml.get_key(LLM_KV_FINAL_LOGIT_SOFTCAPPING, hparams.f_final_logit_softcapping, false);
  971. switch (hparams.n_layer) {
  972. case 26: type = LLM_TYPE_2B; break;
  973. case 42: type = LLM_TYPE_9B; break;
  974. case 46: type = LLM_TYPE_27B; break;
  975. default: type = LLM_TYPE_UNKNOWN;
  976. }
  977. // ref: https://github.com/google/gemma_pytorch/blob/014acb7ac4563a5f77c76d7ff98f31b568c16508/gemma/config.py#L173
  978. hparams.f_attention_scale = type == LLM_TYPE_27B
  979. ? 1.0f / std::sqrt(float(hparams.n_embd / hparams.n_head(0)))
  980. : 1.0f / std::sqrt(float(hparams.n_embd_head_k));
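// worked example (illustrative; assumes the published 27B config: n_embd = 4608, n_head = 32, head size 128):
//   27B:    1/sqrt(4608/32) = 1/sqrt(144) ~ 0.083, i.e. scaled by hidden-size-per-head rather than head size
//   others: 1/sqrt(n_embd_head_k), e.g. 1/sqrt(256) ~ 0.0625 for a 256-dim head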
  981. } break;
  982. case LLM_ARCH_GEMMA3:
  983. {
  984. hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
  985. hparams.set_swa_pattern(6);
  986. hparams.rope_freq_base_train_swa = 10000.0f;
  987. hparams.rope_freq_scale_train_swa = 1.0f;
  988. ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
  989. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  990. switch (hparams.n_layer) {
  991. case 18: type = LLM_TYPE_270M; break;
  992. case 26: type = LLM_TYPE_1B; break;
  993. case 34: type = LLM_TYPE_4B; break;
  994. case 48: type = LLM_TYPE_12B; break;
  995. case 62: type = LLM_TYPE_27B; break;
  996. default: type = LLM_TYPE_UNKNOWN;
  997. }
  998. // ref: https://github.com/google/gemma_pytorch/blob/014acb7ac4563a5f77c76d7ff98f31b568c16508/gemma/config.py#L289
  999. hparams.f_attention_scale = type == LLM_TYPE_27B
  1000. ? 1.0f / std::sqrt(float(hparams.n_embd / hparams.n_head(0)))
  1001. : 1.0f / std::sqrt(float(hparams.n_embd_head_k));
  1002. } break;
  1003. case LLM_ARCH_GEMMA3N:
  1004. {
  1005. hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
  1006. hparams.set_swa_pattern(5);
  1007. hparams.n_layer_kv_from_start = 20;
  1008. hparams.rope_freq_base_train_swa = 10000.0f;
  1009. hparams.rope_freq_scale_train_swa = 1.0f;
  1010. hparams.f_attention_scale = 1.0f;
  1011. ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
  1012. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1013. switch (hparams.n_layer) {
  1014. case 30: type = LLM_TYPE_E2B; break;
  1015. case 35: type = LLM_TYPE_E4B; break;
  1016. default: type = LLM_TYPE_UNKNOWN;
  1017. }
  1018. } break;
  1019. case LLM_ARCH_GEMMA_EMBEDDING:
  1020. {
  1021. hparams.swa_type = LLAMA_SWA_TYPE_SYMMETRIC;
  1022. hparams.set_swa_pattern(6);
  1023. hparams.causal_attn = false; // embeddings do not use causal attention
  1024. hparams.rope_freq_base_train_swa = 10000.0f;
  1025. hparams.rope_freq_scale_train_swa = 1.0f;
  1026. ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
  1027. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1028. ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
  1029. switch (hparams.n_layer) {
  1030. case 24: type = LLM_TYPE_0_3B; break;
  1031. default: type = LLM_TYPE_UNKNOWN;
  1032. }
  1033. hparams.f_attention_scale = 1.0f / std::sqrt(float(hparams.n_embd_head_k));
  1034. } break;
  1035. case LLM_ARCH_STARCODER2:
  1036. {
  1037. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  1038. switch (hparams.n_layer) {
  1039. case 30: type = LLM_TYPE_3B; break;
  1040. case 32: type = LLM_TYPE_7B; break;
  1041. case 40: type = LLM_TYPE_15B; break;
  1042. case 52: type = LLM_TYPE_20B; break; // granite
  1043. case 88: type = LLM_TYPE_34B; break; // granite
  1044. default: type = LLM_TYPE_UNKNOWN;
  1045. }
  1046. } break;
  1047. case LLM_ARCH_MAMBA:
  1048. {
  1049. ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv);
  1050. ml.get_key(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner);
  1051. ml.get_key(LLM_KV_SSM_STATE_SIZE, hparams.ssm_d_state);
  1052. ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
  1053. ml.get_key(LLM_KV_SSM_DT_B_C_RMS, hparams.ssm_dt_b_c_rms, false);
  1054. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1055. switch (hparams.n_layer) {
  1056. case 24:
  1057. switch (hparams.n_embd) {
  1058. case 768: type = LLM_TYPE_SMALL; break;
  1059. default: type = LLM_TYPE_UNKNOWN;
  1060. } break;
  1061. case 48:
  1062. switch (hparams.n_embd) {
  1063. case 1024: type = LLM_TYPE_MEDIUM; break;
  1064. case 1536: type = LLM_TYPE_LARGE; break;
  1065. case 2048: type = LLM_TYPE_XL; break;
  1066. default: type = LLM_TYPE_UNKNOWN;
  1067. } break;
  1068. case 64:
  1069. switch (hparams.n_embd) {
  1070. case 2560: type = LLM_TYPE_3B; break;
  1071. default: type = LLM_TYPE_UNKNOWN;
  1072. } break;
  1073. default: type = LLM_TYPE_UNKNOWN;
  1074. }
  1075. } break;
  1076. case LLM_ARCH_MAMBA2:
  1077. {
  1078. ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv);
  1079. ml.get_key(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner);
  1080. ml.get_key(LLM_KV_SSM_STATE_SIZE, hparams.ssm_d_state);
  1081. ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
  1082. ml.get_key(LLM_KV_SSM_GROUP_COUNT, hparams.ssm_n_group);
  1083. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1084. switch (hparams.n_layer) {
  1085. case 24:
  1086. switch (hparams.n_embd) {
  1087. case 768: type = LLM_TYPE_SMALL; break;
  1088. default: type = LLM_TYPE_UNKNOWN;
  1089. } break;
  1090. case 48:
  1091. switch (hparams.n_embd) {
  1092. case 1024: type = LLM_TYPE_MEDIUM; break;
  1093. case 1536: type = LLM_TYPE_LARGE; break;
  1094. case 2048: type = LLM_TYPE_XL; break;
  1095. default: type = LLM_TYPE_UNKNOWN;
  1096. } break;
  1097. case 64:
  1098. switch (hparams.n_embd) {
  1099. case 2560: type = LLM_TYPE_3B; break;
  1100. case 4096: type = LLM_TYPE_7B; break;
  1101. default: type = LLM_TYPE_UNKNOWN;
  1102. } break;
  1103. default: type = LLM_TYPE_UNKNOWN;
  1104. }
  1105. } break;
  1106. case LLM_ARCH_JAMBA:
  1107. {
  1108. ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv);
  1109. ml.get_key(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner);
  1110. ml.get_key(LLM_KV_SSM_STATE_SIZE, hparams.ssm_d_state);
  1111. ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
  1112. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1113. for (uint32_t i = 0; i < hparams.n_layer; ++i) {
  1114. hparams.recurrent_layer_arr[i] = hparams.n_head_kv(i) == 0;
  1115. }
  1116. switch (hparams.n_layer) {
1117. // TODO: Jamba layers are a bit heterogeneous, so naming this is hard.
  1118. case 12: // 900M 8x???M
  1119. case 32: // 51B 16x?B
  1120. default: type = LLM_TYPE_UNKNOWN;
  1121. }
  1122. } break;
  1123. case LLM_ARCH_XVERSE:
  1124. {
  1125. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1126. switch (hparams.n_layer) {
  1127. case 32: type = LLM_TYPE_7B; break;
  1128. case 40: type = LLM_TYPE_13B; break;
  1129. case 80: type = LLM_TYPE_65B; break;
  1130. default: type = LLM_TYPE_UNKNOWN;
  1131. }
  1132. } break;
  1133. case LLM_ARCH_COMMAND_R:
  1134. {
  1135. ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
  1136. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  1137. switch (hparams.n_layer) {
  1138. case 40: type = LLM_TYPE_35B; break;
  1139. default: type = LLM_TYPE_UNKNOWN;
  1140. }
  1141. } break;
  1142. case LLM_ARCH_COHERE2:
  1143. {
  1144. hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
  1145. hparams.set_swa_pattern(4);
  1146. ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
  1147. ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
  1148. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  1149. switch (hparams.n_layer) {
  1150. case 32: type = LLM_TYPE_8B; break;
  1151. default: type = LLM_TYPE_UNKNOWN;
  1152. }
  1153. } break;
  1154. case LLM_ARCH_DBRX:
  1155. {
  1156. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  1157. ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV, hparams.f_clamp_kqv);
  1158. switch (hparams.n_layer) {
  1159. case 40: type = LLM_TYPE_16x12B; break;
  1160. default: type = LLM_TYPE_UNKNOWN;
  1161. }
  1162. } break;
  1163. case LLM_ARCH_OLMO:
  1164. {
  1165. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  1166. ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV, hparams.f_clamp_kqv, false);
  1167. switch (hparams.n_layer) {
  1168. case 22: type = LLM_TYPE_1B; break;
  1169. case 32: type = LLM_TYPE_7B; break;
  1170. case 80: type = LLM_TYPE_70B; break;
  1171. default: type = LLM_TYPE_UNKNOWN;
  1172. }
  1173. } break;
  1174. case LLM_ARCH_OLMO2:
  1175. {
  1176. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1177. switch (hparams.n_layer) {
  1178. case 16: type = LLM_TYPE_1B; break;
  1179. case 32: type = LLM_TYPE_7B; break;
  1180. case 40: type = LLM_TYPE_13B; break;
  1181. case 64: type = LLM_TYPE_32B; break;
  1182. default: type = LLM_TYPE_UNKNOWN;
  1183. }
  1184. } break;
  1185. case LLM_ARCH_SEED_OSS:
  1186. {
  1187. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1188. switch (hparams.n_layer) {
  1189. case 64: type = LLM_TYPE_36B; break;
  1190. default: type = LLM_TYPE_UNKNOWN;
  1191. }
  1192. } break;
  1193. case LLM_ARCH_OLMOE:
  1194. {
  1195. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1196. switch (hparams.n_layer) {
  1197. case 16: type = LLM_TYPE_A1_7B; break;
  1198. default: type = LLM_TYPE_UNKNOWN;
  1199. }
  1200. } break;
  1201. case LLM_ARCH_OPENELM:
  1202. {
  1203. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1204. switch (hparams.n_layer) {
  1205. case 16: type = LLM_TYPE_270M; break;
  1206. case 20: type = LLM_TYPE_450M; break;
  1207. case 28: type = LLM_TYPE_1B; break;
  1208. case 36: type = LLM_TYPE_3B; break;
  1209. default: type = LLM_TYPE_UNKNOWN;
  1210. }
  1211. } break;
  1212. case LLM_ARCH_GPTNEOX:
  1213. {
  1214. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  1215. ml.get_key(LLM_KV_USE_PARALLEL_RESIDUAL, hparams.use_par_res);
  1216. switch (hparams.n_layer) {
  1217. case 6:
  1218. switch (hparams.n_ff()) {
  1219. case 512: type = LLM_TYPE_14M; break;
  1220. case 2048: type = LLM_TYPE_70M; break;
  1221. default: type = LLM_TYPE_UNKNOWN;
  1222. } break;
  1223. case 12:
  1224. switch (hparams.n_ff()) {
  1225. case 3072: type = LLM_TYPE_160M; break;
  1226. default: type = LLM_TYPE_UNKNOWN;
  1227. } break;
  1228. case 16:
  1229. switch (hparams.n_ff()) {
  1230. case 8192: type = LLM_TYPE_1B; break;
  1231. default: type = LLM_TYPE_UNKNOWN;
  1232. } break;
  1233. case 24:
  1234. switch (hparams.n_ff()) {
  1235. case 4096: type = LLM_TYPE_410M; break;
  1236. case 8192: type = LLM_TYPE_1_4B; break;
  1237. default: type = LLM_TYPE_UNKNOWN;
  1238. } break;
  1239. case 32:
  1240. switch (hparams.n_ff()) {
  1241. case 10240: type = LLM_TYPE_2_8B; break;
  1242. case 16384: type = LLM_TYPE_6_9B; break;
  1243. default: type = LLM_TYPE_UNKNOWN;
  1244. } break;
  1245. case 36:
  1246. switch (hparams.n_ff()) {
  1247. case 20480: type = LLM_TYPE_12B; break;
  1248. default: type = LLM_TYPE_UNKNOWN;
  1249. } break;
  1250. case 44:
  1251. switch (hparams.n_ff()) {
  1252. case 24576: type = LLM_TYPE_20B; break;
  1253. default: type = LLM_TYPE_UNKNOWN;
  1254. } break;
  1255. default: type = LLM_TYPE_UNKNOWN;
  1256. }
  1257. } break;
  1258. case LLM_ARCH_ARCTIC:
  1259. {
  1260. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1261. if (hparams.n_expert == 128) {
  1262. switch (hparams.n_layer) {
  1263. case 35: type = LLM_TYPE_10B_128x3_66B; break;
  1264. default: type = LLM_TYPE_UNKNOWN;
  1265. }
  1266. } else {
  1267. type = LLM_TYPE_UNKNOWN;
  1268. }
  1269. } break;
  1270. case LLM_ARCH_DEEPSEEK:
  1271. {
  1272. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1273. ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
  1274. ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
  1275. ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
  1276. ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
  1277. switch (hparams.n_layer) {
  1278. case 28: type = LLM_TYPE_20B; break;
  1279. default: type = LLM_TYPE_UNKNOWN;
  1280. }
  1281. } break;
  1282. case LLM_ARCH_DEEPSEEK2:
  1283. {
  1284. bool is_lite = (hparams.n_layer == 27);
  1285. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1286. ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
  1287. if (!is_lite) {
  1288. ml.get_key(LLM_KV_ATTENTION_Q_LORA_RANK, hparams.n_lora_q);
  1289. }
  1290. ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK, hparams.n_lora_kv);
  1291. ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH_MLA, hparams.n_embd_head_k_mla, false);
  1292. ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH_MLA, hparams.n_embd_head_v_mla, false);
  1293. ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
  1294. ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
  1295. ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
  1296. ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
  1297. ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false);
  1298. if (hparams.expert_gating_func == LLAMA_EXPERT_GATING_FUNC_TYPE_NONE) {
  1299. // for compatibility with existing DeepSeek V2 and V2.5 GGUFs
  1300. // that have no expert_gating_func model parameter set
  1301. hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX;
  1302. }
  1303. ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul, false);
  1304. switch (hparams.n_layer) {
  1305. case 27: type = LLM_TYPE_16B; break;
  1306. case 60: type = LLM_TYPE_236B; break;
  1307. case 61: type = LLM_TYPE_671B; break;
  1308. default: type = LLM_TYPE_UNKNOWN;
  1309. }
  1310. } break;
  1311. case LLM_ARCH_PLM:
  1312. {
  1313. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1314. ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK, hparams.n_lora_kv);
  1315. switch (hparams.n_layer) {
  1316. case 32: type = LLM_TYPE_1_8B; break;
  1317. default: type = LLM_TYPE_UNKNOWN;
  1318. }
  1319. } break;
  1320. case LLM_ARCH_CHATGLM:
  1321. {
  1322. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1323. switch (hparams.n_layer) {
  1324. case 28: {
  1325. if (hparams.n_head(0) == 16) {
  1326. type = LLM_TYPE_1_5B;
  1327. } else {
  1328. type = LLM_TYPE_6B;
  1329. }
  1330. } break;
  1331. case 40: {
  1332. if (hparams.n_head(0) == 24) {
  1333. type = LLM_TYPE_4B;
  1334. } else {
  1335. type = LLM_TYPE_9B;
  1336. }
  1337. } break;
  1338. default: type = LLM_TYPE_UNKNOWN;
  1339. }
  1340. } break;
  1341. case LLM_ARCH_GLM4:
  1342. {
  1343. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1344. switch (hparams.n_layer) {
  1345. case 40: type = LLM_TYPE_9B; break;
  1346. case 61: type = LLM_TYPE_32B; break;
  1347. default: type = LLM_TYPE_UNKNOWN;
  1348. }
  1349. } break;
  1350. case LLM_ARCH_GLM4_MOE:
  1351. {
  1352. ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
  1353. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1354. // MoE parameters
  1355. ml.get_key(LLM_KV_EXPERT_COUNT, hparams.n_expert);
  1356. ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used);
  1357. ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
  1358. ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead, false);
  1359. ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
  1360. ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
  1361. // Expert gating function (GLM-4.5 uses sigmoid)
  1362. ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false);
  1363. if (hparams.expert_gating_func == LLAMA_EXPERT_GATING_FUNC_TYPE_NONE) {
  1364. hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID;
  1365. }
  1366. // NextN/MTP parameters
  1367. ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.nextn_predict_layers, false);
1368. // TODO: when MTP is implemented, this may need to be updated
  1369. hparams.n_layer_kv_from_start = hparams.n_layer - hparams.nextn_predict_layers;
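// e.g. for GLM-4.5-Air below (47 layers, nextn_predict_layers = 1) the KV cache covers only the
// first 46 transformer layers; the trailing NextN/MTP layer is left out until MTP is supported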
  1370. switch (hparams.n_layer) {
  1371. case 47: type = LLM_TYPE_106B_A12B; break; // GLM-4.5-Air (46 layers + 1 NextN layer)
  1372. case 93: type = LLM_TYPE_355B_A32B; break; // GLM-4.5 (92 layers + 1 NextN layer)
  1373. default: type = LLM_TYPE_UNKNOWN;
  1374. }
  1375. } break;
  1376. case LLM_ARCH_BITNET:
  1377. {
  1378. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1379. switch (hparams.n_layer) {
  1380. case 26: type = LLM_TYPE_3B; break;
  1381. default: type = LLM_TYPE_UNKNOWN;
  1382. }
  1383. } break;
  1384. case LLM_ARCH_T5:
  1385. {
  1386. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1387. ml.get_key(LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, hparams.n_rel_attn_bkts);
  1388. uint32_t dec_start_token_id;
  1389. if (ml.get_key(LLM_KV_DECODER_START_TOKEN_ID, dec_start_token_id, false)) {
  1390. hparams.dec_start_token_id = dec_start_token_id;
  1391. }
  1392. switch (hparams.n_layer) {
  1393. case 6: type = LLM_TYPE_60M; break; // t5-small
  1394. case 8: type = LLM_TYPE_80M; break; // flan-t5-small
  1395. case 12:
  1396. switch (hparams.n_ff()) {
  1397. case 3072: type = LLM_TYPE_220M; break; // t5-base
  1398. case 2048: type = LLM_TYPE_250M; break; // flan-t5-base
  1399. default: type = LLM_TYPE_UNKNOWN;
  1400. } break;
  1401. case 24:
  1402. switch (hparams.n_ff()) {
  1403. case 4096: type = LLM_TYPE_770M; break; // t5-large
  1404. case 2816: type = LLM_TYPE_780M; break; // flan-t5-large
  1405. case 16384: type = LLM_TYPE_3B; break; // t5-3b
  1406. case 5120: type = LLM_TYPE_3B; break; // flan-t5-xl
  1407. case 65536: type = LLM_TYPE_11B; break; // t5-11b
  1408. case 10240: type = LLM_TYPE_11B; break; // flan-t5-xxl
  1409. default: type = LLM_TYPE_UNKNOWN;
  1410. } break;
  1411. default: type = LLM_TYPE_UNKNOWN;
  1412. }
  1413. } break;
  1414. case LLM_ARCH_T5ENCODER:
  1415. {
  1416. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1417. ml.get_key(LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, hparams.n_rel_attn_bkts);
  1418. type = LLM_TYPE_UNKNOWN;
  1419. } break;
  1420. case LLM_ARCH_JAIS:
  1421. {
  1422. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  1423. ml.get_key(LLM_KV_ATTENTION_MAX_ALIBI_BIAS, hparams.f_max_alibi_bias);
  1424. switch (hparams.n_layer) {
  1425. case 24: type = LLM_TYPE_1_3B; break;
  1426. case 40: type = LLM_TYPE_13B; break;
  1427. /* TODO: add variants */
  1428. default: type = LLM_TYPE_UNKNOWN;
  1429. }
  1430. } break;
  1431. case LLM_ARCH_NEMOTRON:
  1432. {
  1433. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  1434. switch (hparams.n_layer) {
  1435. case 32: type = LLM_TYPE_4B; break;
  1436. default: type = LLM_TYPE_UNKNOWN;
  1437. }
  1438. } break;
  1439. case LLM_ARCH_NEMOTRON_H:
  1440. {
  1441. ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv);
  1442. ml.get_key(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner);
  1443. ml.get_key(LLM_KV_SSM_STATE_SIZE, hparams.ssm_d_state);
  1444. ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
  1445. ml.get_key(LLM_KV_SSM_GROUP_COUNT, hparams.ssm_n_group);
  1446. // A layer is recurrent IFF the n_head_kv value is set to 0 and
  1447. // the n_ff value is set to 0
  1448. for (uint32_t i = 0; i < hparams.n_layer; ++i) {
  1449. hparams.recurrent_layer_arr[i] = (hparams.n_head_kv(i) == 0 && hparams.n_ff(i) == 0);
  1450. }
  1451. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1452. switch (hparams.n_layer) {
  1453. case 56: type = LLM_TYPE_9B; break;
  1454. default: type = LLM_TYPE_UNKNOWN;
  1455. }
  1456. } break;
  1457. case LLM_ARCH_EXAONE:
  1458. {
  1459. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1460. switch (hparams.n_layer) {
  1461. case 32: type = LLM_TYPE_8B; break;
  1462. default: type = LLM_TYPE_UNKNOWN;
  1463. }
  1464. } break;
  1465. case LLM_ARCH_EXAONE4:
  1466. {
  1467. if (hparams.n_layer == 64) { // 32B
  1468. hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
  1469. hparams.n_swa = 4096;
  1470. hparams.set_swa_pattern(4);
  1471. }
  1472. ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
  1473. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1474. switch (hparams.n_layer) {
  1475. case 30: type = LLM_TYPE_1_2B; break;
  1476. case 64: type = LLM_TYPE_32B; break;
  1477. default: type = LLM_TYPE_UNKNOWN;
  1478. }
  1479. } break;
  1480. case LLM_ARCH_RWKV6:
  1481. case LLM_ARCH_RWKV6QWEN2:
  1482. {
  1483. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps, false);
  1484. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps, false);
  1485. ml.get_key(LLM_KV_WKV_HEAD_SIZE, hparams.wkv_head_size);
  1486. ml.get_key(LLM_KV_TIME_MIX_EXTRA_DIM, hparams.time_mix_extra_dim);
  1487. ml.get_key(LLM_KV_TIME_DECAY_EXTRA_DIM, hparams.time_decay_extra_dim);
  1488. ml.get_key(LLM_KV_RESCALE_EVERY_N_LAYERS, hparams.rescale_every_n_layers, false);
  1489. ml.get_key(LLM_KV_TOKEN_SHIFT_COUNT, hparams.token_shift_count, false);
  1490. switch (hparams.n_layer) {
  1491. case 24: type = LLM_TYPE_1_6B; break;
  1492. case 32:
  1493. switch (hparams.n_embd) {
  1494. case 2560: type = LLM_TYPE_3B; break;
  1495. case 4096: type = LLM_TYPE_7B; break;
  1496. default: type = LLM_TYPE_UNKNOWN;
  1497. } break;
  1498. case 61: type = LLM_TYPE_14B; break;
  1499. case 64: type = LLM_TYPE_32B; break;
  1500. default: type = LLM_TYPE_UNKNOWN;
  1501. }
  1502. } break;
  1503. case LLM_ARCH_RWKV7:
  1504. case LLM_ARCH_ARWKV7:
  1505. {
  1506. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps, false);
  1507. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps, false);
  1508. ml.get_key(LLM_KV_WKV_HEAD_SIZE, hparams.wkv_head_size);
  1509. ml.get_key(LLM_KV_ATTENTION_DECAY_LORA_RANK, hparams.n_lora_decay);
  1510. ml.get_key(LLM_KV_ATTENTION_ICLR_LORA_RANK, hparams.n_lora_iclr);
  1511. ml.get_key(LLM_KV_ATTENTION_VALUE_RESIDUAL_MIX_LORA_RANK, hparams.n_lora_value_res_mix);
  1512. ml.get_key(LLM_KV_ATTENTION_GATE_LORA_RANK, hparams.n_lora_gate, false);
  1513. ml.get_key(LLM_KV_TOKEN_SHIFT_COUNT, hparams.token_shift_count, false);
  1514. switch (hparams.n_layer) {
  1515. case 12:
  1516. switch (hparams.n_embd) {
  1517. case 768: type = LLM_TYPE_190M; break;
  1518. default: type = LLM_TYPE_UNKNOWN;
  1519. } break;
  1520. case 24:
  1521. switch (hparams.n_embd) {
  1522. case 1024: type = LLM_TYPE_450M; break;
  1523. case 2048: type = LLM_TYPE_1_5B; break;
  1524. default: type = LLM_TYPE_UNKNOWN;
  1525. } break;
  1526. case 28:
  1527. switch (hparams.n_embd) {
  1528. case 1536: type = LLM_TYPE_1_5B; break;
  1529. case 3584: type = LLM_TYPE_7B; break;
  1530. default: type = LLM_TYPE_UNKNOWN;
  1531. } break;
  1532. case 32:
  1533. switch (hparams.n_embd) {
  1534. case 2560: type = LLM_TYPE_2_9B; break;
  1535. case 4096: type = LLM_TYPE_7B; break;
  1536. default: type = LLM_TYPE_UNKNOWN;
  1537. } break;
  1538. case 61:
  1539. switch (hparams.n_embd) {
  1540. case 4096: type = LLM_TYPE_14B; break;
  1541. default: type = LLM_TYPE_UNKNOWN;
  1542. } break;
  1543. default: type = LLM_TYPE_UNKNOWN;
  1544. }
  1545. } break;
  1546. case LLM_ARCH_GRANITE:
  1547. case LLM_ARCH_GRANITE_MOE:
  1548. {
  1549. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1550. ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
  1551. ml.get_key(LLM_KV_RESIDUAL_SCALE, hparams.f_residual_scale);
  1552. ml.get_key(LLM_KV_EMBEDDING_SCALE, hparams.f_embedding_scale);
  1553. ml.get_key(LLM_KV_ATTENTION_SCALE, hparams.f_attention_scale);
  1554. // Granite uses rope_finetuned as a switch for rope, so default to true
  1555. bool rope_finetuned = true;
  1556. ml.get_key(LLM_KV_ROPE_SCALING_FINETUNED, rope_finetuned, false);
  1557. hparams.rope_finetuned = rope_finetuned;
  1558. switch (hparams.n_layer) {
  1559. case 32: type = LLM_TYPE_3B; break;
  1560. case 40: type = LLM_TYPE_3B; break;
  1561. // Add additional layer/vocab/etc checks here for other model sizes
  1562. default: type = LLM_TYPE_UNKNOWN;
  1563. }
  1564. // For Granite MoE Shared
  1565. ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, /* required */ false);
  1566. } break;
  1567. case LLM_ARCH_GRANITE_HYBRID:
  1568. {
  1569. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1570. ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale, /* required */ false);
  1571. ml.get_key(LLM_KV_RESIDUAL_SCALE, hparams.f_residual_scale, /* required */ false);
  1572. ml.get_key(LLM_KV_EMBEDDING_SCALE, hparams.f_embedding_scale, /* required */ false);
  1573. ml.get_key(LLM_KV_ATTENTION_SCALE, hparams.f_attention_scale, /* required */ false);
  1574. ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv);
  1575. ml.get_key(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner);
  1576. ml.get_key(LLM_KV_SSM_STATE_SIZE, hparams.ssm_d_state);
  1577. ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
  1578. ml.get_key(LLM_KV_SSM_GROUP_COUNT, hparams.ssm_n_group);
  1579. // Granite uses rope_finetuned as a switch for rope, so default to true
  1580. bool rope_finetuned = true;
  1581. ml.get_key(LLM_KV_ROPE_SCALING_FINETUNED, rope_finetuned, false);
  1582. hparams.rope_finetuned = rope_finetuned;
  1583. // A layer is recurrent IFF the n_head_kv value is set to 0
  1584. for (uint32_t i = 0; i < hparams.n_layer; ++i) {
  1585. hparams.recurrent_layer_arr[i] = hparams.n_head_kv(i) == 0;
  1586. }
  1587. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1588. switch (hparams.n_layer) {
  1589. // TODO: Add llm type label (not sure this is useful)
  1590. default: type = LLM_TYPE_UNKNOWN;
  1591. }
  1592. // For Granite MoE Shared
  1593. ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, /* required */ false);
  1594. } break;
  1595. case LLM_ARCH_CHAMELEON:
  1596. {
  1597. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1598. hparams.f_norm_eps = 1e-5; // eps for qk-norm, torch default
  1599. ml.get_key(LLM_KV_SWIN_NORM, hparams.swin_norm);
  1600. switch (hparams.n_layer) {
  1601. case 32: type = LLM_TYPE_7B; break;
  1602. case 48: type = LLM_TYPE_34B; break;
  1603. default: type = LLM_TYPE_UNKNOWN;
  1604. }
  1605. } break;
  1606. case LLM_ARCH_WAVTOKENIZER_DEC:
  1607. {
  1608. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  1609. ml.get_key(LLM_KV_ATTENTION_GROUPNORM_EPS, hparams.f_norm_group_eps);
  1610. ml.get_key(LLM_KV_ATTENTION_GROUPNORM_GROUPS, hparams.n_norm_groups);
  1611. ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
  1612. } break;
  1613. case LLM_ARCH_BAILINGMOE:
  1614. {
  1615. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1616. ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
  1617. ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
  1618. ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
  1619. ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
  1620. ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
  1621. switch (hparams.n_layer) {
  1622. case 28: type = LLM_TYPE_16B; break;
  1623. case 88: type = LLM_TYPE_290B; break;
  1624. default: type = LLM_TYPE_UNKNOWN;
  1625. }
  1626. } break;
  1627. case LLM_ARCH_DOTS1:
  1628. {
  1629. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1630. ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
  1631. ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
  1632. ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
  1633. ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
  1634. ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
  1635. ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false);
  1636. switch (hparams.n_layer) {
  1637. case 62: type = LLM_TYPE_142B; break;
  1638. default: type = LLM_TYPE_UNKNOWN;
  1639. }
  1640. } break;
  1641. case LLM_ARCH_ERNIE4_5:
  1642. case LLM_ARCH_ERNIE4_5_MOE:
  1643. {
  1644. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1645. if (arch == LLM_ARCH_ERNIE4_5_MOE) {
  1646. ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
  1647. ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false);
  1648. ml.get_key(LLM_KV_INTERLEAVE_MOE_LAYER_STEP, hparams.n_moe_layer_step);
  1649. ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
  1650. }
  1651. switch (hparams.n_layer) {
  1652. case 18: type = LLM_TYPE_0_3B; break;
  1653. case 28: type = LLM_TYPE_21B_A3B; break;
  1654. case 54: type = LLM_TYPE_300B_A47B; break;
  1655. default: type = LLM_TYPE_UNKNOWN;
  1656. }
  1657. } break;
  1658. case LLM_ARCH_FALCON_H1:
  1659. {
  1660. // Common parameters
  1661. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1662. // SSM parameters
  1663. ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv);
  1664. ml.get_key(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner);
  1665. ml.get_key(LLM_KV_SSM_STATE_SIZE, hparams.ssm_d_state);
  1666. ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
  1667. ml.get_key(LLM_KV_SSM_GROUP_COUNT, hparams.ssm_n_group);
  1668. std::fill(hparams.recurrent_layer_arr.begin(), hparams.recurrent_layer_arr.end(), true);
  1669. switch (hparams.n_layer) {
  1670. case 36:
  1671. type = LLM_TYPE_0_5B; break;
  1672. case 24:
  1673. type = LLM_TYPE_1_5B; break;
  1674. case 66:
  1675. type = LLM_TYPE_1B; break;
  1676. case 32:
  1677. type = LLM_TYPE_3B; break;
  1678. case 44:
  1679. type = LLM_TYPE_7B; break;
  1680. case 72:
  1681. type = LLM_TYPE_34B; break;
  1682. default:
  1683. type = LLM_TYPE_UNKNOWN;
  1684. }
  1685. } break;
  1686. case LLM_ARCH_HUNYUAN_MOE:
  1687. {
  1688. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1689. ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
  1690. ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp);
  1691. switch (hparams.n_layer) {
  1692. case 32: type = LLM_TYPE_A13B; break;
  1693. default: type = LLM_TYPE_UNKNOWN;
  1694. }
  1695. } break;
  1696. case LLM_ARCH_HUNYUAN_DENSE:
  1697. {
  1698. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1699. switch (hparams.n_embd) {
  1700. case 1024: type = LLM_TYPE_0_5B; break;
  1701. case 2048: type = LLM_TYPE_1_8B; break;
  1702. case 3072: type = LLM_TYPE_4B; break;
  1703. case 4096: type = LLM_TYPE_7B; break;
  1704. default: type = LLM_TYPE_UNKNOWN;
  1705. }
  1706. } break;
  1707. case LLM_ARCH_SMOLLM3:
  1708. {
  1709. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1710. hparams.n_no_rope_layer_step = 4;
  1711. switch (hparams.n_layer) {
  1712. case 36: type = LLM_TYPE_3B; break;
  1713. default: type = LLM_TYPE_UNKNOWN;
  1714. }
  1715. } break;
  1716. case LLM_ARCH_OPENAI_MOE:
  1717. {
  1718. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1719. ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
  1720. ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
  1721. hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
  1722. hparams.set_swa_pattern(2);
  1723. switch (hparams.n_layer) {
  1724. case 24: type = LLM_TYPE_20B; break;
  1725. case 36: type = LLM_TYPE_120B; break;
  1726. default: type = LLM_TYPE_UNKNOWN;
  1727. }
  1728. } break;
  1729. case LLM_ARCH_LFM2:
  1730. {
  1731. ml.get_key(LLM_KV_SHORTCONV_L_CACHE, hparams.n_shortconv_l_cache);
  1732. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1733. for (uint32_t il = 0; il < hparams.n_layer; ++il) {
  1734. hparams.recurrent_layer_arr[il] = hparams.n_head_kv(il) == 0;
  1735. }
  1736. switch (hparams.n_embd) {
  1737. case 1024: type = LLM_TYPE_350M; break;
  1738. case 1536: type = LLM_TYPE_700M; break;
  1739. case 2048: type = LLM_TYPE_1_2B; break;
  1740. default: type = LLM_TYPE_UNKNOWN;
  1741. }
  1742. } break;
  1743. case LLM_ARCH_SMALLTHINKER:
  1744. {
  1745. const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
  1746. if (found_swa && hparams.n_swa > 0) {
  1747. hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
  1748. hparams.n_swa = 4096;
  1749. hparams.set_swa_pattern(4, true);
  1750. } else {
  1751. hparams.swa_type = LLAMA_SWA_TYPE_NONE;
  1752. hparams.n_no_rope_layer_step = hparams.n_layer;
  1753. }
  1754. ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
  1755. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1756. ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false);
  1757. switch (hparams.n_layer) {
  1758. case 32: type = LLM_TYPE_4B; break;
  1759. case 52: type = LLM_TYPE_20B; break;
  1760. default: type = LLM_TYPE_UNKNOWN;
  1761. }
  1762. } break;
  1763. default: throw std::runtime_error("unsupported model architecture");
  1764. }
  1765. pimpl->n_bytes = ml.n_bytes;
  1766. pimpl->desc_str = arch_name() + " " + type_name() + " " + ml.ftype_name();
  1767. if (hparams.f_max_alibi_bias > 0.0f) {
  1768. hparams.use_alibi = true;
  1769. }
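// note: any architecture that set f_max_alibi_bias above (e.g. Baichuan-13B, Refact, BLOOM, MPT, JAIS)
// ends up with use_alibi enabled here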
  1770. hparams.rope_type = llama_model_rope_type(this);
  1771. }
  1772. void llama_model::load_vocab(llama_model_loader & ml) {
  1773. const auto kv = LLM_KV(arch);
  1774. vocab.load(ml, kv);
  1775. }
  1776. bool llama_model::load_tensors(llama_model_loader & ml) {
  1777. const auto & split_mode = params.split_mode;
  1778. const auto & n_gpu_layers = params.n_gpu_layers;
  1779. const auto & use_mlock = params.use_mlock;
  1780. const auto & tensor_split = params.tensor_split;
  1781. const int n_layer = hparams.n_layer;
  1782. const bool use_mmap_buffer = true;
  1783. LLAMA_LOG_INFO("%s: loading model tensors, this can take a while... (mmap = %s)\n", __func__, ml.use_mmap ? "true" : "false");
  1784. // build a list of buffer types for the CPU and GPU devices
  1785. pimpl->cpu_buft_list = make_cpu_buft_list(devices, params.use_extra_bufts);
  1786. for (auto * dev : devices) {
  1787. buft_list_t buft_list = make_gpu_buft_list(dev, split_mode, tensor_split);
  1788. // add CPU buffer types as a fallback
  1789. buft_list.insert(buft_list.end(), pimpl->cpu_buft_list.begin(), pimpl->cpu_buft_list.end());
  1790. pimpl->gpu_buft_list.emplace(dev, std::move(buft_list));
  1791. }
  1792. // calculate the split points
  1793. bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + n_devices(), [](float x) { return x == 0.0f; });
  1794. std::vector<float> splits(n_devices());
  1795. if (all_zero) {
  1796. // default split, by free memory
  1797. for (size_t i = 0; i < n_devices(); ++i) {
  1798. ggml_backend_dev_t dev = devices[i];
  1799. size_t total;
  1800. size_t free;
  1801. ggml_backend_dev_memory(dev, &free, &total);
  1802. splits[i] = free;
  1803. }
  1804. } else {
  1805. std::copy(tensor_split, tensor_split + n_devices(), splits.begin());
  1806. }
  1807. // sum and normalize the splits to get the split points
  1808. float split_sum = 0.0f;
  1809. for (size_t i = 0; i < n_devices(); ++i) {
  1810. split_sum += splits[i];
  1811. splits[i] = split_sum;
  1812. }
  1813. for (size_t i = 0; i < n_devices(); ++i) {
  1814. splits[i] /= split_sum;
  1815. }
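// illustrative example (not part of the loader): two devices reporting 16 GiB and 8 GiB free give
// splits = {16, 8} -> prefix sums {16, 24} -> normalized split points {0.667, 1.0}, so roughly the
// first two thirds of the offloaded layers are assigned to the first device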
  1816. ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
  1817. if (cpu_dev == nullptr) {
  1818. throw std::runtime_error(format("%s: no CPU backend found", __func__));
  1819. }
  1820. const int i_gpu_start = std::max((int) hparams.n_layer - n_gpu_layers, (int) 0);
  1821. const int act_gpu_layers = devices.empty() ? 0 : std::min(n_gpu_layers, (int)n_layer + 1);
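// the "+ 1" leaves room for the output layer, which is assigned via get_layer_buft_list(n_layer) below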
  1822. auto get_layer_buft_list = [&](int il) -> llama_model::impl::layer_dev {
  1823. const bool is_swa = il < (int) hparams.n_layer && hparams.is_swa(il);
  1824. if (il < i_gpu_start || (il - i_gpu_start) >= act_gpu_layers) {
  1825. LLAMA_LOG_DEBUG("load_tensors: layer %3d assigned to device %s, is_swa = %d\n", il, ggml_backend_dev_name(cpu_dev), is_swa);
  1826. return {cpu_dev, &pimpl->cpu_buft_list};
  1827. }
  1828. const int layer_gpu = std::upper_bound(splits.begin(), splits.begin() + n_devices(), float(il - i_gpu_start)/act_gpu_layers) - splits.begin();
  1829. auto * dev = devices.at(layer_gpu);
  1830. LLAMA_LOG_DEBUG("load_tensors: layer %3d assigned to device %s, is_swa = %d\n", il, ggml_backend_dev_name(dev), is_swa);
  1831. return {dev, &pimpl->gpu_buft_list.at(dev)};
  1832. };
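// e.g. (hypothetical numbers) with i_gpu_start = 0, act_gpu_layers = 30 and splits = {0.667, 1.0}:
// layer 10 -> fraction 0.33 -> first device; layer 25 -> fraction 0.83 -> second device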
  1833. // assign the input layer
  1834. // there is very little benefit to offloading the input layer, so always keep it on the CPU
  1835. pimpl->dev_input = { cpu_dev, &pimpl->cpu_buft_list };
  1836. // assign the repeating layers to the devices according to the splits
  1837. pimpl->dev_layer.resize(n_layer);
  1838. for (int il = 0; il < n_layer; ++il) {
  1839. pimpl->dev_layer[il] = get_layer_buft_list(il);
  1840. }
  1841. // assign the output layer
  1842. pimpl->dev_output = get_layer_buft_list(n_layer);
  1843. // one ggml context per buffer type
  1844. int max_n_tensors = ml.n_tensors;
  1845. max_n_tensors += 1; // duplicated output tensor
  1846. max_n_tensors += n_layer*2; // duplicated rope freq tensors
  1847. const size_t ctx_size = ggml_tensor_overhead()*max_n_tensors;
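// since the contexts below are created with no_alloc = true, ctx_size only has to cover the tensor
// metadata (ggml_tensor_overhead() per tensor), not the actual weight data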
  1848. std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
  1849. auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * {
  1850. auto it = ctx_map.find(buft);
  1851. if (it == ctx_map.end()) {
  1852. ggml_init_params params = {
  1853. /*.mem_size =*/ ctx_size,
  1854. /*.mem_buffer =*/ NULL,
  1855. /*.no_alloc =*/ true,
  1856. };
  1857. ggml_context * ctx = ggml_init(params);
  1858. if (!ctx) {
  1859. throw std::runtime_error(format("failed to create ggml context"));
  1860. }
  1861. ctx_map[buft] = ctx;
  1862. pimpl->ctxs.emplace_back(ctx);
  1863. return ctx;
  1864. }
  1865. return it->second;
  1866. };
  1867. const auto TENSOR_DUPLICATED = llama_model_loader::TENSOR_DUPLICATED;
  1868. const auto TENSOR_NOT_REQUIRED = llama_model_loader::TENSOR_NOT_REQUIRED;
  1869. const auto TENSOR_SKIP = llama_model_loader::TENSOR_SKIP;
  1870. // create tensors for the weights
  1871. {
  1872. // note: cast to int64_t since we will use these for the tensor dimensions
  1873. const int64_t n_head = hparams.n_head();
  1874. const int64_t n_head_kv = hparams.n_head_kv();
  1875. const int64_t n_embd = hparams.n_embd;
  1876. const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
  1877. const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();
  1878. const int64_t n_embd_head_k = hparams.n_embd_head_k;
  1879. const int64_t n_embd_head_v = hparams.n_embd_head_v;
  1880. const int64_t n_ff = hparams.n_ff();
  1881. const int64_t n_embd_gqa = n_embd_v_gqa;
  1882. const int64_t n_vocab = vocab.n_tokens();
  1883. const int64_t n_token_types = vocab.n_token_types();
  1884. const int64_t n_rot = hparams.n_rot;
  1885. const int64_t n_expert = hparams.n_expert;
  1886. const int64_t n_expert_used = hparams.n_expert_used;
  1887. const int64_t n_ctx_train = hparams.n_ctx_train;
  1888. if (n_expert > 0 && hparams.n_expert_used == 0) {
  1889. throw std::runtime_error("model has expert layers but no expert layers are used");
  1890. }
  1891. int n_moved_tensors = 0;
  1892. ggml_tensor * first_moved_tensor = nullptr;
  1893. ggml_backend_buffer_type_t first_moved_from_buft = nullptr;
  1894. ggml_backend_buffer_type_t first_moved_to_buft = nullptr;
  1895. auto create_tensor = [&](const LLM_TN_IMPL & tn, const std::initializer_list<int64_t> & ne, int flags) -> ggml_tensor * {
  1896. ggml_tensor * t_meta = ml.get_tensor_meta(tn.str().c_str());
  1897. if (!t_meta) {
  1898. if (flags & TENSOR_NOT_REQUIRED) {
  1899. return nullptr;
  1900. }
  1901. throw std::runtime_error(format("missing tensor '%s'", tn.str().c_str()));
  1902. }
1903. // some models use the token embedding tensor as the output tensor as well; since the two uses sit in different layers and go through different ops,
1904. // the tensor is duplicated
1905. // to handle this, we check whether the tensor is flagged as duplicated and, if so, assume it is being loaded as the output tensor
  1906. llm_tensor tn_tensor = tn.tensor;
  1907. if (tn.tensor == LLM_TENSOR_TOKEN_EMBD && flags & TENSOR_DUPLICATED) {
  1908. tn_tensor = LLM_TENSOR_OUTPUT;
  1909. }
  1910. llm_tensor_info info;
  1911. try {
  1912. info = llm_tensor_info_for(tn_tensor);
  1913. } catch (const std::out_of_range & e) {
  1914. throw std::runtime_error(format("missing tensor info mapping for %s", tn.str().c_str()));
  1915. }
  1916. // skip unused tensors
  1917. if (info.op == GGML_OP_NONE || flags & TENSOR_SKIP) {
  1918. const size_t nbytes = ggml_nbytes(t_meta);
  1919. LLAMA_LOG_WARN("model has unused tensor %s (size = %zu bytes) -- ignoring\n", tn.str().c_str(), nbytes);
  1920. ml.size_data -= nbytes;
  1921. ml.n_created++;
  1922. return nullptr;
  1923. }
  1924. // tensors with "bias" suffix are always used with GGML_OP_ADD or GGML_OP_ADD_ID
  1925. ggml_op op;
  1926. bool bias = tn.suffix != nullptr && strcmp(tn.suffix, "bias") == 0;
  1927. if (bias) {
  1928. if (info.op == GGML_OP_MUL_MAT_ID) {
  1929. op = GGML_OP_ADD_ID;
  1930. } else {
  1931. op = GGML_OP_ADD;
  1932. }
  1933. } else {
  1934. op = info.op;
  1935. }
  1936. // sanity checks
  1937. if (info.layer == LLM_TENSOR_LAYER_INPUT || info.layer == LLM_TENSOR_LAYER_OUTPUT) {
  1938. if (tn.bid != -1) {
  1939. GGML_ABORT("input/output layer tensor %s used with a layer number", tn.str().c_str());
  1940. }
  1941. } else {
  1942. if (tn.bid == -1) {
  1943. GGML_ABORT("repeating layer tensor %s used without a layer number", tn.str().c_str());
  1944. }
  1945. }
  1946. // select the buffer type for this tensor
  1947. buft_list_t * buft_list;
  1948. switch (info.layer) {
  1949. case LLM_TENSOR_LAYER_INPUT:
  1950. buft_list = pimpl->dev_input.buft_list;
  1951. break;
  1952. case LLM_TENSOR_LAYER_OUTPUT:
  1953. buft_list = pimpl->dev_output.buft_list;
  1954. break;
  1955. case LLM_TENSOR_LAYER_REPEATING:
  1956. buft_list = pimpl->dev_layer.at(tn.bid).buft_list;
  1957. break;
  1958. default:
  1959. GGML_ABORT("invalid layer %d for tensor %s", info.layer, tn.str().c_str());
  1960. }
  1961. ggml_backend_buffer_type_t buft = nullptr;
  1962. // check overrides
  1963. if (ml.tensor_buft_overrides) {
  1964. std::string tensor_name = tn.str();
  1965. for (const auto * overrides = ml.tensor_buft_overrides; overrides->pattern != nullptr; ++overrides) {
  1966. std::regex pattern(overrides->pattern);
  1967. if (std::regex_search(tensor_name, pattern)) {
  1968. if (overrides->buft == ggml_backend_cpu_buffer_type()) {
  1969. // when overriding to a CPU buffer, consider the extra buffer types
  1970. buft = select_weight_buft(hparams, t_meta, op, pimpl->cpu_buft_list);
  1971. } else {
  1972. buft = overrides->buft;
  1973. }
  1974. LLAMA_LOG_DEBUG("tensor %s (%zu MiB %s) buffer type overridden to %s\n",
  1975. tensor_name.c_str(),
  1976. ggml_nbytes(t_meta) / 1024 / 1024, ggml_type_name(t_meta->type),
  1977. ggml_backend_buft_name(buft));
  1978. break;
  1979. }
  1980. }
  1981. }
  1982. if (!buft) {
  1983. buft = select_weight_buft(hparams, t_meta, op, *buft_list);
  1984. if (!buft) {
  1985. throw std::runtime_error(format("failed to find a compatible buffer type for tensor %s", tn.str().c_str()));
  1986. }
  1987. }
  1988. // avoid using a host buffer when using mmap
  1989. auto * buft_dev = ggml_backend_buft_get_device(buft);
  1990. if (ml.use_mmap && buft_dev && buft == ggml_backend_dev_host_buffer_type(buft_dev)) {
  1991. auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
  1992. if (!cpu_dev) {
  1993. throw std::runtime_error("no CPU backend found");
  1994. }
  1995. buft = ggml_backend_dev_buffer_type(cpu_dev);
  1996. }
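// when mmap is used the weight data is already mapped into host memory, so a (pinned) host
// buffer presumably offers no benefit here and the plain CPU buffer type is used instead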
  1997. if (buft != buft_list->front().second) {
  1998. n_moved_tensors++;
  1999. if (!first_moved_tensor) {
  2000. first_moved_tensor = t_meta;
  2001. first_moved_from_buft = buft_list->front().second;
  2002. first_moved_to_buft = buft;
  2003. }
  2004. }
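// only the first moved tensor and a counter are recorded; they are presumably summarized in a
// single log message once all tensors have been created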
  2005. ggml_context * ctx = ctx_for_buft(buft);
  2006. // if duplicated, check if the original tensor was allocated in the same buffer type context and avoid creating a new one
  2007. if (flags & TENSOR_DUPLICATED) {
  2008. ggml_tensor * t = ggml_get_tensor(ctx, tn.str().c_str());
  2009. if (t) {
  2010. return t;
  2011. }
  2012. }
  2013. return ml.create_tensor(ctx, tn, ne, flags);
  2014. };
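// from the call sites below: flags == 0 marks a required tensor, TENSOR_NOT_REQUIRED makes
// create_tensor return nullptr when the tensor is absent from the GGUF, and TENSOR_DUPLICATED
// marks a tensor that aliases one that was already created (e.g. tied embeddings).
// illustrative usage (not part of the original file):
//   output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
//   if (output == NULL) { /* fall back to the duplicated token embedding, as done below */ }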
  2015. layers.resize(n_layer);
  2016. // TODO: move to a separate function
  2017. const auto tn = LLM_TN(arch);
  2018. switch (arch) {
  2019. case LLM_ARCH_LLAMA:
  2020. case LLM_ARCH_REFACT:
  2021. case LLM_ARCH_MINICPM:
  2022. case LLM_ARCH_GRANITE:
  2023. case LLM_ARCH_GRANITE_MOE:
  2024. {
  2025. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2026. // output
  2027. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2028. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  2029. // if output is NULL, init from the input tok embed
  2030. if (output == NULL) {
  2031. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  2032. }
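// weight tying: when the GGUF ships no separate output.weight, the LM head reuses the token
// embedding matrix; TENSOR_DUPLICATED tells create_tensor (above) that this is an alias of an
// already-created tensor rather than a new weight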
  2033. for (int i = 0; i < n_layer; ++i) {
  2034. auto & layer = layers[i];
  2035. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2036. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
  2037. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  2038. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
  2039. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
  2040. // optional bias tensors
  2041. layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2042. layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
  2043. layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
  2044. layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2045. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  2046. if (hparams.rope_scaling_type_train == LLAMA_ROPE_SCALING_TYPE_LONGROPE) {
  2047. layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
  2048. layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
  2049. }
  2050. else {
  2051. layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
  2052. }
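// LongRoPE checkpoints ship separate long/short rope factor tensors, other checkpoints may ship
// a single rope_freqs tensor; marking layers after the first as TENSOR_DUPLICATED suggests the
// same factors are shared across all layers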
  2053. if (n_expert == 0) {
  2054. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  2055. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  2056. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  2057. // optional MLP bias
  2058. layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
  2059. layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2060. layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
  2061. } else {
  2062. layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
  2063. layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, TENSOR_NOT_REQUIRED);
  2064. layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert}, 0);
  2065. layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
  2066. // For Granite MoE Shared
  2067. if (hparams.n_ff_shexp > 0) {
  2068. layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, 0);
  2069. layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, 0);
  2070. layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {hparams.n_ff_shexp, n_embd}, 0);
  2071. }
  2072. }
  2073. }
  2074. } break;
  2075. case LLM_ARCH_LLADA:
  2076. {
  2077. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
  2078. // output
  2079. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
  2080. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED);
  2081. // if output is NULL, init from the input tok embed
  2082. if (output == NULL) {
  2083. output =
  2084. create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, TENSOR_DUPLICATED);
  2085. }
  2086. for (int i = 0; i < n_layer; ++i) {
  2087. auto & layer = layers[i];
  2088. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);
  2089. // Use separate Q, K, V projections without bias, matching LLaDALlamaBlock
  2090. layer.wq =
  2091. create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd_head_k * n_head }, 0);
  2092. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_k_gqa }, 0);
  2093. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_v_gqa }, 0);
  2094. // No bias for QKV projections as per config: include_bias=false, include_qkv_bias=false
  2095. layer.wo =
  2096. create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, 0);
  2097. layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), { n_embd }, TENSOR_NOT_REQUIRED);
  2098. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd }, 0);
  2099. layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), { n_rot / 2 },
  2100. TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
  2101. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), { n_embd, n_ff }, 0);
  2102. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd }, 0);
  2103. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, n_ff }, 0);
  2104. // optional MLP bias
  2105. layer.ffn_gate_b =
  2106. create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), { n_ff }, TENSOR_NOT_REQUIRED);
  2107. layer.ffn_down_b =
  2108. create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), { n_embd }, TENSOR_NOT_REQUIRED);
  2109. layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), { n_ff }, TENSOR_NOT_REQUIRED);
  2110. }
  2111. }
  2112. break;
  2113. case LLM_ARCH_LLAMA4:
  2114. {
  2115. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2116. // output
  2117. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2118. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  2119. // if output is NULL, init from the input tok embed
  2120. if (output == NULL) {
  2121. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  2122. }
  2123. GGML_ASSERT(hparams.n_moe_layer_step > 0 && "Llama 4 requires n_moe_layer_step > 0");
  2124. for (int i = 0; i < n_layer; ++i) {
  2125. bool is_moe_layer = (i + 1) % hparams.n_moe_layer_step == 0;
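// with n_moe_layer_step == N, every N-th layer (1-based) uses the MoE branch below and the
// remaining layers keep a dense FFN; e.g. a step of 2 makes every other layer MoE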
  2126. auto & layer = layers[i];
  2127. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2128. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
  2129. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  2130. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
  2131. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
  2132. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  2133. layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
  2134. if (is_moe_layer) {
  2135. int n_ff_exp = hparams.n_ff_exp;
  2136. layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
  2137. layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, 0);
  2138. layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff_exp, n_embd, n_expert}, 0);
  2139. layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, 0);
  2140. // Shared expert
  2141. const int64_t n_ff_shexp = n_ff_exp;
  2142. layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), { n_embd, n_ff_shexp}, 0);
  2143. layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp, n_embd }, 0);
  2144. layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), { n_embd, n_ff_shexp}, 0);
  2145. } else {
  2146. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  2147. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  2148. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  2149. }
  2150. }
  2151. } break;
  2152. case LLM_ARCH_DECI:
  2153. {
  2154. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2155. // output
  2156. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2157. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  2158. // if output is NULL, init from the input tok embed
  2159. if (output == NULL) {
  2160. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  2161. }
  2162. for (int i = 0; i < n_layer; ++i) {
  2163. auto & layer = layers[i];
  2164. const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(i);
  2165. const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(i);
  2166. const int64_t n_embd_gqa = hparams.n_embd_v_gqa(i);
  2167. const int64_t n_ff = hparams.n_ff(i);
  2168. const int64_t n_head = hparams.n_head(i);
  2169. const int64_t n_head_kv = hparams.n_head_kv(i);
  2170. if (n_head_kv == 0 && n_head > 0) {
  2171. // linear attention for DeciLMCausalModel
  2172. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2173. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  2174. }
  2175. else if (n_head_kv > 0) {
  2176. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2177. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
  2178. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  2179. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
  2180. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
  2181. }
  2182. // optional bias tensors
  2183. layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2184. layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
  2185. layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
  2186. layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2187. if (n_ff > 0) {
  2188. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  2189. }
  2190. if (hparams.rope_scaling_type_train == LLAMA_ROPE_SCALING_TYPE_LONGROPE) {
  2191. layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
  2192. layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
  2193. }
  2194. else {
  2195. layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
  2196. }
  2197. if (n_ff > 0) {
  2198. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  2199. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  2200. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  2201. }
  2202. // optional MLP bias
  2203. layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
  2204. layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2205. layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
  2206. }
  2207. } break;
  2208. case LLM_ARCH_MINICPM3:
  2209. {
  2210. const int64_t n_embd_head_qk_rope = hparams.n_rot;
  2211. const int64_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
  2212. const int64_t q_lora_rank = hparams.n_lora_q;
  2213. const int64_t kv_lora_rank = hparams.n_lora_kv;
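// MiniCPM3 factors Q through q_lora_rank and K/V through kv_lora_rank (see wq_a/wq_b and
// wkv_a_mqa/wkv_b below), i.e. low-rank projections in the style of multi-head latent attention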
  2214. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2215. // output
  2216. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2217. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  2218. // if output is NULL, init from the input tok embed
  2219. if (output == NULL) {
  2220. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  2221. }
  2222. for (int i = 0; i < n_layer; ++i) {
  2223. auto & layer = layers[i];
  2224. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2225. layer.attn_q_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_A_NORM, "weight", i), {q_lora_rank}, 0);
  2226. layer.attn_kv_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_NORM, "weight", i), {kv_lora_rank}, 0);
  2227. layer.wq_a = create_tensor(tn(LLM_TENSOR_ATTN_Q_A, "weight", i), {n_embd, q_lora_rank}, 0);
  2228. layer.wq_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_B, "weight", i), {q_lora_rank, n_head * n_embd_head_k}, 0);
  2229. layer.wkv_a_mqa = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + (n_embd_head_qk_rope)}, 0);
  2230. layer.wkv_b = create_tensor(tn(LLM_TENSOR_ATTN_KV_B, "weight", i), {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)}, 0);
  2231. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_head * ( n_embd_head_v), n_embd}, 0);
  2232. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  2233. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  2234. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  2235. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  2236. layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), { n_embd_head_qk_rope/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
  2237. layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), { n_embd_head_qk_rope/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
  2238. }
  2239. } break;
  2240. case LLM_ARCH_GROK:
  2241. {
  2242. if (n_expert == 0) {
  2243. throw std::runtime_error("Grok model cannot have zero experts");
  2244. }
  2245. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2246. // output
  2247. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2248. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  2249. // if output is NULL, init from the input tok embed
  2250. if (output == NULL) {
  2251. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  2252. }
  2253. for (int i = 0; i < n_layer; ++i) {
  2254. auto & layer = layers[i];
  2255. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2256. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  2257. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  2258. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  2259. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  2260. layer.attn_out_norm = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}, 0);
  2261. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  2262. layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
  2263. layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, TENSOR_NOT_REQUIRED);
  2264. layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert}, 0);
  2265. layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
  2266. layer.layer_out_norm = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd}, 0);
  2267. }
  2268. } break;
  2269. case LLM_ARCH_DBRX:
  2270. {
  2271. if (n_expert == 0) {
  2272. throw std::runtime_error("DBRX model cannot have zero experts");
  2273. }
  2274. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2275. // output
  2276. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2277. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  2278. for (int i = 0; i < n_layer; ++i) {
  2279. auto & layer = layers[i];
  2280. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2281. layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
  2282. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  2283. layer.attn_out_norm = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}, 0);
  2284. layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
  2285. layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
  2286. layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff, n_embd, n_expert}, 0);
  2287. layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
  2288. }
  2289. } break;
  2290. case LLM_ARCH_BAICHUAN:
  2291. {
  2292. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2293. {
  2294. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2295. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  2296. }
  2297. for (int i = 0; i < n_layer; ++i) {
  2298. auto & layer = layers[i];
  2299. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2300. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  2301. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  2302. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  2303. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  2304. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  2305. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  2306. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  2307. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  2308. }
  2309. } break;
  2310. case LLM_ARCH_FALCON:
  2311. {
  2312. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2313. // output
  2314. {
  2315. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2316. output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
  2317. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  2318. if (!output) {
  2319. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); // needs to be on GPU
  2320. }
  2321. }
  2322. for (int i = 0; i < n_layer; ++i) {
  2323. auto & layer = layers[i];
  2324. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2325. layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
  2326. layer.attn_norm_2 = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2327. layer.attn_norm_2_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2328. layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
  2329. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  2330. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  2331. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  2332. }
  2333. } break;
  2334. case LLM_ARCH_STARCODER:
  2335. {
  2336. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2337. pos_embd = create_tensor(tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, n_ctx_train}, 0);
  2338. // output
  2339. {
  2340. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2341. output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
  2342. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  2343. if (!output) {
  2344. // needs to be on GPU
  2345. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  2346. }
  2347. }
  2348. for (int i = 0; i < n_layer; ++i) {
  2349. auto & layer = layers[i];
  2350. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2351. layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
  2352. layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
  2353. layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
  2354. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  2355. layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
  2356. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  2357. layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
  2358. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
  2359. layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
  2360. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  2361. layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
  2362. }
  2363. } break;
  2364. case LLM_ARCH_BERT:
  2365. case LLM_ARCH_NOMIC_BERT:
  2366. case LLM_ARCH_NOMIC_BERT_MOE:
  2367. case LLM_ARCH_JINA_BERT_V3:
  2368. {
  2369. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2370. type_embd = create_tensor(tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_token_types}, TENSOR_NOT_REQUIRED);
  2371. if (arch == LLM_ARCH_BERT) {
  2372. pos_embd = create_tensor(tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, n_ctx_train}, 0);
  2373. cls = create_tensor(tn(LLM_TENSOR_CLS, "weight"), {n_embd, n_embd}, TENSOR_NOT_REQUIRED);
  2374. cls_b = create_tensor(tn(LLM_TENSOR_CLS, "bias"), {n_embd}, TENSOR_NOT_REQUIRED);
  2375. cls_out = create_tensor(tn(LLM_TENSOR_CLS_OUT, "weight"), {n_embd, hparams.n_cls_out}, TENSOR_NOT_REQUIRED);
  2376. cls_out_b = create_tensor(tn(LLM_TENSOR_CLS_OUT, "bias"), {hparams.n_cls_out}, TENSOR_NOT_REQUIRED);
  2377. }
  2378. tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0);
  2379. tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}, 0);
  2380. for (int i = 0; i < n_layer; ++i) {
  2381. auto & layer = layers[i];
  2382. layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
  2383. layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
  2384. if (!layer.wqkv) {
  2385. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  2386. layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, 0);
  2387. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  2388. layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, 0);
  2389. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  2390. layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, 0);
  2391. }
  2392. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  2393. layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2394. layer.attn_out_norm = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}, 0);
  2395. layer.attn_out_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "bias", i), {n_embd}, 0);
  2396. if (hparams.moe_every_n_layers > 0 && i % hparams.moe_every_n_layers == 1) {
  2397. layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff, n_expert}, 0);
  2398. layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert}, 0);
  2399. layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
  2400. } else {
  2401. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  2402. layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
  2403. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
  2404. layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2405. if (arch == LLM_ARCH_NOMIC_BERT) {
  2406. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  2407. }
  2408. }
  2409. layer.layer_out_norm = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd}, 0);
  2410. layer.layer_out_norm_b = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "bias", i), {n_embd}, 0);
  2411. }
  2412. } break;
  2413. case LLM_ARCH_NEO_BERT:
  2414. {
  2415. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2416. cls = create_tensor(tn(LLM_TENSOR_CLS, "weight"), {n_embd, n_embd}, TENSOR_NOT_REQUIRED);
  2417. cls_b = create_tensor(tn(LLM_TENSOR_CLS, "bias"), {n_embd}, TENSOR_NOT_REQUIRED);
  2418. cls_out = create_tensor(tn(LLM_TENSOR_CLS_OUT, "weight"), {n_embd, hparams.n_cls_out}, TENSOR_NOT_REQUIRED);
  2419. cls_out_b = create_tensor(tn(LLM_TENSOR_CLS_OUT, "bias"), {hparams.n_cls_out}, TENSOR_NOT_REQUIRED);
  2420. output_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2421. for (int i = 0; i < n_layer; ++i) {
  2422. auto & layer = layers[i];
  2423. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2424. layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
  2425. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  2426. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  2427. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff*2}, 0);
  2428. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
  2429. }
  2430. } break;
  2431. case LLM_ARCH_JINA_BERT_V2:
  2432. {
  2433. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); // word_embeddings
  2434. type_embd = create_tensor(tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_token_types}, 0); // token_type_embeddings
  2435. tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0); // LayerNorm
2436. tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}, 0); // LayerNorm bias
  2437. cls = create_tensor(tn(LLM_TENSOR_CLS, "weight"), {n_embd, 1}, TENSOR_NOT_REQUIRED);
  2438. cls_b = create_tensor(tn(LLM_TENSOR_CLS, "bias"), {1}, TENSOR_NOT_REQUIRED);
  2439. for (int i = 0; i < n_layer; ++i) {
  2440. auto & layer = layers[i]; // JinaBertLayer
  2441. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  2442. layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, 0);
  2443. layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2444. layer.attn_q_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2445. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  2446. layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, 0);
  2447. layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2448. layer.attn_k_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2449. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  2450. layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, 0);
2451. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0); // output_dense
2452. layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0); // output_dense
2453. layer.attn_out_norm = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}, 0); // output_norm
  2454. layer.attn_out_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "bias", i), {n_embd}, 0);
  2455. layer.attn_norm_2 = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2456. layer.attn_norm_2_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2457. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED);
  2458. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, layer.ffn_gate ? n_ff : n_ff * 2}, 0);
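// when no separate gate tensor is present, the up projection is apparently stored fused
// (gate and up concatenated), hence the doubled width n_ff * 2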
  2459. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
  2460. layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
  2461. layer.layer_out_norm = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd}, 0);
  2462. layer.layer_out_norm_b = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "bias", i), {n_embd}, 0);
  2463. }
  2464. } break;
  2465. case LLM_ARCH_BLOOM:
  2466. {
  2467. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2468. tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0);
  2469. tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}, 0);
  2470. // output
  2471. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2472. output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
  2473. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  2474. // if output is NULL, init from the input tok embed
  2475. if (output == NULL) {
  2476. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  2477. }
  2478. for (int i = 0; i < n_layer; ++i) {
  2479. auto & layer = layers[i];
  2480. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2481. layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
  2482. layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
  2483. layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
  2484. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  2485. layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
  2486. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  2487. layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
  2488. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
  2489. layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
  2490. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  2491. layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
  2492. }
  2493. } break;
  2494. case LLM_ARCH_MPT:
  2495. {
  2496. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2497. pos_embd = create_tensor(tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, n_ctx_train}, TENSOR_NOT_REQUIRED);
  2498. // output
  2499. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2500. output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, TENSOR_NOT_REQUIRED);
  2501. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  2502. if (!output) {
  2503. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); // needs to be on GPU
  2504. }
  2505. for (int i = 0; i < n_layer; ++i) {
  2506. auto & layer = layers[i];
  2507. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2508. layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2509. layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
  2510. layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
  2511. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  2512. layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2513. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  2514. layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2515. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
  2516. layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2517. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  2518. layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
  2519. layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2520. layer.attn_q_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2521. layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2522. layer.attn_k_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2523. // AWQ ScaleActivation layer
  2524. layer.ffn_act = create_tensor(tn(LLM_TENSOR_FFN_ACT, "scales", i), {n_ff}, TENSOR_NOT_REQUIRED);
  2525. }
  2526. } break;
  2527. case LLM_ARCH_STABLELM:
  2528. {
  2529. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2530. // output
  2531. output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
  2532. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2533. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  2534. for (int i = 0; i < n_layer; ++i) {
  2535. auto & layer = layers[i];
  2536. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2537. layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
  2538. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  2539. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  2540. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  2541. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  2542. // optional bias tensors, present in Stable LM 2 1.6B
  2543. layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2544. layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
  2545. layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
  2546. // optional q and k layernorms, present in StableLM 2 12B
  2547. layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k, n_head}, TENSOR_NOT_REQUIRED);
  2548. layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k, n_head_kv}, TENSOR_NOT_REQUIRED);
  2549. // optional FFN norm, not present in StableLM 2 12B which uses parallel residual
  2550. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2551. layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2552. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  2553. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  2554. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  2555. }
  2556. } break;
  2557. case LLM_ARCH_QWEN:
  2558. {
  2559. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2560. // output
  2561. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2562. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  2563. for (int i = 0; i < n_layer; ++i) {
  2564. auto & layer = layers[i];
  2565. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2566. layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd*3}, 0);
  2567. layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd*3}, 0);
  2568. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  2569. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  2570. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff/2}, 0);
  2571. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff/2, n_embd}, 0);
  2572. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff/2}, 0);
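// for Qwen the stored n_ff apparently corresponds to the fused gate+up width, so each
// individual FFN projection here uses n_ff/2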
  2573. }
  2574. } break;
  2575. case LLM_ARCH_QWEN2:
  2576. case LLM_ARCH_QWEN2VL:
  2577. case LLM_ARCH_DREAM:
  2578. {
  2579. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2580. // output
  2581. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2582. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  2583. output_b = create_tensor(tn(LLM_TENSOR_OUTPUT, "bias"), {n_vocab}, TENSOR_NOT_REQUIRED);
  2584. // if output is NULL, init from the input tok embed
  2585. if (output == NULL) {
  2586. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  2587. }
  2588. for (int i = 0; i < n_layer; ++i) {
  2589. auto & layer = layers[i];
  2590. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2591. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  2592. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  2593. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  2594. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
2595. // QKV bias tensors (loaded as required here, unlike the optional biases in other archs)
  2596. layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, 0);
  2597. layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, 0);
  2598. layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, 0);
  2599. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  2600. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  2601. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  2602. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  2603. }
  2604. } break;
  2605. case LLM_ARCH_QWEN2MOE:
  2606. {
  2607. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2608. // output
  2609. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2610. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  2611. for (int i = 0; i < n_layer; ++i) {
  2612. auto & layer = layers[i];
  2613. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2614. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  2615. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  2616. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  2617. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  2618. // optional bias tensors
  2619. layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2620. layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
  2621. layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
  2622. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  2623. layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
  2624. if (n_expert == 0) {
  2625. throw std::runtime_error("n_expert must be > 0 for QWEN2MOE");
  2626. }
  2627. if (n_expert_used == 0) {
  2628. throw std::runtime_error("n_expert_used must be > 0 for QWEN2MOE");
  2629. }
  2630. // MoE branch
  2631. const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
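// if the GGUF provides no per-expert feed-forward size, fall back to n_ff / n_expert_used,
// presumably matching how older conversions derived the expert width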
  2632. layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
  2633. layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
  2634. layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
  2635. // Shared expert branch
  2636. const int64_t n_ff_shexp = hparams.n_ff_shexp ? hparams.n_ff_shexp : n_ff;
  2637. layer.ffn_gate_inp_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP_SHEXP, "weight", i), {n_embd}, 0);
  2638. layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), { n_embd, n_ff_shexp}, 0);
  2639. layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp, n_embd}, 0);
  2640. layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), { n_embd, n_ff_shexp}, 0);
  2641. }
  2642. } break;
  2643. case LLM_ARCH_QWEN3:
  2644. {
  2645. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2646. // output
  2647. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2648. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  2649. // if output is NULL, init from the input tok embed
  2650. if (output == NULL) {
  2651. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  2652. }
  2653. for (int i = 0; i < n_layer; ++i) {
  2654. auto & layer = layers[i];
  2655. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2656. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
  2657. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  2658. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  2659. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
  2660. layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
  2661. layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
  2662. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  2663. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  2664. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  2665. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  2666. }
  2667. } break;
  2668. case LLM_ARCH_QWEN3MOE:
  2669. {
  2670. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2671. // output
  2672. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2673. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  2674. // if output is NULL, init from the input tok embed
  2675. if (output == NULL) {
  2676. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  2677. }
  2678. for (int i = 0; i < n_layer; ++i) {
  2679. auto & layer = layers[i];
  2680. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2681. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
  2682. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  2683. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  2684. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
  2685. layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
  2686. layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
  2687. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  2688. layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
  2689. if (n_expert == 0) {
  2690. throw std::runtime_error("n_expert must be > 0 for QWEN3MOE");
  2691. }
  2692. if (n_expert_used == 0) {
  2693. throw std::runtime_error("n_expert_used must be > 0 for QWEN3MOE");
  2694. }
  2695. // MoE branch
  2696. const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
  2697. layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
  2698. layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
  2699. layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
  2700. }
  2701. } break;
  2702. case LLM_ARCH_PHI2:
  2703. {
  2704. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2705. // output
  2706. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2707. output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
  2708. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  2709. output_b = create_tensor(tn(LLM_TENSOR_OUTPUT, "bias"), {n_vocab}, 0);
  2710. for (int i = 0; i < n_layer; ++i) {
  2711. auto & layer = layers[i];
  2712. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2713. layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
  2714. layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
  2715. layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
  2716. if (layer.wqkv == nullptr) {
  2717. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  2718. layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, 0);
  2719. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  2720. layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, 0);
  2721. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  2722. layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, 0);
  2723. }
  2724. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  2725. layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
  2726. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
  2727. layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
  2728. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  2729. layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
  2730. }
  2731. } break;
  2732. case LLM_ARCH_PHI3:
  2733. {
  2734. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
  2735. // output
  2736. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
  2737. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  2738. // if output is NULL, init from the input tok embed
  2739. if (output == NULL) {
  2740. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  2741. }
  2742. for (int i = 0; i < n_layer; ++i) {
  2743. auto & layer = layers[i];
  2744. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);
  2745. layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), { n_embd, n_embd + 2 * n_embd_gqa }, TENSOR_NOT_REQUIRED);
  2746. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd, n_embd }, 0);
  2747. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd }, 0);
  2748. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd }, 0);
  2749. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, 2 * n_ff }, 0);
  2750. layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), { n_rot/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
  2751. layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), { n_rot/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
  2752. }
  2753. } break;
  2754. case LLM_ARCH_PHIMOE:
  2755. {
  2756. const int64_t n_embd_head = n_embd / n_head;
  2757. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
  2758. // output
  2759. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
  2760. output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
  2761. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab }, 0);
  2762. output_b = create_tensor(tn(LLM_TENSOR_OUTPUT, "bias"), { n_vocab }, 0);
  2763. for (int i = 0; i < n_layer; ++i) {
  2764. auto & layer = layers[i];
  2765. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);
  2766. layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), { n_embd }, 0);
  2767. layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), { n_embd, n_embd + 2 * n_embd_gqa }, TENSOR_NOT_REQUIRED);
  2768. if (layer.wqkv == nullptr) {
  2769. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  2770. layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, 0);
  2771. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  2772. layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, 0);
  2773. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  2774. layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, 0);
  2775. }
  2776. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd, n_embd }, 0);
  2777. layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), { n_embd }, 0);
  2778. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd }, 0);
  2779. layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), { n_embd }, 0);
  2780. layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
  2781. layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
  2782. layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff, n_embd, n_expert}, 0);
  2783. layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
  2784. layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), { n_embd_head/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
  2785. layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), { n_embd_head/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
  2786. }
  2787. } break;
  2788. case LLM_ARCH_PLAMO:
  2789. {
  2790. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2791. // output
  2792. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2793. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  2794. for (int i = 0; i < n_layer; ++i) {
  2795. auto & layer = layers[i];
  2796. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2797. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  2798. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  2799. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  2800. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  2801. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  2802. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  2803. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  2804. }
  2805. } break;
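// PLaMo-2 is a hybrid model: hparams.is_recurrent(i) selects between Mamba-style SSM layers and
// attention layers (fused QKV plus per-head Q/K norms); every layer shares the post-norm/FFN tensors.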
  2806. case LLM_ARCH_PLAMO2:
  2807. {
  2808. const uint32_t d_conv = hparams.ssm_d_conv;
  2809. const uint32_t d_state = hparams.ssm_d_state;
  2810. const uint32_t num_heads = hparams.ssm_dt_rank;
  2811. const uint32_t intermediate_size = hparams.ssm_d_inner;
  2812. const uint32_t head_dim = intermediate_size / num_heads;
  2813. const uint32_t qk_dim = head_dim;
  2814. const uint32_t v_dim = head_dim;
  2815. const int64_t num_attention_heads = hparams.n_head();
  2816. const int64_t q_num_heads = num_attention_heads;
  2817. const int64_t dt_dim = std::max(64, int(hparams.n_embd / 16));
  2818. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2819. // output
  2820. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2821. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  2822. // if output is NULL, init from the input tok embed
  2823. if (output == NULL) {
  2824. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  2825. }
  2826. for (int i = 0; i < n_layer; ++i) {
  2827. auto & layer = layers[i];
  2828. bool is_mamba_layer = hparams.is_recurrent(i);
  2829. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2830. if (is_mamba_layer) {
  2831. layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), {n_embd, 2 * intermediate_size}, 0);
  2832. layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {d_conv, intermediate_size}, 0);
  2833. layer.ssm_x = create_tensor(tn(LLM_TENSOR_SSM_X, "weight", i), {intermediate_size, dt_dim + 2*d_state}, 0);
  2834. layer.ssm_dt = create_tensor(tn(LLM_TENSOR_SSM_DT, "weight", i), {dt_dim, num_heads}, 0);
  2835. layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {num_heads}, 0);
  2836. layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {num_heads}, 0);
  2837. layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {num_heads}, 0);
  2838. layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {intermediate_size, n_embd}, 0);
  2839. layer.ssm_dt_norm = create_tensor(tn(LLM_TENSOR_SSM_DT_NORM, i), {dt_dim}, 0);
  2840. layer.ssm_b_norm = create_tensor(tn(LLM_TENSOR_SSM_B_NORM, i), {d_state}, 0);
  2841. layer.ssm_c_norm = create_tensor(tn(LLM_TENSOR_SSM_C_NORM, i), {d_state}, 0);
  2842. } else {
  2843. const int64_t num_key_value_heads = hparams.n_head_kv(i);
  2844. const int64_t k_num_heads = num_key_value_heads;
  2845. const int64_t v_num_heads = num_key_value_heads;
  2846. const int64_t q_proj_dim = q_num_heads * qk_dim;
  2847. const int64_t k_proj_dim = k_num_heads * qk_dim;
  2848. const int64_t v_proj_dim = v_num_heads * v_dim;
  2849. layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, q_proj_dim + k_proj_dim + v_proj_dim}, 0);
  2850. layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {head_dim, num_attention_heads}, 0);
  2851. layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {head_dim, k_num_heads}, 0);
  2852. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {q_num_heads * v_dim, n_embd}, 0);
  2853. }
  2854. // All layers have post-attention norm, FFN norm, and FFN tensors
  2855. layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, i), {n_embd}, 0);
  2856. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  2857. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
  2858. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff * 2}, 0);
  2859. layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, i), {n_embd}, 0);
  2860. }
  2861. } break;
  2862. case LLM_ARCH_GPT2:
  2863. {
  2864. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2865. pos_embd = create_tensor(tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, n_ctx_train}, 0);
  2866. // output
  2867. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2868. output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
  2869. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  2870. // if output is NULL, init from the input tok embed
  2871. if (output == NULL) {
  2872. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  2873. }
  2874. for (int i = 0; i < n_layer; ++i) {
  2875. auto & layer = layers[i];
  2876. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2877. layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
  2878. layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
  2879. layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
  2880. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  2881. layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
  2882. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  2883. layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
  2884. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
  2885. layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
  2886. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  2887. layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
  2888. }
  2889. } break;
  2890. case LLM_ARCH_CODESHELL:
  2891. {
  2892. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  2893. // if tok embd is NULL, init from output
  2894. if (tok_embd == NULL) {
  2895. tok_embd = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  2896. }
  2897. // output
  2898. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2899. output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
  2900. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  2901. for (int i = 0; i < n_layer; ++i) {
  2902. auto & layer = layers[i];
  2903. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2904. layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
  2905. layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
  2906. layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
  2907. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  2908. layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
  2909. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  2910. layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
  2911. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
  2912. layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
  2913. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  2914. layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
  2915. }
  2916. } break;
  2917. case LLM_ARCH_ORION:
  2918. {
  2919. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2920. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2921. output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
  2922. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  2923. for (int i = 0; i < n_layer; ++i) {
  2924. auto & layer = layers[i];
  2925. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2926. layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
  2927. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  2928. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  2929. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  2930. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  2931. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  2932. layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
  2933. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  2934. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  2935. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  2936. }
  2937. } break;
  2938. case LLM_ARCH_INTERNLM2:
  2939. {
  2940. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2941. // output
  2942. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2943. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  2944. for (int i = 0; i < n_layer; ++i) {
  2945. auto & layer = layers[i];
  2946. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2947. // layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
  2948. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  2949. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  2950. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  2951. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  2952. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  2953. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  2954. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  2955. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  2956. }
  2957. } break;
  2958. case LLM_ARCH_GEMMA:
  2959. {
  2960. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2961. // output
  2962. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2963. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); // same as tok_embd, duplicated to allow offloading
  2964. for (int i = 0; i < n_layer; ++i) {
  2965. auto & layer = layers[i];
  2966. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2967. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
  2968. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  2969. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
  2970. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
  2971. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  2972. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  2973. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  2974. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  2975. }
  2976. } break;
  2977. case LLM_ARCH_GEMMA2:
  2978. {
  2979. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2980. // output
  2981. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2982. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); // same as tok_embd, duplicated to allow offloading
  2983. for (int i = 0; i < n_layer; ++i) {
  2984. auto & layer = layers[i];
  2985. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2986. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
  2987. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  2988. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
  2989. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
  2990. layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
  2991. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  2992. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  2993. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  2994. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  2995. layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
  2996. }
  2997. } break;
  2998. case LLM_ARCH_GEMMA3:
  2999. case LLM_ARCH_GEMMA_EMBEDDING:
  3000. {
  3001. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  3002. // output
  3003. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  3004. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  3005. // if output is NULL, init from the input tok embed
  3006. if (output == NULL) {
  3007. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  3008. }
  3009. for (int i = 0; i < n_layer; ++i) {
  3010. auto & layer = layers[i];
  3011. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  3012. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
  3013. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  3014. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
  3015. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
  3016. layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
  3017. layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
  3018. layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
  3019. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  3020. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  3021. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  3022. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  3023. layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
  3024. }
  3025. } break;
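// Gemma 3n: besides the usual attention/FFN tensors, each layer carries per-layer token embedding
// projections, AltUp (alternating updates) router/correction tensors and LAuReL low-rank residual
// tensors (laurel_l/laurel_r plus a post-norm).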
  3026. case LLM_ARCH_GEMMA3N:
  3027. {
  3028. const int64_t n_altup = hparams.n_altup;
  3029. const int64_t laurel_rank = hparams.laurel_rank;
  3030. const int64_t n_embd_altup = hparams.n_embd_altup;
  3031. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  3032. // if output is NULL, init from the input tok embed
  3033. if (output == NULL) {
  3034. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  3035. }
  3036. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  3037. tok_embd_per_layer = create_tensor(tn(LLM_TENSOR_PER_LAYER_TOKEN_EMBD, "weight"), {n_embd_altup * n_layer, n_vocab}, 0);
  3038. altup_proj = create_tensor(tn(LLM_TENSOR_ALTUP_PROJ, "weight"), {n_embd, n_embd, n_altup - 1}, 0);
  3039. altup_unembd_proj = create_tensor(tn(LLM_TENSOR_ALTUP_UNEMBD_PROJ, "weight"), {n_embd, n_embd, n_altup - 1}, 0);
  3040. per_layer_model_proj = create_tensor(tn(LLM_TENSOR_PER_LAYER_MODEL_PROJ, "weight"), {n_embd, n_embd_altup * n_layer}, 0);
  3041. per_layer_proj_norm = create_tensor(tn(LLM_TENSOR_PER_LAYER_PROJ_NORM, "weight"), {n_embd_altup}, 0);
  3042. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  3043. for (int i = 0; i < n_layer; ++i) {
  3044. auto & layer = layers[i];
  3045. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  3046. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
  3047. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  3048. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
  3049. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
  3050. layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
  3051. layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
  3052. layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
  3053. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  3054. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  3055. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  3056. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  3057. layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
  3058. // altup & laurel
  3059. layer.per_layer_inp_gate = create_tensor(tn(LLM_TENSOR_PER_LAYER_INP_GATE, "weight", i), {n_embd, n_embd_altup}, 0);
  3060. layer.per_layer_proj = create_tensor(tn(LLM_TENSOR_PER_LAYER_PROJ, "weight", i), {n_embd_altup, n_embd}, 0);
  3061. layer.per_layer_post_norm = create_tensor(tn(LLM_TENSOR_PER_LAYER_POST_NORM, "weight", i), {n_embd}, 0);
  3062. layer.altup_correct_coef = create_tensor(tn(LLM_TENSOR_ALTUP_CORRECT_COEF, "weight", i), {n_altup, n_altup}, 0);
  3063. layer.altup_correct_scale = create_tensor(tn(LLM_TENSOR_ALTUP_CORRECT_SCALE, "weight", i), {n_embd}, 0);
  3064. layer.altup_predict_coef = create_tensor(tn(LLM_TENSOR_ALTUP_PREDICT_COEF, "weight", i), {n_altup, n_altup * n_altup}, 0);
  3065. layer.altup_router = create_tensor(tn(LLM_TENSOR_ALTUP_ROUTER, "weight", i), {n_embd, n_altup}, 0);
  3066. layer.altup_router_norm = create_tensor(tn(LLM_TENSOR_ALTUP_ROUTER_NORM, "weight", i), {n_embd}, 0);
  3067. layer.laurel_l = create_tensor(tn(LLM_TENSOR_LAUREL_L, "weight", i), {n_embd, laurel_rank}, 0);
  3068. layer.laurel_r = create_tensor(tn(LLM_TENSOR_LAUREL_R, "weight", i), {laurel_rank, n_embd}, 0);
  3069. layer.laurel_post_norm = create_tensor(tn(LLM_TENSOR_LAUREL_POST_NORM, "weight", i), {n_embd}, 0);
  3070. }
  3071. } break;
  3072. case LLM_ARCH_STARCODER2:
  3073. {
  3074. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  3075. // output
  3076. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  3077. output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
  3078. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  3079. // if output is NULL, init from the input tok embed
  3080. if (output == NULL) {
  3081. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  3082. }
  3083. for (int i = 0; i < n_layer; ++i) {
  3084. auto & layer = layers[i];
  3085. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  3086. layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
  3087. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  3088. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  3089. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  3090. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
// bias tensors
  3092. layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, 0);
  3093. layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, 0);
  3094. layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, 0);
  3095. layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
  3096. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  3097. layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
  3098. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  3099. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
// bias tensors
  3101. layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
  3102. layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP , "bias", i), { n_ff}, 0);
  3103. }
  3104. } break;
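// Mamba (v1): ssm_in projects to x and z (hence 2*d_inner), ssm_x splits into dt, B and C
// (dt_rank + 2*d_state), and ssm_a/ssm_d are stored without a "weight" suffix.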
  3105. case LLM_ARCH_MAMBA:
  3106. {
  3107. const int64_t d_conv = hparams.ssm_d_conv;
  3108. const int64_t d_inner = hparams.ssm_d_inner;
  3109. const int64_t d_state = hparams.ssm_d_state;
  3110. const int64_t dt_rank = hparams.ssm_dt_rank;
  3111. // only an expansion factor of 2 is supported for now
  3112. if (2 * n_embd != d_inner) {
  3113. throw std::runtime_error("only an expansion factor of 2 is supported for now");
  3114. }
  3115. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  3116. // output
  3117. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  3118. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  3119. // if output is NULL, init from the input tok embed, duplicated to allow offloading
  3120. if (output == NULL) {
  3121. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  3122. }
  3123. for (int i = 0; i < n_layer; ++i) {
  3124. auto & layer = layers[i];
  3125. // norm
  3126. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  3127. layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), {n_embd, 2*d_inner}, 0);
  3128. layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {d_conv, d_inner}, 0);
  3129. layer.ssm_conv1d_b = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "bias", i), {d_inner}, 0);
  3130. layer.ssm_x = create_tensor(tn(LLM_TENSOR_SSM_X, "weight", i), {d_inner, dt_rank + 2*d_state}, 0);
  3131. layer.ssm_dt = create_tensor(tn(LLM_TENSOR_SSM_DT, "weight", i), {dt_rank, d_inner}, 0);
  3132. layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {d_inner}, 0);
  3133. // no "weight" suffix for these
  3134. layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {d_state, d_inner}, 0);
  3135. layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {d_inner}, 0);
  3136. // out_proj
  3137. layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {d_inner, n_embd}, 0);
  3138. }
  3139. } break;
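// Mamba-2: the input projection packs z, x, B, C and dt into a single tensor,
// so d_in_proj = 2*d_inner + 2*n_group*d_state + n_head.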
case LLM_ARCH_MAMBA2:
    {
        const int64_t d_conv    = hparams.ssm_d_conv;
        const int64_t d_inner   = hparams.ssm_d_inner;
        const int64_t d_state   = hparams.ssm_d_state;
        const int64_t n_head    = hparams.ssm_dt_rank;
        const int64_t n_group   = hparams.ssm_n_group;
        const int64_t d_in_proj = 2*d_inner + 2*n_group*d_state + n_head;

        // only an expansion factor of 2 is supported for now
        GGML_ASSERT(2 * n_embd == d_inner);

        tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

        // output
        {
            output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
            output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);

            // if output is NULL, init from the input tok embed, duplicated to allow offloading
            if (output == NULL) {
                output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
            }
        }

        for (int i = 0; i < n_layer; ++i) {
            auto & layer = layers[i];

            // norm
            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);

            layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), {n_embd, d_in_proj}, 0);

            layer.ssm_conv1d   = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {d_conv, d_inner + 2*n_group*d_state}, 0);
            layer.ssm_conv1d_b = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "bias",   i), {d_inner + 2*n_group*d_state}, 0);

            layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {n_head}, 0);

            // no "weight" suffix for these
            layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {1, n_head}, 0);
            layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {1, n_head}, 0);

            layer.ssm_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", i), {d_inner / n_group, n_group}, 0);

            // out_proj
            layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {d_inner, n_embd}, 0);
        }
    } break;
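// Jamba interleaves Mamba and attention blocks: a layer with n_head_kv(i) == 0 is a Mamba layer,
// otherwise it is an attention layer; each layer's FFN is either MoE (when a gate_inp tensor is
// present) or a dense gate/down/up MLP.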
  3176. case LLM_ARCH_JAMBA:
  3177. {
  3178. const int64_t d_conv = hparams.ssm_d_conv;
  3179. const int64_t d_inner = hparams.ssm_d_inner;
  3180. const int64_t d_state = hparams.ssm_d_state;
  3181. const int64_t dt_rank = hparams.ssm_dt_rank;
  3182. // only an expansion factor of 2 is supported for now
  3183. GGML_ASSERT(2 * n_embd == d_inner);
  3184. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  3185. // output
  3186. {
  3187. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  3188. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  3189. // if output is NULL, init from the input tok embed, duplicated to allow offloading
  3190. if (output == NULL) {
  3191. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  3192. }
  3193. }
  3194. for (int i = 0; i < n_layer; ++i) {
  3195. const int64_t n_head_kv = hparams.n_head_kv(i);
  3196. const int64_t n_embd_gqa = hparams.n_embd_v_gqa(i);
  3197. auto & layer = layers[i];
  3198. // norm
  3199. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  3200. if (n_head_kv == 0) {
  3201. // Mamba layer
  3202. layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), {n_embd, 2*d_inner}, 0);
  3203. layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {d_conv, d_inner}, 0);
  3204. layer.ssm_conv1d_b = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "bias", i), {d_inner}, 0);
  3205. layer.ssm_x = create_tensor(tn(LLM_TENSOR_SSM_X, "weight", i), {d_inner, dt_rank + 2*d_state}, 0);
  3206. layer.ssm_dt_norm = create_tensor(tn(LLM_TENSOR_SSM_DT_NORM, "weight", i), {dt_rank}, 0);
  3207. layer.ssm_dt = create_tensor(tn(LLM_TENSOR_SSM_DT, "weight", i), {dt_rank, d_inner}, 0);
  3208. layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {d_inner}, 0);
  3209. layer.ssm_b_norm = create_tensor(tn(LLM_TENSOR_SSM_B_NORM, "weight", i), {d_state}, 0);
  3210. layer.ssm_c_norm = create_tensor(tn(LLM_TENSOR_SSM_C_NORM, "weight", i), {d_state}, 0);
  3211. // no "weight" suffix for these
  3212. layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {d_state, d_inner}, 0);
  3213. layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {d_inner}, 0);
  3214. // out_proj
  3215. layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {d_inner, n_embd}, 0);
  3216. } else {
  3217. // Attention layers
  3218. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  3219. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  3220. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  3221. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  3222. }
  3223. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  3224. layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, TENSOR_NOT_REQUIRED);
  3225. if (layer.ffn_gate_inp) {
  3226. // MoE
  3227. layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
  3228. layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff, n_embd, n_expert}, 0);
  3229. layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
  3230. } else {
  3231. // FFN (no MoE)
  3232. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  3233. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
  3234. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  3235. }
  3236. }
  3237. } break;
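// Granite hybrid: hparams.is_recurrent(i) picks Mamba-2 style SSM layers vs attention layers
// (with optional QKV biases); the FFN is MoE when n_expert > 0, optionally with shared experts,
// otherwise a dense MLP with optional biases.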
  3238. case LLM_ARCH_GRANITE_HYBRID:
  3239. {
  3240. // mamba2 Mixer SSM params
  3241. // NOTE: int64_t for tensor dimensions
  3242. const int64_t d_conv = hparams.ssm_d_conv;
  3243. const int64_t d_inner = hparams.ssm_d_inner;
  3244. const int64_t d_state = hparams.ssm_d_state;
  3245. const int64_t n_ssm_head = hparams.ssm_dt_rank;
  3246. const int64_t n_group = hparams.ssm_n_group;
  3247. const int64_t d_in_proj = 2*d_inner + 2*n_group*d_state + n_ssm_head;
  3248. // only an expansion factor of 2 is supported for now
  3249. GGML_ASSERT(2 * n_embd == d_inner);
  3250. // embeddings
  3251. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  3252. // output
  3253. {
  3254. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  3255. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  3256. // if output is NULL, init from the input tok embed, duplicated to allow offloading
  3257. if (output == NULL) {
  3258. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  3259. }
  3260. }
  3261. for (int i = 0; i < n_layer; ++i) {
  3262. auto & layer = layers[i];
  3263. // norm
  3264. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  3265. if (hparams.is_recurrent(i)) {
  3266. // ssm layers
  3267. layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), {n_embd, d_in_proj}, 0);
  3268. layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {d_conv, d_inner + 2*n_group*d_state}, 0);
  3269. layer.ssm_conv1d_b = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "bias", i), {d_inner + 2*n_group*d_state}, TENSOR_NOT_REQUIRED);
  3270. layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {n_ssm_head}, 0);
  3271. // no "weight" suffix for these
  3272. layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {1, n_ssm_head}, 0);
  3273. layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {1, n_ssm_head}, 0);
  3274. layer.ssm_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", i), {d_inner / n_group, n_group}, 0);
  3275. // out_proj
  3276. layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {d_inner, n_embd}, 0);
  3277. } else {
  3278. // attention layers (with optional bias)
  3279. const int64_t n_head_i = hparams.n_head(i);
  3280. const int64_t n_embd_k_gqa_i = hparams.n_embd_k_gqa(i);
  3281. const int64_t n_embd_v_gqa_i = hparams.n_embd_v_gqa(i);
  3282. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head_i}, 0);
  3283. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa_i}, 0);
  3284. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa_i}, 0);
  3285. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head_i, n_embd}, 0);
  3286. layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  3287. layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_k_gqa_i}, TENSOR_NOT_REQUIRED);
  3288. layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_v_gqa_i}, TENSOR_NOT_REQUIRED);
  3289. layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  3290. }
  3291. // feed forward (w/ optional biases)
  3292. if (n_expert > 0) {
  3293. // MoE FFN
  3294. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  3295. layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
  3296. layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
  3297. layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, TENSOR_NOT_REQUIRED);
  3298. layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert}, 0);
  3299. layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
  3300. // For Granite MoE Shared
  3301. if (hparams.n_ff_shexp > 0) {
  3302. layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, 0);
  3303. layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, 0);
  3304. layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {hparams.n_ff_shexp, n_embd}, 0);
  3305. }
  3306. } else {
  3307. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  3308. layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
  3309. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  3310. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  3311. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  3312. layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
  3313. layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  3314. layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
  3315. }
  3316. }
  3317. } break;
  3318. case LLM_ARCH_XVERSE:
  3319. {
  3320. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  3321. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  3322. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  3323. for (int i = 0; i < n_layer; ++i) {
  3324. auto & layer = layers[i];
  3325. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  3326. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  3327. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  3328. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  3329. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  3330. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  3331. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  3332. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  3333. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  3334. }
  3335. } break;
  3336. case LLM_ARCH_COMMAND_R:
  3337. {
  3338. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  3339. // output
  3340. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  3341. // init output from the input tok embed
  3342. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  3343. for (int i = 0; i < n_layer; ++i) {
  3344. auto & layer = layers[i];
  3345. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
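// per-head Q/K norms only exist in the larger checkpoints (presumably Command R+); n_layer >= 64 is used as the discriminator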
  3346. if (n_layer >= 64){
  3347. layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k, n_head}, 0);
  3348. layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k, n_head_kv}, 0);
  3349. }
  3350. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  3351. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  3352. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  3353. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  3354. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  3355. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  3356. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  3357. }
  3358. } break;
case LLM_ARCH_COHERE2:
    {
        tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

        // output
        output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
        // init output from the input tok embed
        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);

        for (int i = 0; i < n_layer; ++i) {
            auto & layer = layers[i];

            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);

            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd}, 0);
            layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0);
            layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);
            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);

            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
            layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff}, 0);
        }
    } break;
  3380. case LLM_ARCH_OLMO: // adapted from LLM_ARCH_LLAMA with norm params removed
  3381. {
  3382. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  3383. // output
  3384. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  3385. // if output is NULL, init from the input tok embed
  3386. if (output == NULL) {
  3387. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  3388. }
  3389. for (int i = 0; i < n_layer; ++i) {
  3390. auto & layer = layers[i];
  3391. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  3392. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  3393. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  3394. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  3395. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  3396. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  3397. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  3398. }
  3399. } break;
  3400. case LLM_ARCH_OLMO2:
  3401. {
  3402. const int64_t n_embd_head = n_embd / n_head;
  3403. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  3404. // output
  3405. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  3406. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  3407. for (int i = 0; i < n_layer; ++i) {
  3408. auto & layer = layers[i];
  3409. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  3410. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  3411. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  3412. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  3413. layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, 0);
  3414. layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_head_kv * n_embd_head}, 0);
  3415. layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
  3416. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  3417. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  3418. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  3419. layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
  3420. }
  3421. } break;
  3422. case LLM_ARCH_SEED_OSS:
  3423. {
  3424. const uint32_t head_dim = hparams.n_embd_head_k;
  3425. const int64_t n_qo_dim = n_head * head_dim;
  3426. const int64_t n_kv_dim = n_head_kv * head_dim;
  3427. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  3428. // output
  3429. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  3430. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  3431. // if output is NULL, init from the input tok embed
  3432. if (output == NULL) {
  3433. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  3434. }
  3435. for (int i = 0; i < n_layer; ++i) {
  3436. auto & layer = layers[i];
  3437. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_qo_dim}, 0);
  3438. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_kv_dim}, 0);
  3439. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_kv_dim}, 0);
  3440. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_qo_dim, n_embd}, 0);
  3441. layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_qo_dim}, TENSOR_NOT_REQUIRED);
  3442. layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_kv_dim}, TENSOR_NOT_REQUIRED);
  3443. layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_kv_dim}, TENSOR_NOT_REQUIRED);
  3444. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  3445. layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
  3446. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  3447. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  3448. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  3449. }
  3450. } break;
  3451. case LLM_ARCH_OLMOE:
  3452. {
  3453. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  3454. // output
  3455. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  3456. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  3457. for (int i = 0; i < n_layer; ++i) {
  3458. auto & layer = layers[i];
  3459. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  3460. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  3461. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  3462. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  3463. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  3464. layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, 0);
  3465. layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, 0);
  3466. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  3467. layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
  3468. if (n_expert == 0) {
  3469. throw std::runtime_error("n_expert must be > 0");
  3470. }
  3471. if (n_expert_used == 0) {
  3472. throw std::runtime_error("n_expert_used must be > 0");
  3473. }
  3474. // MoE branch
  3475. layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
  3476. layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff, n_embd, n_expert}, 0);
  3477. layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
  3478. }
  3479. } break;
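// OpenELM scales the number of heads and the FFN width per layer, so n_head, n_head_kv and n_ff
// are re-read from hparams for each layer below.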
  3480. case LLM_ARCH_OPENELM:
  3481. {
  3482. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  3483. // output
  3484. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  3485. // init output from the input tok embed
  3486. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  3487. for (int i = 0; i < n_layer; ++i) {
  3488. const int64_t n_head = hparams.n_head(i);
  3489. const int64_t n_head_qkv = 2*hparams.n_head_kv(i) + n_head;
  3490. const int64_t n_ff = hparams.n_ff(i);
  3491. auto & layer = layers[i];
  3492. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  3493. layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_head_qkv*n_embd_head_k}, 0);
  3494. layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
  3495. layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
  3496. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_head*n_embd_head_k, n_embd}, 0);
  3497. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  3498. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  3499. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
  3500. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  3501. }
  3502. } break;
  3503. case LLM_ARCH_GPTNEOX:
  3504. {
  3505. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  3506. // output
  3507. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  3508. output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
  3509. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  3510. for (int i = 0; i < n_layer; ++i) {
  3511. auto & layer = layers[i];
  3512. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  3513. layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
  3514. layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
  3515. layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
  3516. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  3517. layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
  3518. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  3519. layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
  3520. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
  3521. layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
  3522. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  3523. layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
  3524. }
  3525. } break;
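// Snowflake Arctic keeps both a dense gate/down/up MLP and a wide MoE branch in every layer
// (dense-MoE hybrid); ffn_norm_exps appears to normalise the input of the expert branch.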
case LLM_ARCH_ARCTIC:
    {
        tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

        // output
        output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
        output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);

        // if output is NULL, init from the input tok embed
        if (output == NULL) {
            output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
        }

        for (int i = 0; i < n_layer; ++i) {
            auto & layer = layers[i];

            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);

            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd}, 0);
            layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0);
            layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);
            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);

            layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_embd}, 0);
            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_embd, n_embd}, 0);
            layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_embd}, 0);

            layer.ffn_gate_inp  = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP,  "weight", i), {n_embd, n_expert}, 0);
            layer.ffn_norm_exps = create_tensor(tn(LLM_TENSOR_FFN_NORM_EXPS, "weight", i), {n_embd}, 0);
            layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
            layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff, n_embd, n_expert}, 0);
            layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {n_embd, n_ff, n_expert}, 0);
        }
    } break;
  3554. case LLM_ARCH_DEEPSEEK:
  3555. {
  3556. const int64_t n_ff_exp = hparams.n_ff_exp;
  3557. const int64_t n_expert_shared = hparams.n_expert_shared;
  3558. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  3559. // output
  3560. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  3561. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  3562. for (int i = 0; i < n_layer; ++i) {
  3563. auto & layer = layers[i];
  3564. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  3565. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  3566. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  3567. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  3568. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  3569. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  3570. if (i < (int) hparams.n_layer_dense_lead) {
  3571. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  3572. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  3573. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  3574. } else {
  3575. layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
  3576. if (n_expert == 0) {
  3577. throw std::runtime_error("n_expert must be > 0");
  3578. }
  3579. if (n_expert_used == 0) {
  3580. throw std::runtime_error("n_expert_used must be > 0");
  3581. }
  3582. // MoE branch
  3583. layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
  3584. layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
  3585. layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
  3586. // Shared expert branch
  3587. layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
  3588. layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_exp * n_expert_shared, n_embd}, 0);
  3589. layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
  3590. }
  3591. }
  3592. } break;
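// DeepSeek V2/V3 (MLA): Q and KV go through low-rank down/up projections; the 27-layer "lite"
// variant skips the low-rank Q path and uses a direct Q projection. Newer GGUFs store the split
// wk_b/wv_b tensors, legacy files store the fused wkv_b (see the note below).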
  3593. case LLM_ARCH_DEEPSEEK2:
  3594. {
  3595. const bool is_lite = (hparams.n_layer == 27);
  3596. const bool is_mla = (hparams.n_embd_head_k_mla != 0 && hparams.n_embd_head_v_mla != 0);
  3597. // note: these are the actual head sizes you get when treating as MHA or after "decompression" using wv_b for MLA
  3598. const int64_t n_embd_head_k_mla = is_mla ? hparams.n_embd_head_k_mla : hparams.n_embd_head_k;
  3599. const int64_t n_embd_head_v_mla = is_mla ? hparams.n_embd_head_v_mla : hparams.n_embd_head_v;
  3600. const int64_t n_embd_head_qk_rope = hparams.n_rot;
  3601. const int64_t n_embd_head_qk_nope = n_embd_head_k_mla - n_embd_head_qk_rope;
  3602. const int64_t q_lora_rank = hparams.n_lora_q;
  3603. const int64_t kv_lora_rank = hparams.n_lora_kv;
  3604. const int64_t n_ff_exp = hparams.n_ff_exp;
  3605. const int64_t n_expert_shared = hparams.n_expert_shared;
  3606. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  3607. // output
  3608. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  3609. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  3610. for (int i = 0; i < n_layer; ++i) {
  3611. auto & layer = layers[i];
  3612. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  3613. if (!is_lite) {
  3614. layer.attn_q_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_A_NORM, "weight", i), {q_lora_rank}, 0);
  3615. }
  3616. layer.attn_kv_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_NORM, "weight", i), {kv_lora_rank}, 0);
  3617. if (!is_lite) {
  3618. layer.wq_a = create_tensor(tn(LLM_TENSOR_ATTN_Q_A, "weight", i), {n_embd, q_lora_rank}, 0);
  3619. layer.wq_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_B, "weight", i), {q_lora_rank, n_head * n_embd_head_k_mla}, 0);
  3620. } else {
  3621. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_head * n_embd_head_k_mla}, 0);
  3622. }
  3623. layer.wkv_a_mqa = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + n_embd_head_qk_rope}, 0);
  3624. // note: only old legacy GGUF files will have the unsplit wkv_b tensor in
  3625. if (is_mla) {
  3626. layer.wk_b = create_tensor(tn(LLM_TENSOR_ATTN_K_B, "weight", i), {n_embd_head_qk_nope, kv_lora_rank, n_head}, 0);
  3627. layer.wv_b = create_tensor(tn(LLM_TENSOR_ATTN_V_B, "weight", i), {kv_lora_rank, n_embd_head_v_mla, n_head}, 0);
  3628. } else {
  3629. layer.wkv_b = create_tensor(tn(LLM_TENSOR_ATTN_KV_B, "weight", i), {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v_mla)}, 0);
  3630. }
  3631. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_head * n_embd_head_v_mla, n_embd}, 0);
  3632. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
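// the first n_layer_dense_lead layers use a dense FFN, the remaining layers use the MoE branch below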
  3633. if (i < (int) hparams.n_layer_dense_lead) {
  3634. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  3635. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  3636. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  3637. } else {
  3638. layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
  3639. layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, TENSOR_NOT_REQUIRED);
  3640. if (n_expert == 0) {
  3641. throw std::runtime_error("n_expert must be > 0");
  3642. }
  3643. if (n_expert_used == 0) {
  3644. throw std::runtime_error("n_expert_used must be > 0");
  3645. }
  3646. // MoE branch
  3647. layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
  3648. layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
  3649. layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
  3650. // Shared expert branch
  3651. layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
  3652. layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_exp * n_expert_shared, n_embd}, 0);
  3653. layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
  3654. }
  3655. }
  3656. } break;
  3657. case LLM_ARCH_PLM:
  3658. {
  3659. const int64_t n_embd_head_qk_rope = hparams.n_rot;
  3660. const int64_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
  3661. const int64_t kv_lora_rank = hparams.n_lora_kv;
  3662. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  3663. // output
  3664. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  3665. // output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  3666. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  3667. for (int i = 0; i < n_layer; ++i) {
  3668. auto & layer = layers[i];
  3669. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  3670. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
  3671. layer.wkv_a_mqa = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + (n_embd_head_qk_rope)}, 0);
  3672. layer.attn_kv_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_NORM, "weight", i), {kv_lora_rank}, 0);
  3673. layer.wkv_b = create_tensor(tn(LLM_TENSOR_ATTN_KV_B, "weight", i), {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)}, 0);
  3674. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_head * ( n_embd_head_v), n_embd}, 0);
  3675. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  3676. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  3677. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  3678. }
  3679. } break;
  3680. case LLM_ARCH_BITNET:
  3681. {
  3682. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  3683. // output
  3684. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  3685. for (int i = 0; i < n_layer; ++i) {
  3686. auto & layer = layers[i];
  3687. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  3688. layer.attn_sub_norm = create_tensor(tn(LLM_TENSOR_ATTN_SUB_NORM, "weight", i), {n_embd}, 0);
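// each projection weight below has an optional scalar "scale" companion tensor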
  3689. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  3690. layer.wq_scale = create_tensor(tn(LLM_TENSOR_ATTN_Q, "scale", i), {1}, TENSOR_NOT_REQUIRED);
  3691. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  3692. layer.wk_scale = create_tensor(tn(LLM_TENSOR_ATTN_K, "scale", i), {1}, TENSOR_NOT_REQUIRED);
  3693. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  3694. layer.wv_scale = create_tensor(tn(LLM_TENSOR_ATTN_V, "scale", i), {1}, TENSOR_NOT_REQUIRED);
  3695. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  3696. layer.wo_scale = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "scale", i), {1}, TENSOR_NOT_REQUIRED);
  3697. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  3698. layer.ffn_sub_norm = create_tensor(tn(LLM_TENSOR_FFN_SUB_NORM, "weight", i), {n_ff}, 0);
  3699. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  3700. layer.ffn_gate_scale = create_tensor(tn(LLM_TENSOR_FFN_GATE, "scale", i), {1}, TENSOR_NOT_REQUIRED);
  3701. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
  3702. layer.ffn_down_scale = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "scale", i), {1}, TENSOR_NOT_REQUIRED);
  3703. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  3704. layer.ffn_up_scale = create_tensor(tn(LLM_TENSOR_FFN_UP, "scale", i), {1}, TENSOR_NOT_REQUIRED);
  3705. }
  3706. } break;
  3707. case LLM_ARCH_T5:
  3708. {
  3709. const auto n_rel_attn_bkts = hparams.n_rel_attn_bkts;
  3710. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  3711. // output
  3712. output_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_OUTPUT_NORM, "weight"), {n_embd}, 0);
  3713. output_norm = create_tensor(tn(LLM_TENSOR_DEC_OUTPUT_NORM, "weight"), {n_embd}, 0);
  3714. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  3715. // if output is NULL, init from the input tok embed
  3716. if (output == NULL) {
  3717. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  3718. }
  3719. for (int i = 0; i < n_layer; ++i) {
  3720. auto & layer = layers[i];
  3721. layer.attn_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_NORM, "weight", i), {n_embd}, 0);
  3722. layer.attn_rel_b_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_REL_B, "weight", i), {n_head, n_rel_attn_bkts}, TENSOR_NOT_REQUIRED);
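// relative position bias: one bias value per (head, relative-position bucket) pair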
  3723. layer.wq_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_Q, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  3724. layer.wk_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  3725. layer.wv_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
  3726. layer.wo_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_OUT, "weight", i), {n_embd_v_gqa, n_embd}, 0);
  3727. layer.ffn_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_NORM, "weight", i), {n_embd}, 0);
  3728. layer.ffn_gate_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_GATE, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED);
  3729. layer.ffn_down_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  3730. layer.ffn_up_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  3731. layer.attn_norm = create_tensor(tn(LLM_TENSOR_DEC_ATTN_NORM, "weight", i), {n_embd}, 0);
  3732. layer.attn_rel_b = create_tensor(tn(LLM_TENSOR_DEC_ATTN_REL_B, "weight", i), {n_head, n_rel_attn_bkts}, TENSOR_NOT_REQUIRED);
  3733. layer.wq = create_tensor(tn(LLM_TENSOR_DEC_ATTN_Q, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  3734. layer.wk = create_tensor(tn(LLM_TENSOR_DEC_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  3735. layer.wv = create_tensor(tn(LLM_TENSOR_DEC_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
  3736. layer.wo = create_tensor(tn(LLM_TENSOR_DEC_ATTN_OUT, "weight", i), {n_embd_v_gqa, n_embd}, 0);
  3737. layer.attn_norm_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_NORM, "weight", i), {n_embd}, 0);
  3738. // this tensor seems to be unused in HF transformers implementation
  3739. layer.attn_rel_b_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_REL_B, "weight", i), {n_head, n_rel_attn_bkts}, TENSOR_NOT_REQUIRED);
  3740. layer.wq_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_Q, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  3741. layer.wk_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  3742. layer.wv_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
  3743. layer.wo_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_OUT, "weight", i), {n_embd_v_gqa, n_embd}, 0);
  3744. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_DEC_FFN_NORM, "weight", i), {n_embd}, 0);
  3745. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_DEC_FFN_GATE, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED);
  3746. layer.ffn_down = create_tensor(tn(LLM_TENSOR_DEC_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  3747. layer.ffn_up = create_tensor(tn(LLM_TENSOR_DEC_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  3748. }
  3749. } break;
  3750. case LLM_ARCH_T5ENCODER:
  3751. {
  3752. const auto n_rel_attn_bkts = hparams.n_rel_attn_bkts;
  3753. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  3754. // output
  3755. output_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_OUTPUT_NORM, "weight"), {n_embd}, 0);
  3756. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  3757. // if output is NULL, init from the input tok embed
  3758. if (output == NULL) {
  3759. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  3760. }
  3761. for (int i = 0; i < n_layer; ++i) {
  3762. auto & layer = layers[i];
  3763. layer.attn_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_NORM, "weight", i), {n_embd}, 0);
  3764. layer.attn_rel_b_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_REL_B, "weight", i), {n_head, n_rel_attn_bkts}, TENSOR_NOT_REQUIRED);
  3765. layer.wq_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_Q, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  3766. layer.wk_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  3767. layer.wv_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
  3768. layer.wo_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_OUT, "weight", i), {n_embd_v_gqa, n_embd}, 0);
  3769. layer.ffn_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_NORM, "weight", i), {n_embd}, 0);
  3770. layer.ffn_gate_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_GATE, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED);
  3771. layer.ffn_down_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  3772. layer.ffn_up_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  3773. }
  3774. } break;
  3775. case LLM_ARCH_JAIS:
  3776. {
  3777. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  3778. // output
  3779. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  3780. output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
  3781. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  3782. for (int i = 0; i < n_layer; ++i) {
  3783. auto & layer = layers[i];
  3784. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  3785. layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
  3786. layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
  3787. layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
  3788. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  3789. layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
  3790. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  3791. layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
  3792. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
  3793. layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
  3794. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  3795. layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, 0);
  3796. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  3797. layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
  3798. }
  3799. } break;
  3800. case LLM_ARCH_CHATGLM:
  3801. {
  3802. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  3803. // output
  3804. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  3805. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  3806. // if output is NULL, init from the input tok embed
  3807. if (output == NULL) {
  3808. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  3809. }
  3810. for (int i = 0; i < n_layer; ++i) {
  3811. auto & layer = layers[i];
  3812. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  3813. layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
  3814. layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
  3815. if (layer.wqkv == nullptr) {
  3816. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
  3817. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  3818. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
  3819. layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  3820. layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
  3821. layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
  3822. }
  3823. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  3824. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  3825. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff * 2}, 0);
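// note: the 2*n_ff width suggests that the gate and up projections are stored fused in a single ffn_up tensor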
  3826. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
  3827. }
  3828. } break;
  3829. case LLM_ARCH_GLM4:
  3830. {
  3831. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  3832. // output
  3833. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  3834. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  3835. // if output is NULL, init from the input tok embed
  3836. if (output == NULL) {
  3837. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  3838. }
  3839. for (int i = 0; i < n_layer; ++i) {
  3840. auto & layer = layers[i];
  3841. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  3842. layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
  3843. layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
  3844. if (layer.wqkv == nullptr) {
  3845. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
  3846. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  3847. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
  3848. layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  3849. layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
  3850. layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
  3851. }
  3852. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  3853. layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
  3854. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  3855. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  3856. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff * 2}, 0);
  3857. layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
  3858. }
  3859. } break;
  3860. case LLM_ARCH_GLM4_MOE:
  3861. {
  3862. const int64_t n_expert = hparams.n_expert;
  3863. const int64_t n_expert_used = hparams.n_expert_used;
  3864. const int64_t n_expert_shared = hparams.n_expert_shared;
  3865. GGML_ASSERT(hparams.n_expert > 0 && "n_expert must be > 0 for GLM4_MOE MoE layers");
  3866. GGML_ASSERT(hparams.n_expert_used > 0 && "n_expert_used must be > 0 for GLM4_MOE MoE layers");
  3867. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
  3868. // output
  3869. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
  3870. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED);
  3871. // if output is NULL, init from the input tok embed
  3872. if (output == NULL) {
  3873. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, TENSOR_DUPLICATED);
  3874. }
  3875. // Load ALL tensors including NextN layer to satisfy total tensor count
  3876. // but only PROCESS up to last layer (skipping final NextN layer) in forward pass
  3877. for (int i = 0; i < n_layer; ++i) {
  3878. int flags = 0;
  3879. if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
  3880. // skip all tensors in the NextN layers
  3881. flags |= TENSOR_SKIP;
  3882. }
  3883. auto & layer = layers[i];
  3884. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, flags);
  3885. // GLM-style attention with bias terms
  3886. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd_head_k * n_head }, flags);
  3887. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_k_gqa }, flags);
  3888. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_v_gqa }, flags);
  3889. layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), { n_embd_head_k * n_head }, flags);
  3890. layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), { n_embd_k_gqa }, flags);
  3891. layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), { n_embd_v_gqa }, flags);
  3892. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, flags);
  3893. // K/Q norm tensors (optional for GLM-4.5 355B variant)
  3894. layer.attn_q_norm = create_tensor(
  3895. tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), { n_embd_head_k }, TENSOR_NOT_REQUIRED | flags);
  3896. layer.attn_k_norm = create_tensor(
  3897. tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), { n_embd_head_k }, TENSOR_NOT_REQUIRED | flags);
  3898. layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), { n_embd }, flags);
  3899. // Check if this layer uses MoE or dense FFN based on n_layer_dense_lead
  3900. // GLM 4.5 uses hybrid architecture: layer 0 is dense, layers 1+ are MoE
  3901. const bool use_moe = (static_cast<uint32_t>(i) >= hparams.n_layer_dense_lead);
  3902. if (use_moe) {
  3903. // MoE layers
  3904. layer.ffn_gate_inp =
  3905. create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), { n_embd, n_expert }, flags);
  3906. layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), { n_expert }, flags);
  3907. // MoE branch
  3908. const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
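// if n_ff_exp is not set in the hparams, approximate the per-expert FFN size as n_ff / n_expert_used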
  3909. layer.ffn_gate_exps = create_tensor(
  3910. tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, flags);
  3911. layer.ffn_down_exps = create_tensor(
  3912. tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff_exp, n_embd, n_expert }, flags);
  3913. layer.ffn_up_exps = create_tensor(
  3914. tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, flags);
  3915. // Shared expert
  3916. if (n_expert_shared > 0) {
  3917. const int64_t n_ff_shexp = n_ff_exp * n_expert_shared;
  3918. layer.ffn_gate_shexp = create_tensor(
  3919. tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), { n_embd, n_ff_shexp }, flags);
  3920. layer.ffn_down_shexp = create_tensor(
  3921. tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_shexp, n_embd }, flags);
  3922. layer.ffn_up_shexp = create_tensor(
  3923. tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), { n_embd, n_ff_shexp }, flags);
  3924. }
  3925. } else {
  3926. // Dense layers (first k layers) - GLM uses separate gate/up projections
  3927. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), { n_embd, n_ff }, flags);
  3928. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd }, flags);
  3929. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, n_ff }, flags);
  3930. }
  3931. // NextN/MTP tensors (preserved but unused) - conditionally load for last nextn_predict_layers
  3932. if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
  3933. layer.nextn.eh_proj = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "weight", i), { 2 * n_embd, n_embd }, flags);
  3934. layer.nextn.embed_tokens = create_tensor(tn(LLM_TENSOR_NEXTN_EMBED_TOKENS, "weight", i), { n_embd, n_vocab }, flags);
  3935. layer.nextn.enorm = create_tensor(tn(LLM_TENSOR_NEXTN_ENORM, "weight", i), { n_embd }, flags);
  3936. layer.nextn.hnorm = create_tensor(tn(LLM_TENSOR_NEXTN_HNORM, "weight", i), { n_embd }, flags);
  3937. layer.nextn.shared_head_head = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "weight", i), { n_embd, n_vocab }, flags);
  3938. layer.nextn.shared_head_norm = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "weight", i), { n_embd }, flags);
  3939. }
  3940. }
  3941. }
  3942. break;
  3943. case LLM_ARCH_NEMOTRON:
  3944. {
  3945. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  3946. // output
  3947. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  3948. output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
  3949. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  3950. for (int i = 0; i < n_layer; ++i) {
  3951. auto & layer = layers[i];
  3952. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  3953. layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
  3954. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  3955. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  3956. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  3957. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  3958. // optional bias tensors
  3959. layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  3960. layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
  3961. layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
  3962. layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  3963. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  3964. layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
  3965. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  3966. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  3967. // optional MLP bias
  3968. layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  3969. layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
  3970. }
  3971. } break;
  3972. case LLM_ARCH_NEMOTRON_H:
  3973. {
  3974. // mamba2 Mixer SSM params
  3975. // NOTE: int64_t for tensor dimensions
  3976. const int64_t d_conv = hparams.ssm_d_conv;
  3977. const int64_t d_inner = hparams.ssm_d_inner;
  3978. const int64_t d_state = hparams.ssm_d_state;
  3979. const int64_t n_ssm_head = hparams.ssm_dt_rank;
  3980. const int64_t n_group = hparams.ssm_n_group;
  3981. const int64_t d_in_proj = 2*d_inner + 2*n_group*d_state + n_ssm_head;
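// d_in_proj packs the mamba2 input projection outputs, presumably z (d_inner), x (d_inner), B and C (n_group*d_state each) and dt (n_ssm_head)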
  3982. // embeddings
  3983. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  3984. // output
  3985. {
  3986. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  3987. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  3988. // if output is NULL, init from the input tok embed, duplicated to allow offloading
  3989. if (output == NULL) {
  3990. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  3991. }
  3992. }
  3993. for (int i = 0; i < n_layer; ++i) {
  3994. auto & layer = layers[i];
  3995. // all blocks use the attn norm
  3996. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  3997. if (hparams.is_recurrent(i)) {
  3998. // ssm layers
  3999. layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), {n_embd, d_in_proj}, 0);
  4000. layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {d_conv, d_inner + 2*n_group*d_state}, 0);
  4001. layer.ssm_conv1d_b = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "bias", i), {d_inner + 2*n_group*d_state}, TENSOR_NOT_REQUIRED);
  4002. layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {n_ssm_head}, 0);
  4003. // no "weight" suffix for these
  4004. layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {1, n_ssm_head}, 0);
  4005. layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {1, n_ssm_head}, 0);
  4006. layer.ssm_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", i), {d_inner / n_group, n_group}, 0);
  4007. // out_proj
  4008. layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {d_inner, n_embd}, 0);
  4009. } else if (hparams.n_ff(i) == 0) {
  4010. // attention layers (with optional bias)
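// (attention blocks are identified by n_ff(i) == 0; the remaining layers are mlp blocks)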
  4011. const int64_t n_head_i = hparams.n_head(i);
  4012. const int64_t n_embd_k_gqa_i = hparams.n_embd_k_gqa(i);
  4013. const int64_t n_embd_v_gqa_i = hparams.n_embd_v_gqa(i);
  4014. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head_i}, 0);
  4015. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa_i}, 0);
  4016. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa_i}, 0);
  4017. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head_i, n_embd}, 0);
  4018. layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  4019. layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_k_gqa_i}, TENSOR_NOT_REQUIRED);
  4020. layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_v_gqa_i}, TENSOR_NOT_REQUIRED);
  4021. layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  4022. } else {
  4023. // mlp layers
  4024. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { hparams.n_ff(i), n_embd}, 0);
  4025. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, hparams.n_ff(i)}, 0);
  4026. layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  4027. layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {hparams.n_ff(i)}, TENSOR_NOT_REQUIRED);
  4028. }
  4029. }
  4030. } break;
  4031. case LLM_ARCH_EXAONE:
  4032. {
  4033. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  4034. // output
  4035. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  4036. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  4037. // if output is NULL, init from the input tok embed
  4038. if (output == NULL) {
  4039. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  4040. }
  4041. for (int i = 0; i < n_layer; ++i) {
  4042. auto & layer = layers[i];
  4043. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  4044. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
  4045. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  4046. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
  4047. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
  4048. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  4049. layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
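// note: rope_freqs is presumably the same data for every layer, so layers after the first load it as TENSOR_DUPLICATED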
  4050. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  4051. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  4052. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  4053. }
  4054. } break;
  4055. case LLM_ARCH_EXAONE4:
  4056. {
  4057. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  4058. // output
  4059. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  4060. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  4061. // if output is NULL, init from the input tok embed
  4062. if (output == NULL) {
  4063. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  4064. }
  4065. for (int i = 0; i < n_layer; ++i) {
  4066. auto & layer = layers[i];
  4067. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
  4068. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  4069. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
  4070. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  4071. layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
  4072. layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
  4073. layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
  4074. layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
  4075. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  4076. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  4077. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  4078. layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
  4079. }
  4080. } break;
  4081. case LLM_ARCH_RWKV6:
  4082. {
  4083. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  4084. // Block 0, LN0
  4085. tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0);
  4086. tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}, 0);
  4087. // output
  4088. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  4089. output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
  4090. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  4091. const int time_mix_extra_dim = hparams.time_mix_extra_dim;
  4092. const int time_decay_extra_dim = hparams.time_decay_extra_dim;
  4093. const int head_size = hparams.wkv_head_size;
  4094. const int attn_hidden_size = n_embd;
  4095. const int ffn_size = hparams.n_ff_arr[0];
  4096. for (int i = 0; i < n_layer; ++i) {
  4097. auto & layer = layers[i];
  4098. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  4099. layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
  4100. layer.attn_norm_2 = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, 0);
  4101. layer.attn_norm_2_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, 0);
  4102. layer.time_mix_w1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W1, "weight", i), {n_embd, time_mix_extra_dim * 5}, 0);
  4103. layer.time_mix_w2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W2, "weight", i), {time_mix_extra_dim, n_embd, 5}, 0);
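// the factor 5 matches the five time-mix lerp targets (w, k, v, r, g) loaded below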
  4104. layer.time_mix_lerp_x = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_X, "weight", i), {n_embd, 1, 1}, 0);
  4105. layer.time_mix_lerp_w = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_W, "weight", i), {n_embd, 1, 1}, TENSOR_NOT_REQUIRED);
  4106. layer.time_mix_lerp_k = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_K, "weight", i), {n_embd, 1, 1}, TENSOR_NOT_REQUIRED);
  4107. layer.time_mix_lerp_v = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_V, "weight", i), {n_embd, 1, 1}, TENSOR_NOT_REQUIRED);
  4108. layer.time_mix_lerp_r = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_R, "weight", i), {n_embd, 1, 1}, TENSOR_NOT_REQUIRED);
  4109. layer.time_mix_lerp_g = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_G, "weight", i), {n_embd, 1, 1}, TENSOR_NOT_REQUIRED);
  4110. layer.time_mix_lerp_fused = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_FUSED, "weight", i), {n_embd, 1, 1, 5}, TENSOR_NOT_REQUIRED);
  4111. GGML_ASSERT(!(layer.time_mix_lerp_fused == NULL && layer.time_mix_lerp_w == NULL));
  4112. layer.time_mix_first = create_tensor(tn(LLM_TENSOR_TIME_MIX_FIRST, "weight", i), {head_size, n_embd / head_size}, 0);
  4113. layer.time_mix_decay = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY, "weight", i), {n_embd}, 0);
  4114. layer.time_mix_decay_w1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY_W1, "weight", i), {n_embd, time_decay_extra_dim}, 0);
  4115. layer.time_mix_decay_w2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY_W2, "weight", i), {time_decay_extra_dim, attn_hidden_size}, 0);
  4116. layer.time_mix_key = create_tensor(tn(LLM_TENSOR_TIME_MIX_KEY, "weight", i), {attn_hidden_size, n_embd}, 0);
  4117. layer.time_mix_value = create_tensor(tn(LLM_TENSOR_TIME_MIX_VALUE, "weight", i), {attn_hidden_size, n_embd}, 0);
  4118. layer.time_mix_receptance = create_tensor(tn(LLM_TENSOR_TIME_MIX_RECEPTANCE, "weight", i), {attn_hidden_size, n_embd}, 0);
  4119. layer.time_mix_gate = create_tensor(tn(LLM_TENSOR_TIME_MIX_GATE, "weight", i), {attn_hidden_size, n_embd}, 0);
  4120. layer.time_mix_ln = create_tensor(tn(LLM_TENSOR_TIME_MIX_LN, "weight", i), {n_embd}, 0);
  4121. layer.time_mix_ln_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_LN, "bias", i), {n_embd}, 0);
  4122. layer.time_mix_output = create_tensor(tn(LLM_TENSOR_TIME_MIX_OUTPUT, "weight", i), {n_embd, attn_hidden_size}, 0);
  4123. layer.channel_mix_lerp_k = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_LERP_K, "weight", i), {n_embd, 1, 1}, 0);
  4124. layer.channel_mix_lerp_r = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_LERP_R, "weight", i), {n_embd, 1, 1}, 0);
  4125. layer.channel_mix_key = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_KEY, "weight", i), {n_embd, ffn_size}, 0);
  4126. layer.channel_mix_value = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_VALUE, "weight", i), {ffn_size, n_embd}, 0);
  4127. layer.channel_mix_receptance = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_RECEPTANCE, "weight", i), {n_embd, n_embd}, 0);
  4128. }
  4129. } break;
  4130. case LLM_ARCH_RWKV6QWEN2:
  4131. {
  4132. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  4133. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  4134. output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, TENSOR_NOT_REQUIRED);
  4135. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  4136. const int time_mix_extra_dim = hparams.time_mix_extra_dim;
  4137. const int time_decay_extra_dim = hparams.time_decay_extra_dim;
  4138. const int head_size = hparams.wkv_head_size;
  4139. const int attn_hidden_size = n_embd;
  4140. const int n_head_kv = hparams.n_head_kv();
  4141. int attn_key_value_size;
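// with GQA (n_head_kv smaller than the number of wkv heads) the key/value projections are narrower than the hidden size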
  4142. if (n_head_kv == 0 || attn_hidden_size / head_size == n_head_kv) {
  4143. attn_key_value_size = attn_hidden_size;
  4144. } else {
  4145. attn_key_value_size = n_head_kv * head_size;
  4146. }
  4147. for (int i = 0; i < n_layer; ++i) {
  4148. auto & layer = layers[i];
  4149. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  4150. layer.time_mix_w1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W1, "weight", i), {n_embd, time_mix_extra_dim * 5}, 0);
  4151. layer.time_mix_w2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W2, "weight", i), {time_mix_extra_dim, n_embd, 5}, 0);
  4152. layer.time_mix_lerp_x = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_X, "weight", i), {n_embd, 1, 1}, 0);
  4153. layer.time_mix_lerp_fused = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_FUSED, "weight", i), {n_embd, 1, 1, 5}, 0);
  4154. layer.time_mix_first = create_tensor(tn(LLM_TENSOR_TIME_MIX_FIRST, "weight", i), {head_size, n_embd / head_size}, TENSOR_NOT_REQUIRED);
  4155. layer.time_mix_decay = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY, "weight", i), {n_embd}, 0);
  4156. layer.time_mix_decay_w1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY_W1, "weight", i), {n_embd, time_decay_extra_dim}, 0);
  4157. layer.time_mix_decay_w2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY_W2, "weight", i), {time_decay_extra_dim, attn_hidden_size}, 0);
  4158. layer.time_mix_key = create_tensor(tn(LLM_TENSOR_TIME_MIX_KEY, "weight", i), {n_embd, attn_key_value_size}, 0);
  4159. layer.time_mix_value = create_tensor(tn(LLM_TENSOR_TIME_MIX_VALUE, "weight", i), {n_embd, attn_key_value_size}, 0);
  4160. layer.time_mix_receptance = create_tensor(tn(LLM_TENSOR_TIME_MIX_RECEPTANCE, "weight", i), {attn_hidden_size, n_embd}, 0);
  4161. layer.time_mix_gate = create_tensor(tn(LLM_TENSOR_TIME_MIX_GATE, "weight", i), {attn_hidden_size, n_embd}, 0);
  4162. // optional bias tensors
  4163. layer.time_mix_key_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_KEY, "bias", i), {attn_key_value_size}, TENSOR_NOT_REQUIRED);
  4164. layer.time_mix_value_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_VALUE, "bias", i), {attn_key_value_size}, TENSOR_NOT_REQUIRED);
  4165. layer.time_mix_receptance_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_RECEPTANCE, "bias", i), {attn_hidden_size}, TENSOR_NOT_REQUIRED);
  4166. layer.time_mix_output = create_tensor(tn(LLM_TENSOR_TIME_MIX_OUTPUT, "weight", i), {n_embd, attn_hidden_size}, 0);
  4167. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  4168. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  4169. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  4170. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  4171. }
  4172. } break;
  4173. case LLM_ARCH_RWKV7:
  4174. {
  4175. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  4176. // Block 0, LN0
  4177. tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0);
  4178. tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}, 0);
  4179. // output
  4180. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  4181. output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
  4182. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  4183. const int n_lora_decay = hparams.n_lora_decay;
  4184. const int n_lora_iclr = hparams.n_lora_iclr;
  4185. const int n_lora_value_res_mix = hparams.n_lora_value_res_mix;
  4186. const int n_lora_gate = hparams.n_lora_gate;
  4187. const int attn_hidden_size = n_embd;
  4188. const int ffn_size = hparams.n_ff_arr[0];
  4189. for (int i = 0; i < n_layer; ++i) {
  4190. auto & layer = layers[i];
  4191. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  4192. layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
  4193. layer.attn_norm_2 = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, 0);
  4194. layer.attn_norm_2_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, 0);
  4195. layer.time_mix_w0 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W0, "weight", i), {n_embd}, 0);
  4196. layer.time_mix_w1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W1, "weight", i), {n_embd, n_lora_decay}, 0);
  4197. layer.time_mix_w2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W2, "weight", i), {n_lora_decay, n_embd}, 0);
  4198. layer.time_mix_a0 = create_tensor(tn(LLM_TENSOR_TIME_MIX_A0, "weight", i), {n_embd}, 0);
  4199. layer.time_mix_a1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_A1, "weight", i), {n_embd, n_lora_iclr}, 0);
  4200. layer.time_mix_a2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_A2, "weight", i), {n_lora_iclr, n_embd}, 0);
  4201. if (i == 0) {
// note: loaded for layer 0 as well, but not actually used
  4203. layer.time_mix_v0 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V0, "weight", i), {n_embd}, 0);
  4204. layer.time_mix_v1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V1, "weight", i), {n_embd, n_lora_iclr}, 0);
  4205. layer.time_mix_v2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V2, "weight", i), {n_lora_iclr, n_embd}, 0);
  4206. } else {
  4207. layer.time_mix_v0 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V0, "weight", i), {n_embd}, 0);
  4208. layer.time_mix_v1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V1, "weight", i), {n_embd, n_lora_value_res_mix}, 0);
  4209. layer.time_mix_v2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V2, "weight", i), {n_lora_value_res_mix, n_embd}, 0);
  4210. }
  4211. layer.time_mix_g1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_G1, "weight", i), {n_embd, n_lora_gate}, 0);
  4212. layer.time_mix_g2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_G2, "weight", i), {n_lora_gate, n_embd}, 0);
  4213. layer.time_mix_lerp_fused = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_FUSED, "weight", i), {n_embd, 1, 1, 6}, 0);
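// 6 lerp components, presumably one each for r, w, k, v, a and g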
  4214. layer.time_mix_k_k = create_tensor(tn(LLM_TENSOR_TIME_MIX_K_K, "weight", i), {attn_hidden_size}, 0);
  4215. layer.time_mix_k_a = create_tensor(tn(LLM_TENSOR_TIME_MIX_K_A, "weight", i), {attn_hidden_size}, 0);
  4216. layer.time_mix_r_k = create_tensor(tn(LLM_TENSOR_TIME_MIX_R_K, "weight", i), {attn_hidden_size}, 0);
  4217. layer.time_mix_key = create_tensor(tn(LLM_TENSOR_TIME_MIX_KEY, "weight", i), {attn_hidden_size, n_embd}, 0);
  4218. layer.time_mix_value = create_tensor(tn(LLM_TENSOR_TIME_MIX_VALUE, "weight", i), {attn_hidden_size, n_embd}, 0);
  4219. layer.time_mix_receptance = create_tensor(tn(LLM_TENSOR_TIME_MIX_RECEPTANCE, "weight", i), {attn_hidden_size, n_embd}, 0);
  4220. layer.time_mix_ln = create_tensor(tn(LLM_TENSOR_TIME_MIX_LN, "weight", i), {n_embd}, 0);
  4221. layer.time_mix_ln_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_LN, "bias", i), {n_embd}, 0);
  4222. layer.time_mix_output = create_tensor(tn(LLM_TENSOR_TIME_MIX_OUTPUT, "weight", i), {n_embd, attn_hidden_size}, 0);
  4223. layer.channel_mix_lerp_k = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_LERP_K, "weight", i), {n_embd, 1, 1}, 0);
  4224. layer.channel_mix_key = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_KEY, "weight", i), {n_embd, ffn_size}, 0);
  4225. layer.channel_mix_value = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_VALUE, "weight", i), {ffn_size, n_embd}, 0);
  4226. }
  4227. } break;
  4228. case LLM_ARCH_ARWKV7:
  4229. {
  4230. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  4231. // output
  4232. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  4233. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  4234. const int n_lora_decay = hparams.n_lora_decay;
  4235. const int n_lora_iclr = hparams.n_lora_iclr;
  4236. const int n_lora_value_res_mix = hparams.n_lora_value_res_mix;
  4237. const int n_lora_gate = hparams.n_lora_gate;
  4238. const int attn_hidden_size = n_embd;
  4239. for (int i = 0; i < n_layer; ++i) {
  4240. auto & layer = layers[i];
  4241. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  4242. layer.time_mix_w0 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W0, "weight", i), {n_embd}, 0);
  4243. layer.time_mix_w1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W1, "weight", i), {n_embd, n_lora_decay}, 0);
  4244. layer.time_mix_w2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W2, "weight", i), {n_lora_decay, n_embd}, 0);
  4245. layer.time_mix_a0 = create_tensor(tn(LLM_TENSOR_TIME_MIX_A0, "weight", i), {n_embd}, 0);
  4246. layer.time_mix_a1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_A1, "weight", i), {n_embd, n_lora_iclr}, 0);
  4247. layer.time_mix_a2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_A2, "weight", i), {n_lora_iclr, n_embd}, 0);
  4248. if (i == 0) {
// note: loaded for layer 0 as well, but not actually used
  4250. layer.time_mix_v0 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V0, "weight", i), {n_embd}, 0);
  4251. layer.time_mix_v1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V1, "weight", i), {n_embd, n_lora_iclr}, 0);
  4252. layer.time_mix_v2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V2, "weight", i), {n_lora_iclr, n_embd}, 0);
  4253. } else {
  4254. layer.time_mix_v0 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V0, "weight", i), {n_embd}, 0);
  4255. layer.time_mix_v1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V1, "weight", i), {n_embd, n_lora_value_res_mix}, 0);
  4256. layer.time_mix_v2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V2, "weight", i), {n_lora_value_res_mix, n_embd}, 0);
  4257. }
  4258. layer.time_mix_g1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_G1, "weight", i), {n_embd, n_lora_gate}, TENSOR_NOT_REQUIRED);
  4259. layer.time_mix_g2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_G2, "weight", i), {n_lora_gate, n_embd}, TENSOR_NOT_REQUIRED);
  4260. try {
  4261. layer.time_mix_lerp_fused = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_FUSED, "weight", i), {n_embd, 1, 1, 6}, 0);
  4262. } catch(std::runtime_error & e) {
// ARWKV models may not have gate tensors, in which case the fused lerp tensor has 5 components instead of 6
  4264. layer.time_mix_lerp_fused = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_FUSED, "weight", i), {n_embd, 1, 1, 5}, 0);
  4265. }
  4266. layer.time_mix_k_k = create_tensor(tn(LLM_TENSOR_TIME_MIX_K_K, "weight", i), {attn_hidden_size}, 0);
  4267. layer.time_mix_k_a = create_tensor(tn(LLM_TENSOR_TIME_MIX_K_A, "weight", i), {attn_hidden_size}, 0);
  4268. layer.time_mix_r_k = create_tensor(tn(LLM_TENSOR_TIME_MIX_R_K, "weight", i), {attn_hidden_size}, 0);
  4269. layer.time_mix_key = create_tensor(tn(LLM_TENSOR_TIME_MIX_KEY, "weight", i), {attn_hidden_size, n_embd}, 0);
  4270. layer.time_mix_value = create_tensor(tn(LLM_TENSOR_TIME_MIX_VALUE, "weight", i), {attn_hidden_size, n_embd}, 0);
  4271. layer.time_mix_receptance = create_tensor(tn(LLM_TENSOR_TIME_MIX_RECEPTANCE, "weight", i), {attn_hidden_size, n_embd}, 0);
  4272. layer.time_mix_ln = create_tensor(tn(LLM_TENSOR_TIME_MIX_LN, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
  4273. layer.time_mix_ln_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_LN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  4274. layer.time_mix_output = create_tensor(tn(LLM_TENSOR_TIME_MIX_OUTPUT, "weight", i), {n_embd, attn_hidden_size}, 0);
  4275. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  4276. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  4277. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  4278. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  4279. }
  4280. } break;
  4281. case LLM_ARCH_CHAMELEON:
  4282. {
  4283. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  4284. // output
  4285. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  4286. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  4287. // if output is NULL, init from the input tok embed
  4288. if (output == NULL) {
  4289. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  4290. }
  4291. for (int i = 0; i < n_layer; ++i) {
  4292. auto & layer = layers[i];
  4293. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  4294. layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k, n_head}, 0);
  4295. layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k, n_head_kv}, 0);
  4296. layer.attn_q_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {n_embd_head_k, n_head}, TENSOR_NOT_REQUIRED);
  4297. layer.attn_k_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {n_embd_head_k, n_head_kv}, TENSOR_NOT_REQUIRED);
  4298. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  4299. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  4300. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  4301. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  4302. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  4303. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  4304. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  4305. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  4306. }
  4307. } break;
  4308. case LLM_ARCH_WAVTOKENIZER_DEC:
  4309. {
  4310. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {hparams.n_embd_features, n_vocab}, 0);
  4311. conv1d = create_tensor(tn(LLM_TENSOR_CONV1D, "weight"), {7, hparams.n_embd_features, hparams.posnet.n_embd}, 0);
  4312. conv1d_b = create_tensor(tn(LLM_TENSOR_CONV1D, "bias"), {1, hparams.posnet.n_embd}, 0);
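// the leading 7 is presumably the 1d convolution kernel width over the input features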
  4313. // posnet
  4314. {
  4315. const int64_t n_embd = hparams.posnet.n_embd;
  4316. for (uint32_t i = 0; i < hparams.posnet.n_layer; ++i) {
  4317. auto & layer = layers[i].posnet;
  4318. // posnet:
  4319. //
  4320. // - resnet
  4321. // - resnet
  4322. // - attn
  4323. // - resnet
  4324. // - resnet
  4325. // - norm
  4326. //
  4327. switch (i) {
  4328. case 0:
  4329. case 1:
  4330. case 3:
  4331. case 4:
  4332. {
  4333. layer.norm1 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "weight", i), {1, n_embd}, 0);
  4334. layer.norm1_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "bias", i), {1, n_embd}, 0);
  4335. layer.conv1 = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "weight", i), {3, n_embd, n_embd}, 0);
  4336. layer.conv1_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "bias", i), {1, n_embd}, 0);
  4337. layer.norm2 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "weight", i), {1, n_embd}, 0);
  4338. layer.norm2_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "bias", i), {1, n_embd}, 0);
  4339. layer.conv2 = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "weight", i), {3, n_embd, n_embd}, 0);
  4340. layer.conv2_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "bias", i), {1, n_embd}, 0);
  4341. } break;
  4342. case 2:
  4343. {
  4344. layer.attn_norm = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "weight", i), {1, n_embd}, 0);
  4345. layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "bias", i), {1, n_embd}, 0);
  4346. layer.attn_q = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_Q, "weight", i), {1, n_embd, n_embd}, 0);
  4347. layer.attn_q_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_Q, "bias", i), {1, n_embd}, 0);
  4348. layer.attn_k = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_K, "weight", i), {1, n_embd, n_embd}, 0);
  4349. layer.attn_k_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_K, "bias", i), {1, n_embd}, 0);
  4350. layer.attn_v = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_V, "weight", i), {1, n_embd, n_embd}, 0);
  4351. layer.attn_v_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_V, "bias", i), {1, n_embd}, 0);
  4352. layer.attn_o = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_OUT, "weight", i), {1, n_embd, n_embd}, 0);
  4353. layer.attn_o_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_OUT, "bias", i), {1, n_embd}, 0);
  4354. } break;
  4355. case 5:
  4356. {
  4357. layer.norm = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "weight", i), {1, n_embd}, 0);
  4358. layer.norm_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "bias", i), {1, n_embd}, 0);
  4359. } break;
  4360. default: GGML_ABORT("unknown posnet layer");
  4361. };
  4362. }
  4363. }
  4364. GGML_ASSERT(hparams.posnet.n_embd == hparams.convnext.n_embd);
  4365. tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {hparams.posnet.n_embd}, 0);
  4366. tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {hparams.posnet.n_embd}, 0);
  4367. // convnext
  4368. {
  4369. const int64_t n_embd = hparams.convnext.n_embd;
  4370. for (uint32_t i = 0; i < hparams.convnext.n_layer; ++i) {
  4371. auto & layer = layers[i].convnext;
  4372. layer.dw = create_tensor(tn(LLM_TENSOR_CONVNEXT_DW, "weight", i), {7, 1, n_embd}, 0);
  4373. layer.dw_b = create_tensor(tn(LLM_TENSOR_CONVNEXT_DW, "bias", i), {1, n_embd}, 0);
  4374. layer.norm = create_tensor(tn(LLM_TENSOR_CONVNEXT_NORM, "weight", i), {n_embd}, 0);
  4375. layer.norm_b = create_tensor(tn(LLM_TENSOR_CONVNEXT_NORM, "bias", i), {n_embd}, 0);
  4376. layer.pw1 = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW1, "weight", i), {n_embd, n_ff}, 0);
  4377. layer.pw1_b = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW1, "bias", i), {n_ff}, 0);
  4378. layer.pw2 = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW2, "weight", i), {n_ff, n_embd}, 0);
  4379. layer.pw2_b = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW2, "bias", i), {n_embd}, 0);
  4380. layer.gamma = create_tensor(tn(LLM_TENSOR_CONVNEXT_GAMMA, "weight", i), {n_embd}, 0);
  4381. }
  4382. // output
  4383. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  4384. output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
  4385. }
  4386. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {hparams.convnext.n_embd, n_embd}, 0);
  4387. output_b = create_tensor(tn(LLM_TENSOR_OUTPUT, "bias"), {n_embd}, 0);
  4388. } break;
  4389. case LLM_ARCH_BAILINGMOE:
  4390. {
  4391. const int64_t n_ff_exp = hparams.n_ff_exp;
  4392. const int64_t n_expert_shared = hparams.n_expert_shared;
  4393. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  4394. // output
  4395. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  4396. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  4397. for (int i = 0; i < n_layer; ++i) {
  4398. auto & layer = layers[i];
  4399. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  4400. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_head * n_rot}, 0);
  4401. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_head_kv * n_rot}, 0);
  4402. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_head_kv * n_rot}, 0);
  4403. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_head * n_rot, n_embd}, 0);
  4404. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  4405. layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
  4406. if (n_expert == 0) {
  4407. throw std::runtime_error("n_expert must be > 0");
  4408. }
  4409. if (n_expert_used == 0) {
  4410. throw std::runtime_error("n_expert_used must be > 0");
  4411. }
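// routed expert weights are stored stacked: ne = {n_in, n_out, n_expert}, one 2D matrix per expert along dim 2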
  4412. layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
  4413. layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
  4414. layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
  4415. layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
  4416. layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_exp * n_expert_shared, n_embd}, 0);
  4417. layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
  4418. }
  4419. } break;
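// dots1: the first n_layer_dense_lead layers use a dense FFN, the remaining layers use a routed MoE plus a shared-expert branch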
  4420. case LLM_ARCH_DOTS1:
  4421. {
  4422. const int64_t n_ff_exp = hparams.n_ff_exp;
  4423. const int64_t n_expert_shared = hparams.n_expert_shared;
  4424. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  4425. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  4426. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  4427. for (int i = 0; i < n_layer; ++i) {
  4428. auto & layer = layers[i];
  4429. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  4430. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
  4431. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
  4432. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
  4433. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
  4434. layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
  4435. layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
  4436. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  4437. if (i < (int) hparams.n_layer_dense_lead) {
  4438. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  4439. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  4440. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  4441. } else {
  4442. layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
  4443. layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, TENSOR_NOT_REQUIRED);
  4444. if (n_expert == 0) {
  4445. throw std::runtime_error("n_expert must be > 0");
  4446. }
  4447. if (n_expert_used == 0) {
  4448. throw std::runtime_error("n_expert_used must be > 0");
  4449. }
  4450. // MoE branch
  4451. layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
  4452. layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
  4453. layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
  4454. // Shared expert branch
  4455. layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
  4456. layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_exp * n_expert_shared, n_embd}, 0);
  4457. layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
  4458. }
  4459. }
  4460. } break;
  4461. case LLM_ARCH_ARCEE:
  4462. {
  4463. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  4464. // output
  4465. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  4466. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  4467. // if output is NULL, init from the input tok embed
  4468. if (output == NULL) {
  4469. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  4470. }
  4471. for (int i = 0; i < n_layer; ++i) {
  4472. auto & layer = layers[i];
  4473. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  4474. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
  4475. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  4476. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
  4477. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
  4478. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
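// optional RoPE frequency factors (llama3-style scaling); the same tensor is shared by all layers, so only layer 0 owns it and later layers mark it as a duplicate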
  4479. layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
  4480. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  4481. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  4482. }
  4483. } break;
  4484. case LLM_ARCH_ERNIE4_5:
  4485. case LLM_ARCH_ERNIE4_5_MOE:
  4486. {
  4487. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  4488. // output
  4489. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  4490. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  4491. // if output is NULL, init from the input tok embed
  4492. if (output == NULL) {
  4493. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  4494. }
  4495. for (int i = 0; i < n_layer; ++i) {
  4496. auto & layer = layers[i];
  4497. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  4498. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
  4499. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  4500. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  4501. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
  4502. // optional bias tensors
  4503. layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  4504. layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
  4505. layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
  4506. layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  4507. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  4508. if (arch == LLM_ARCH_ERNIE4_5_MOE && static_cast<uint32_t>(i) >= hparams.n_layer_dense_lead) { // MoE layers
  4509. int n_ff_exp = hparams.n_ff_exp;
  4510. layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
  4511. layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, TENSOR_NOT_REQUIRED);
  4512. layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, TENSOR_NOT_REQUIRED);
  4513. layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff_exp, n_embd, n_expert}, 0);
  4514. layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, 0);
  4515. // Shared expert (if present)
  4516. if (hparams.n_ff_shexp > 0) {
  4517. layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), { n_embd, hparams.n_ff_shexp}, 0);
  4518. layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {hparams.n_ff_shexp, n_embd }, 0);
  4519. layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), { n_embd, hparams.n_ff_shexp}, 0);
  4520. }
  4521. } else { // Dense layers
  4522. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  4523. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  4524. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  4525. }
  4526. }
  4527. } break;
  4528. case LLM_ARCH_FALCON_H1:
  4529. {
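// falcon-h1 is a hybrid architecture: every layer carries both a mamba2 SSM mixer and a regular attention block, followed by a gated FFN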
  4530. // Common
  4531. const int64_t hidden_size = hparams.n_embd; // hidden_size
  4532. // mamba2 Mixer SSM params
  4533. const int64_t ssm_conv_kernel_size = hparams.ssm_d_conv; // ssm_conv_kernel_size
  4534. const int64_t ssm_n_groups = hparams.ssm_n_group; // ssm_n_groups
  4535. const int64_t ssm_state_size = hparams.ssm_d_state; // ssm_state_size
  4536. const int64_t ssm_intermediate_size = hparams.ssm_d_inner; // TODO expand
  4537. const int64_t ssm_num_heads = hparams.ssm_dt_rank; // ssm_num_heads
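// conv input = x (d_inner) + B and C (n_groups * d_state each); in_proj additionally emits the z gate (d_inner) and one dt value per head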
  4538. const int64_t ssm_conv_dim = ssm_intermediate_size + 2 * ssm_n_groups * ssm_state_size;
  4539. const int64_t ssm_projection_size = ssm_intermediate_size + ssm_conv_dim + ssm_num_heads;
  4540. // attn params
const int64_t attn_num_attention_head = hparams.n_head(0);
  4542. const int64_t attn_num_key_value_head = hparams.n_head_kv(0);
  4543. // ffn params
  4544. const int64_t ffn_intermediate_size = hparams.n_ff(0);
  4545. // embeddings
  4546. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {hidden_size, n_vocab}, 0);
  4547. // output
  4548. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {hidden_size, n_vocab}, TENSOR_NOT_REQUIRED);
  4549. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {hidden_size}, 0);
  4550. // if output is NULL, init from the input tok embed
  4551. if (output == NULL) {
  4552. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {hidden_size, n_vocab}, TENSOR_DUPLICATED);
  4553. }
  4554. for (int i = 0; i < n_layer; ++i) {
  4555. auto & layer = layers[i];
// SSM layers
  4557. // ssm in
  4558. layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), {hidden_size, ssm_projection_size}, 0);
  4559. // ssm 1d conv
  4560. layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {ssm_conv_kernel_size, ssm_conv_dim}, 0);
  4561. layer.ssm_conv1d_b = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "bias", i), {ssm_conv_dim}, TENSOR_NOT_REQUIRED);
  4562. // ssm_dt
  4563. layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {ssm_num_heads}, 0);
  4564. // no "weight" suffix for these
  4565. layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {1, ssm_num_heads}, 0);
  4566. layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {1, ssm_num_heads}, 0);
  4567. // ssm_norm
  4568. layer.ssm_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", i), {ssm_intermediate_size / ssm_n_groups, ssm_n_groups}, TENSOR_NOT_REQUIRED);
  4569. // out_proj
  4570. layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {ssm_intermediate_size, hidden_size}, 0);
// attention layers (with optional bias)
  4573. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {hidden_size, n_embd_head_k * attn_num_attention_head}, 0);
  4574. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {hidden_size, attn_num_key_value_head * n_embd_head_k}, 0);
  4575. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {hidden_size, attn_num_key_value_head * n_embd_head_v}, 0);
  4576. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * attn_num_attention_head, hidden_size}, 0);
  4577. layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {hidden_size}, TENSOR_NOT_REQUIRED);
  4578. layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {attn_num_key_value_head * n_embd_head_k}, TENSOR_NOT_REQUIRED);
  4579. layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {attn_num_key_value_head * n_embd_head_v}, TENSOR_NOT_REQUIRED);
  4580. layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {hidden_size}, TENSOR_NOT_REQUIRED);
  4581. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {hidden_size}, 0);
  4582. // feed forward (w/ optional biases)
  4583. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, i), {hidden_size}, 0);
  4584. layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
  4585. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {hidden_size, ffn_intermediate_size}, 0);
  4586. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { ffn_intermediate_size, hidden_size}, 0);
  4587. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {hidden_size, ffn_intermediate_size}, 0);
  4588. layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {ffn_intermediate_size}, TENSOR_NOT_REQUIRED);
  4589. layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {hidden_size}, TENSOR_NOT_REQUIRED);
  4590. layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {ffn_intermediate_size}, TENSOR_NOT_REQUIRED);
  4591. }
  4592. } break;
  4593. case LLM_ARCH_HUNYUAN_MOE:
  4594. {
  4595. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  4596. // output
  4597. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  4598. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  4599. // if output is NULL, init from the input tok embed
  4600. if (output == NULL) {
  4601. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  4602. }
  4603. for (int i = 0; i < n_layer; ++i) {
  4604. auto & layer = layers[i];
  4605. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  4606. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
  4607. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  4608. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
  4609. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
  4610. layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
  4611. layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
  4612. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  4613. layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
  4614. layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
  4615. layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert}, 0);
  4616. layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
  4617. layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, 0);
  4618. layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, 0);
  4619. layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {hparams.n_ff_shexp, n_embd}, 0);
  4620. }
  4621. } break;
  4622. case LLM_ARCH_HUNYUAN_DENSE:
  4623. {
  4624. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  4625. // output
  4626. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  4627. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  4628. // if output is NULL, init from the input tok embed
  4629. if (output == NULL) {
  4630. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  4631. }
  4632. for (int i = 0; i < n_layer; ++i) {
  4633. auto & layer = layers[i];
  4634. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  4635. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
  4636. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  4637. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
  4638. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
  4639. layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
  4640. layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
  4641. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  4642. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  4643. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  4644. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  4645. }
  4646. } break;
  4647. case LLM_ARCH_SMOLLM3:
  4648. {
  4649. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  4650. // output
  4651. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  4652. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  4653. // if output is NULL, init from the input tok embed
  4654. if (output == NULL) {
  4655. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  4656. }
  4657. for (int i = 0; i < n_layer; ++i) {
  4658. auto & layer = layers[i];
  4659. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  4660. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
  4661. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  4662. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
  4663. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
  4664. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  4665. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  4666. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  4667. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  4668. }
  4669. } break;
  4670. case LLM_ARCH_OPENAI_MOE:
  4671. {
  4672. const int64_t n_ff_exp = hparams.n_ff_exp;
  4673. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  4674. // output
  4675. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  4676. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  4677. for (int i = 0; i < n_layer; ++i) {
  4678. auto & layer = layers[i];
  4679. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  4680. layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
  4681. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_head * n_rot}, 0);
  4682. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_head_kv * n_rot}, 0);
  4683. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_head_kv * n_rot}, 0);
  4684. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_head * n_rot, n_embd}, 0);
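// attention sinks: one learned scalar per head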
  4685. layer.attn_sinks = create_tensor(tn(LLM_TENSOR_ATTN_SINKS, "weight", i), {n_head}, 0);
  4686. layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), { n_embd, n_expert}, 0);
  4687. layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
  4688. layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
  4689. layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
  4690. // bias
  4691. layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_head * n_rot}, 0);
  4692. layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_head_kv * n_rot}, 0);
  4693. layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_head_kv * n_rot}, 0);
  4694. layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
  4695. layer.ffn_gate_inp_b = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "bias", i), {n_expert}, 0);
  4696. layer.ffn_gate_exps_b = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "bias", i), {n_ff_exp, n_expert}, 0);
  4697. layer.ffn_down_exps_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "bias", i), { n_embd, n_expert}, 0);
  4698. layer.ffn_up_exps_b = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "bias", i), {n_ff_exp, n_expert}, 0);
  4699. }
  4700. } break;
  4701. case LLM_ARCH_LFM2:
  4702. {
  4703. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  4704. tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0);
  4705. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  4706. if (output == NULL) {
  4707. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  4708. }
  4709. for (int i = 0; i < n_layer; ++i) {
  4710. auto & layer = layers[i];
  4711. // ffn is same for transformer and conv layers
  4712. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  4713. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  4714. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  4715. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  4716. // for operator_norm
  4717. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
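// LFM2 interleaves two layer types: short-convolution (recurrent) layers and full attention layers, selected per layer via hparams.is_recurrent(i)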
  4718. if (!hparams.is_recurrent(i)) {
  4719. layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
  4720. layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
  4721. GGML_ASSERT(n_embd_v_gqa == n_embd_k_gqa);
  4722. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  4723. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, hparams.n_embd_k_gqa(i)}, 0);
  4724. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, hparams.n_embd_v_gqa(i)}, 0);
  4725. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  4726. } else {
  4727. layer.shortconv.conv = create_tensor(tn(LLM_TENSOR_SHORTCONV_CONV, "weight", i), {hparams.n_shortconv_l_cache, n_embd}, 0);
  4728. layer.shortconv.in_proj = create_tensor(tn(LLM_TENSOR_SHORTCONV_INPROJ, "weight", i), {n_embd, 3 * n_embd}, 0);
  4729. layer.shortconv.out_proj = create_tensor(tn(LLM_TENSOR_SHORTCONV_OUTPROJ, "weight", i), {n_embd, n_embd}, 0);
  4730. }
  4731. }
  4732. } break;
  4733. case LLM_ARCH_SMALLTHINKER:
  4734. {
  4735. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
  4736. // output
  4737. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
  4738. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  4739. // if output is NULL, init from the input tok embed
  4740. if (output == NULL) {
  4741. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  4742. }
  4743. for (int i = 0; i < n_layer; ++i) {
  4744. auto & layer = layers[i];
  4745. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);
  4746. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd_head_k * n_head }, 0);
  4747. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_gqa }, 0);
  4748. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_gqa }, 0);
  4749. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, 0);
  4750. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd }, 0);
  4751. GGML_ASSERT(n_expert > 0 && "n_expert must be > 0 for SMALLTHINKER");
  4752. GGML_ASSERT(n_expert_used > 0 && "n_expert_used must be > 0 for SMALLTHINKER");
  4753. // MoE branch
  4754. const int64_t n_ff_exp = hparams.n_ff_exp;
  4755. layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), { n_embd, n_expert }, 0);
  4756. layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, 0);
  4757. layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff_exp, n_embd, n_expert }, 0);
  4758. layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, 0);
  4759. }
  4760. } break;
  4761. default:
  4762. throw std::runtime_error("unknown architecture");
  4763. }
  4764. if (n_moved_tensors > 0) {
  4765. LLAMA_LOG_DEBUG("%s: tensor '%s' (%s) (and %d others) cannot be used with preferred buffer type %s, using %s instead\n",
  4766. __func__, first_moved_tensor->name, ggml_type_name(first_moved_tensor->type), n_moved_tensors - 1,
  4767. ggml_backend_buft_name(first_moved_from_buft), ggml_backend_buft_name(first_moved_to_buft));
  4768. }
  4769. }
  4770. ml.done_getting_tensors();
  4771. ml.init_mappings(true, use_mlock ? &pimpl->mlock_mmaps : nullptr);
  4772. pimpl->mappings.reserve(ml.mappings.size());
  4773. // create the backend buffers
  4774. std::vector<std::pair<ggml_context *, llama_buf_map>> ctx_bufs;
  4775. ctx_bufs.reserve(ctx_map.size());
// ensure we have enough capacity for the maximum number of backend buffers we may create
  4777. const size_t n_max_backend_buffer = ctx_map.size() * ml.files.size();
  4778. pimpl->bufs.reserve(n_max_backend_buffer);
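// two allocation paths below: host-pointer buffers created directly over the mmap'd file regions (one per file), or a regular allocation from the buffer type (tensor data is copied in later by load_all_data)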
  4779. for (auto & it : ctx_map) {
  4780. ggml_backend_buffer_type_t buft = it.first;
  4781. ggml_context * ctx = it.second;
  4782. // skip contexts without tensors
  4783. if (ggml_get_first_tensor(ctx) == nullptr) {
  4784. continue;
  4785. }
  4786. llama_buf_map buf_map;
  4787. buf_map.reserve(n_max_backend_buffer);
  4788. // check if it is possible to use buffer_from_host_ptr with this buffer type
  4789. ggml_backend_dev_t dev = ggml_backend_buft_get_device(buft);
  4790. if (!dev) {
  4791. // FIXME: workaround for CPU backend buft having a NULL device
  4792. dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
  4793. if (!dev) {
  4794. throw std::runtime_error(format("%s: no CPU backend found", __func__));
  4795. }
  4796. }
  4797. ggml_backend_dev_props props;
  4798. ggml_backend_dev_get_props(dev, &props);
  4799. bool buffer_from_host_ptr_supported = props.caps.buffer_from_host_ptr;
  4800. bool is_default_buft = buft == ggml_backend_dev_buffer_type(dev);
  4801. if (ml.use_mmap && use_mmap_buffer && buffer_from_host_ptr_supported && is_default_buft) {
  4802. for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
  4803. // only the mmap region containing the tensors in the model is mapped to the backend buffer
  4804. // this is important for metal with apple silicon: if the entire model could be mapped to a metal buffer, then we could just use metal for all layers
  4805. // this allows using partial offloading when the model size exceeds the metal buffer size, but not the RAM size
  4806. void * addr = nullptr;
  4807. size_t first, last; // NOLINT
  4808. ml.get_mapping_range(&first, &last, &addr, idx, ctx);
  4809. if (first >= last) {
  4810. continue;
  4811. }
  4812. const size_t max_size = ggml_get_max_tensor_size(ctx);
  4813. ggml_backend_buffer_t buf = ggml_backend_dev_buffer_from_host_ptr(dev, (char *) addr + first, last - first, max_size);
  4814. if (buf == nullptr) {
  4815. throw std::runtime_error(format("unable to allocate %s buffer", ggml_backend_buft_name(buft)));
  4816. }
  4817. pimpl->bufs.emplace_back(buf);
  4818. buf_map.emplace(idx, buf);
  4819. }
  4820. }
  4821. else {
  4822. ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
  4823. if (buf == nullptr) {
  4824. throw std::runtime_error(format("unable to allocate %s buffer", ggml_backend_buft_name(buft)));
  4825. }
  4826. pimpl->bufs.emplace_back(buf);
  4827. if (use_mlock && ggml_backend_buffer_is_host(buf)) {
  4828. pimpl->mlock_bufs.emplace_back(new llama_mlock);
  4829. auto & mlock_buf = pimpl->mlock_bufs.back();
  4830. mlock_buf->init (ggml_backend_buffer_get_base(buf));
  4831. mlock_buf->grow_to(ggml_backend_buffer_get_size(buf));
  4832. }
  4833. for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
  4834. buf_map.emplace(idx, buf);
  4835. }
  4836. }
  4837. if (pimpl->bufs.empty()) {
  4838. throw std::runtime_error("failed to allocate buffer");
  4839. }
  4840. for (auto & buf : buf_map) {
  4841. // indicate that this buffer contains weights
  4842. // this is used by ggml_backend_sched to improve op scheduling: ops that use a weight are preferably scheduled to the backend that contains the weight
  4843. ggml_backend_buffer_set_usage(buf.second, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
  4844. }
  4845. ctx_bufs.emplace_back(ctx, buf_map);
  4846. }
  4847. if (llama_supports_gpu_offload()) {
  4848. const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
  4849. LLAMA_LOG_INFO("%s: offloading %d repeating layers to GPU\n", __func__, n_gpu);
  4850. if (n_gpu_layers > (int) hparams.n_layer) {
  4851. LLAMA_LOG_INFO("%s: offloading output layer to GPU\n", __func__);
  4852. }
  4853. const int max_backend_supported_layers = hparams.n_layer + 1;
  4854. const int max_offloadable_layers = hparams.n_layer + 1;
  4855. LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n", __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers);
  4856. }
  4857. // print memory requirements per buffer type
  4858. for (auto & buf : pimpl->bufs) {
  4859. LLAMA_LOG_INFO("%s: %12s model buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf.get()), ggml_backend_buffer_get_size(buf.get()) / 1024.0 / 1024.0);
  4860. }
  4861. // populate tensors_by_name
  4862. for (auto & ctx : pimpl->ctxs) {
  4863. for (auto * cur = ggml_get_first_tensor(ctx.get()); cur != NULL; cur = ggml_get_next_tensor(ctx.get(), cur)) {
  4864. tensors_by_name.emplace_back(ggml_get_name(cur), cur);
  4865. }
  4866. }
  4867. // load tensor data
  4868. for (auto & it : ctx_bufs) {
  4869. ggml_context * ctx = it.first;
  4870. auto & bufs = it.second;
  4871. if (!ml.load_all_data(ctx, bufs, use_mlock ? &pimpl->mlock_mmaps : NULL, params.progress_callback, params.progress_callback_user_data)) {
  4872. return false;
  4873. }
  4874. }
  4875. if (use_mmap_buffer) {
  4876. for (auto & mapping : ml.mappings) {
  4877. pimpl->mappings.emplace_back(std::move(mapping));
  4878. }
  4879. }
  4880. return true;
  4881. }
  4882. std::string llama_model::arch_name() const {
  4883. return llm_arch_name(arch);
  4884. }
  4885. std::string llama_model::type_name() const {
  4886. return llm_type_name(type);
  4887. }
  4888. std::string llama_model::desc() const {
  4889. return pimpl->desc_str;
  4890. }
  4891. size_t llama_model::size() const {
  4892. return pimpl->n_bytes;
  4893. }
  4894. size_t llama_model::n_tensors() const {
  4895. return tensors_by_name.size();
  4896. }
  4897. size_t llama_model::n_devices() const {
  4898. return devices.size();
  4899. }
  4900. uint64_t llama_model::n_elements() const {
  4901. return pimpl->n_elements;
  4902. }
  4903. void llama_model::print_info() const {
  4904. const std::string rope_scaling_type = llama_rope_scaling_type_name(hparams.rope_scaling_type_train);
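// helper: format a per-layer hyperparameter as a single value when it is constant across layers, or as a full list otherwise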
  4905. auto print_f = [](const std::function<uint32_t(uint32_t)> & f, uint32_t n) {
  4906. bool is_var = false;
  4907. std::vector<uint32_t> v;
  4908. for (uint32_t i = 0; i < n; ++i) {
  4909. v.push_back(f(i));
  4910. if (v[i] != v[0]) {
  4911. is_var = true;
  4912. }
  4913. }
  4914. std::stringstream ss;
  4915. if (is_var) {
  4916. ss << "[";
  4917. for (uint32_t i = 0; i < n; ++i) {
  4918. ss << v[i];
  4919. if (i < n - 1) {
  4920. ss << ", ";
  4921. }
  4922. }
  4923. ss << "]";
  4924. } else {
  4925. ss << v[0];
  4926. }
  4927. return ss.str();
  4928. };
  4929. // hparams
  4930. LLAMA_LOG_INFO("%s: arch = %s\n", __func__, arch_name().c_str());
  4931. LLAMA_LOG_INFO("%s: vocab_only = %d\n", __func__, hparams.vocab_only);
  4932. if (!hparams.vocab_only) {
  4933. LLAMA_LOG_INFO("%s: n_ctx_train = %u\n", __func__, hparams.n_ctx_train);
  4934. LLAMA_LOG_INFO("%s: n_embd = %u\n", __func__, hparams.n_embd);
  4935. LLAMA_LOG_INFO("%s: n_layer = %u\n", __func__, hparams.n_layer);
  4936. LLAMA_LOG_INFO("%s: n_head = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_head(il); }, hparams.n_layer).c_str());
  4937. LLAMA_LOG_INFO("%s: n_head_kv = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_head_kv(il); }, hparams.n_layer).c_str());
  4938. LLAMA_LOG_INFO("%s: n_rot = %u\n", __func__, hparams.n_rot);
  4939. LLAMA_LOG_INFO("%s: n_swa = %u\n", __func__, hparams.n_swa);
  4940. LLAMA_LOG_INFO("%s: is_swa_any = %u\n", __func__, hparams.is_swa_any());
  4941. LLAMA_LOG_INFO("%s: n_embd_head_k = %u\n", __func__, hparams.n_embd_head_k);
  4942. LLAMA_LOG_INFO("%s: n_embd_head_v = %u\n", __func__, hparams.n_embd_head_v);
  4943. LLAMA_LOG_INFO("%s: n_gqa = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_gqa(il); }, hparams.n_layer).c_str());
  4944. LLAMA_LOG_INFO("%s: n_embd_k_gqa = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_embd_k_gqa(il); }, hparams.n_layer).c_str());
  4945. LLAMA_LOG_INFO("%s: n_embd_v_gqa = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_embd_v_gqa(il); }, hparams.n_layer).c_str());
  4946. LLAMA_LOG_INFO("%s: f_norm_eps = %.1e\n", __func__, hparams.f_norm_eps);
  4947. LLAMA_LOG_INFO("%s: f_norm_rms_eps = %.1e\n", __func__, hparams.f_norm_rms_eps);
  4948. LLAMA_LOG_INFO("%s: f_clamp_kqv = %.1e\n", __func__, hparams.f_clamp_kqv);
  4949. LLAMA_LOG_INFO("%s: f_max_alibi_bias = %.1e\n", __func__, hparams.f_max_alibi_bias);
  4950. LLAMA_LOG_INFO("%s: f_logit_scale = %.1e\n", __func__, hparams.f_logit_scale);
  4951. LLAMA_LOG_INFO("%s: f_attn_scale = %.1e\n", __func__, hparams.f_attention_scale);
  4952. LLAMA_LOG_INFO("%s: n_ff = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_ff(il); }, hparams.n_layer).c_str());
  4953. LLAMA_LOG_INFO("%s: n_expert = %u\n", __func__, hparams.n_expert);
  4954. LLAMA_LOG_INFO("%s: n_expert_used = %u\n", __func__, hparams.n_expert_used);
  4955. LLAMA_LOG_INFO("%s: causal attn = %d\n", __func__, hparams.causal_attn);
  4956. LLAMA_LOG_INFO("%s: pooling type = %d\n", __func__, hparams.pooling_type);
  4957. LLAMA_LOG_INFO("%s: rope type = %d\n", __func__, hparams.rope_type);
  4958. LLAMA_LOG_INFO("%s: rope scaling = %s\n", __func__, rope_scaling_type.c_str());
  4959. LLAMA_LOG_INFO("%s: freq_base_train = %.1f\n", __func__, hparams.rope_freq_base_train);
  4960. LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train);
  4961. LLAMA_LOG_INFO("%s: n_ctx_orig_yarn = %u\n", __func__, hparams.n_ctx_orig_yarn);
  4962. LLAMA_LOG_INFO("%s: rope_finetuned = %s\n", __func__, hparams.rope_finetuned ? "yes" : "unknown");
  4963. if (!classifier_labels.empty()) {
  4964. LLAMA_LOG_INFO("%s: n_cls_out = %u\n", __func__, hparams.n_cls_out);
  4965. size_t i = 0;
  4966. for (auto label : classifier_labels) {
  4967. LLAMA_LOG_INFO("%s: cls_label[%2zu] = %s\n", __func__, i++, label.c_str());
  4968. }
  4969. }
  4970. }
  4971. if (arch == LLM_ARCH_MAMBA ||
  4972. arch == LLM_ARCH_MAMBA2 ||
  4973. arch == LLM_ARCH_JAMBA ||
  4974. arch == LLM_ARCH_FALCON_H1 ||
  4975. arch == LLM_ARCH_PLAMO2 ||
  4976. arch == LLM_ARCH_GRANITE_HYBRID ||
  4977. arch == LLM_ARCH_NEMOTRON_H) {
  4978. LLAMA_LOG_INFO("%s: ssm_d_conv = %u\n", __func__, hparams.ssm_d_conv);
  4979. LLAMA_LOG_INFO("%s: ssm_d_inner = %u\n", __func__, hparams.ssm_d_inner);
  4980. LLAMA_LOG_INFO("%s: ssm_d_state = %u\n", __func__, hparams.ssm_d_state);
  4981. LLAMA_LOG_INFO("%s: ssm_dt_rank = %u\n", __func__, hparams.ssm_dt_rank);
  4982. LLAMA_LOG_INFO("%s: ssm_n_group = %u\n", __func__, hparams.ssm_n_group);
  4983. LLAMA_LOG_INFO("%s: ssm_dt_b_c_rms = %d\n", __func__, hparams.ssm_dt_b_c_rms);
  4984. }
  4985. LLAMA_LOG_INFO("%s: model type = %s\n", __func__, type_name().c_str());
  4986. if (pimpl->n_elements >= 1e12) {
  4987. LLAMA_LOG_INFO("%s: model params = %.2f T\n", __func__, pimpl->n_elements*1e-12);
  4988. } else if (pimpl->n_elements >= 1e9) {
  4989. LLAMA_LOG_INFO("%s: model params = %.2f B\n", __func__, pimpl->n_elements*1e-9);
  4990. } else if (pimpl->n_elements >= 1e6) {
  4991. LLAMA_LOG_INFO("%s: model params = %.2f M\n", __func__, pimpl->n_elements*1e-6);
  4992. } else {
  4993. LLAMA_LOG_INFO("%s: model params = %.2f K\n", __func__, pimpl->n_elements*1e-3);
  4994. }
  4995. // general kv
  4996. LLAMA_LOG_INFO("%s: general.name = %s\n", __func__, name.c_str());
  4997. if (arch == LLM_ARCH_DEEPSEEK) {
  4998. LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
  4999. LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
  5000. LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
  5001. LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
  5002. }
  5003. if (arch == LLM_ARCH_DEEPSEEK2) {
  5004. LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
  5005. LLAMA_LOG_INFO("%s: n_lora_q = %d\n", __func__, hparams.n_lora_q);
  5006. LLAMA_LOG_INFO("%s: n_lora_kv = %d\n", __func__, hparams.n_lora_kv);
  5007. LLAMA_LOG_INFO("%s: n_embd_head_k_mla = %d\n", __func__, hparams.n_embd_head_k_mla);
  5008. LLAMA_LOG_INFO("%s: n_embd_head_v_mla = %d\n", __func__, hparams.n_embd_head_v_mla);
  5009. LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
  5010. LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
  5011. LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
  5012. LLAMA_LOG_INFO("%s: expert_weights_norm = %d\n", __func__, hparams.expert_weights_norm);
  5013. LLAMA_LOG_INFO("%s: expert_gating_func = %s\n", __func__, llama_expert_gating_func_name((llama_expert_gating_func_type) hparams.expert_gating_func));
  5014. LLAMA_LOG_INFO("%s: rope_yarn_log_mul = %.4f\n", __func__, hparams.rope_yarn_log_mul);
  5015. }
  5016. if (arch == LLM_ARCH_QWEN2MOE) {
  5017. LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
  5018. LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
  5019. }
  5020. if (arch == LLM_ARCH_QWEN3MOE || arch == LLM_ARCH_OPENAI_MOE) {
  5021. LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
  5022. }
  5023. if (arch == LLM_ARCH_MINICPM ||
  5024. arch == LLM_ARCH_GRANITE ||
  5025. arch == LLM_ARCH_GRANITE_MOE ||
  5026. arch == LLM_ARCH_GRANITE_HYBRID) {
  5027. LLAMA_LOG_INFO("%s: f_embedding_scale = %f\n", __func__, hparams.f_embedding_scale);
  5028. LLAMA_LOG_INFO("%s: f_residual_scale = %f\n", __func__, hparams.f_residual_scale);
  5029. LLAMA_LOG_INFO("%s: f_attention_scale = %f\n", __func__, hparams.f_attention_scale);
  5030. LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
  5031. }
  5032. if (arch == LLM_ARCH_BAILINGMOE) {
  5033. LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
  5034. LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
  5035. LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
  5036. LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
  5037. LLAMA_LOG_INFO("%s: expert_weights_norm = %d\n", __func__, hparams.expert_weights_norm);
  5038. }
  5039. if (arch == LLM_ARCH_SMALLTHINKER) {
  5040. LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
  5041. LLAMA_LOG_INFO("%s: expert_gating_func = %s\n", __func__, llama_expert_gating_func_name((llama_expert_gating_func_type) hparams.expert_gating_func));
  5042. }
  5043. vocab.print_info();
  5044. }
  5045. ggml_backend_dev_t llama_model::dev_layer(int il) const {
  5046. return pimpl->dev_layer.at(il).dev;
  5047. }
  5048. ggml_backend_dev_t llama_model::dev_output() const {
  5049. return pimpl->dev_output.dev;
  5050. }
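// check whether a device supports the op returned by fn() on tensors allocated with the given buffer type:
// the op is built in a throwaway no-alloc context and its sources are attached to a dummy zero-size buffer of that type before querying the backend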
  5051. template<typename F>
  5052. static bool buft_supported(ggml_backend_buffer_type_t buft, ggml_backend_dev_t dev, F & fn) {
  5053. ggml_init_params params = {
  5054. /*.mem_size =*/ ggml_tensor_overhead()*8,
  5055. /*.mem_buffer =*/ NULL,
  5056. /*.no_alloc =*/ true,
  5057. };
  5058. ggml_context_ptr ctx { ggml_init(params) };
  5059. if (!ctx) {
  5060. throw std::runtime_error(format("failed to create ggml context"));
  5061. }
  5062. ggml_backend_buffer_ptr buf { ggml_backend_buft_alloc_buffer(buft, 0) };
  5063. ggml_tensor * op_tensor = fn(ctx.get());
  5064. for (int i = 0; i < GGML_MAX_SRC; i++) {
  5065. if (op_tensor->src[i] != nullptr) {
  5066. assert(op_tensor->src[i]->buffer == nullptr);
  5067. op_tensor->src[i]->buffer = buf.get();
  5068. }
  5069. }
  5070. bool op_supported = ggml_backend_dev_supports_op(dev, op_tensor);
  5071. return op_supported;
  5072. }
  5073. template<typename F>
  5074. static ggml_backend_buffer_type_t select_buft(const buft_list_t & buft_list, const F & fn) {
  5075. for (const auto & cur : buft_list) {
  5076. ggml_backend_dev_t cur_dev = cur.first;
  5077. ggml_backend_buffer_type_t cur_buft = cur.second;
  5078. if (buft_supported(cur_buft, cur_dev, fn)) {
  5079. return cur_buft;
  5080. }
  5081. }
  5082. throw std::runtime_error(format("no suitable buffer type found"));
  5083. }
  5084. ggml_backend_buffer_type_t llama_model::select_buft(int il) const {
  5085. return ::select_buft(
  5086. *pimpl->dev_layer.at(il).buft_list,
  5087. [&](ggml_context * ctx) {
  5088. ggml_tensor * cur = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hparams.n_embd);
  5089. ggml_tensor * layer_dir = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hparams.n_embd);
  5090. return ggml_add(ctx, cur, layer_dir);
  5091. });
  5092. }
  5093. bool llama_model::has_tensor_overrides() const {
  5094. return pimpl->has_tensor_overrides;
  5095. }
  5096. const ggml_tensor * llama_model::get_tensor(const char * name) const {
  5097. auto it = std::find_if(tensors_by_name.begin(), tensors_by_name.end(),
  5098. [name](const std::pair<std::string, ggml_tensor *> & it) {
  5099. return it.first == name;
  5100. });
  5101. if (it == tensors_by_name.end()) {
  5102. return nullptr;
  5103. }
  5104. return it->second;
  5105. }
  5106. float llama_model::get_rope_freq_base (const llama_cparams & cparams, int il) const {
  5107. return hparams.is_swa(il) ? hparams.rope_freq_base_train_swa : cparams.rope_freq_base;
  5108. }
  5109. float llama_model::get_rope_freq_scale(const llama_cparams & cparams, int il) const {
  5110. return hparams.is_swa(il) ? hparams.rope_freq_scale_train_swa : cparams.rope_freq_scale;
  5111. }
  5112. ggml_tensor * llama_model::get_rope_factors(const llama_cparams & cparams, int il) const {
  5113. const uint32_t n_ctx_per_seq = cparams.n_ctx / cparams.n_seq_max;
  5114. // choose long/short freq factors based on the context size
  5115. if (layers[il].rope_freqs != nullptr) {
  5116. return layers[il].rope_freqs;
  5117. }
  5118. if (n_ctx_per_seq > hparams.n_ctx_orig_yarn) {
  5119. return layers[il].rope_long;
  5120. }
  5121. return layers[il].rope_short;
  5122. }
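// graph builder for the core llama architecture: RMS norm -> attention with RoPE (optional llama3 frequency factors) -> SwiGLU FFN, or a MoE FFN when ffn_gate_inp is present, with residual connections around both blocks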
  5123. struct llm_build_llama : public llm_graph_context {
  5124. llm_build_llama(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  5125. const int64_t n_embd_head = hparams.n_embd_head_v;
  5126. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  5127. GGML_ASSERT(n_embd_head == hparams.n_rot);
  5128. ggml_tensor * cur;
  5129. ggml_tensor * inpL;
  5130. inpL = build_inp_embd(model.tok_embd);
  5131. // inp_pos - contains the positions
  5132. ggml_tensor * inp_pos = build_inp_pos();
  5133. auto * inp_attn = build_attn_inp_kv();
  5134. const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
  5135. ggml_tensor * inp_out_ids = build_inp_out_ids();
  5136. for (int il = 0; il < n_layer; ++il) {
  5137. ggml_tensor * inpSA = inpL;
  5138. // norm
  5139. cur = build_norm(inpL,
  5140. model.layers[il].attn_norm, NULL,
  5141. LLM_NORM_RMS, il);
  5142. cb(cur, "attn_norm", il);
  5143. // self-attention
  5144. {
  5145. // rope freq factors for llama3; may return nullptr for llama2 and other models
  5146. ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
  5147. // compute Q and K and RoPE them
  5148. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  5149. cb(Qcur, "Qcur", il);
  5150. if (model.layers[il].bq) {
  5151. Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
  5152. cb(Qcur, "Qcur", il);
  5153. }
  5154. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  5155. cb(Kcur, "Kcur", il);
  5156. if (model.layers[il].bk) {
  5157. Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
  5158. cb(Kcur, "Kcur", il);
  5159. }
  5160. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  5161. cb(Vcur, "Vcur", il);
  5162. if (model.layers[il].bv) {
  5163. Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
  5164. cb(Vcur, "Vcur", il);
  5165. }
  5166. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  5167. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  5168. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  5169. Qcur = ggml_rope_ext(
  5170. ctx0, Qcur, inp_pos, rope_factors,
  5171. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  5172. ext_factor, attn_factor, beta_fast, beta_slow
  5173. );
  5174. Kcur = ggml_rope_ext(
  5175. ctx0, Kcur, inp_pos, rope_factors,
  5176. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  5177. ext_factor, attn_factor, beta_fast, beta_slow
  5178. );
  5179. cb(Qcur, "Qcur", il);
  5180. cb(Kcur, "Kcur", il);
  5181. cb(Vcur, "Vcur", il);
  5182. cur = build_attn(inp_attn,
  5183. model.layers[il].wo, model.layers[il].bo,
  5184. Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
  5185. cb(cur, "attn_out", il);
  5186. }
  5187. if (il == n_layer - 1 && inp_out_ids) {
  5188. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  5189. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  5190. }
  5191. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  5192. cb(ffn_inp, "ffn_inp", il);
  5193. // feed-forward network (non-MoE)
  5194. if (model.layers[il].ffn_gate_inp == nullptr) {
  5195. cur = build_norm(ffn_inp,
  5196. model.layers[il].ffn_norm, NULL,
  5197. LLM_NORM_RMS, il);
  5198. cb(cur, "ffn_norm", il);
  5199. cur = build_ffn(cur,
  5200. model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
  5201. model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
  5202. model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
  5203. NULL,
  5204. LLM_FFN_SILU, LLM_FFN_PAR, il);
  5205. cb(cur, "ffn_out", il);
  5206. } else {
  5207. // MoE branch
  5208. cur = build_norm(ffn_inp,
  5209. model.layers[il].ffn_norm, NULL,
  5210. LLM_NORM_RMS, il);
  5211. cb(cur, "ffn_norm", il);
  5212. cur = build_moe_ffn(cur,
  5213. model.layers[il].ffn_gate_inp,
  5214. model.layers[il].ffn_up_exps,
  5215. model.layers[il].ffn_gate_exps,
  5216. model.layers[il].ffn_down_exps,
  5217. nullptr,
  5218. n_expert, n_expert_used,
  5219. LLM_FFN_SILU, true,
  5220. false, 0.0,
  5221. LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
  5222. il);
  5223. cb(cur, "ffn_moe_out", il);
  5224. }
  5225. cur = ggml_add(ctx0, cur, ffn_inp);
  5226. cb(cur, "ffn_out", il);
  5227. cur = build_cvec(cur, il);
  5228. cb(cur, "l_out", il);
  5229. // input for next layer
  5230. inpL = cur;
  5231. }
  5232. cur = inpL;
  5233. cur = build_norm(cur,
  5234. model.output_norm, NULL,
  5235. LLM_NORM_RMS, -1);
  5236. cb(cur, "result_norm", -1);
  5237. res->t_embd = cur;
  5238. // lm_head
  5239. cur = build_lora_mm(model.output, cur);
  5240. cb(cur, "result_output", -1);
  5241. res->t_logits = cur;
  5242. ggml_build_forward_expand(gf, cur);
  5243. }
  5244. };
  5245. struct llm_build_llama_iswa : public llm_graph_context {
  5246. llm_build_llama_iswa(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  5247. const int64_t n_embd_head = hparams.n_embd_head_v;
  5248. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  5249. GGML_ASSERT(n_embd_head == hparams.n_rot);
  5250. ggml_tensor * cur;
  5251. ggml_tensor * inpL;
  5252. inpL = build_inp_embd(model.tok_embd);
  5253. // inp_pos - contains the positions
  5254. ggml_tensor * inp_pos = build_inp_pos();
  5255. // temperature tuning
  5256. ggml_tensor * inp_attn_scale = nullptr;
  5257. inp_attn_scale = build_inp_attn_scale();
  5258. auto * inp_attn = build_attn_inp_kv_iswa();
  5259. const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
  5260. ggml_tensor * inp_out_ids = build_inp_out_ids();
  5261. for (int il = 0; il < n_layer; ++il) {
  5262. ggml_tensor * inpSA = inpL;
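// every n_no_rope_layer_step-th layer skips RoPE (NoPE layers)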
  5263. const bool use_rope = (il + 1) % hparams.n_no_rope_layer_step != 0;
  5264. // norm
  5265. cur = build_norm(inpL,
  5266. model.layers[il].attn_norm, NULL,
  5267. LLM_NORM_RMS, il);
  5268. cb(cur, "attn_norm", il);
  5269. // self-attention
  5270. {
  5271. // rope freq factors for llama3; may return nullptr for llama2 and other models
  5272. ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
  5273. // compute Q and K and RoPE them
  5274. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  5275. cb(Qcur, "Qcur", il);
  5276. if (model.layers[il].bq) {
  5277. Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
  5278. cb(Qcur, "Qcur", il);
  5279. }
  5280. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  5281. cb(Kcur, "Kcur", il);
  5282. if (model.layers[il].bk) {
  5283. Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
  5284. cb(Kcur, "Kcur", il);
  5285. }
  5286. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  5287. cb(Vcur, "Vcur", il);
  5288. if (model.layers[il].bv) {
  5289. Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
  5290. cb(Vcur, "Vcur", il);
  5291. }
  5292. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  5293. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  5294. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  5295. if (use_rope) {
  5296. Qcur = ggml_rope_ext(
  5297. ctx0, Qcur, inp_pos, rope_factors,
  5298. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  5299. ext_factor, attn_factor, beta_fast, beta_slow
  5300. );
  5301. Kcur = ggml_rope_ext(
  5302. ctx0, Kcur, inp_pos, rope_factors,
  5303. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  5304. ext_factor, attn_factor, beta_fast, beta_slow
  5305. );
  5306. } else if (inp_attn_scale) {
  5307. Qcur = ggml_mul(ctx0, Qcur, inp_attn_scale);
  5308. }
  5309. cb(Qcur, "Qcur", il);
  5310. cb(Kcur, "Kcur", il);
  5311. cb(Vcur, "Vcur", il);
  5312. if (use_rope && hparams.use_kq_norm) {
  5313. // Llama4TextL2Norm
  5314. Qcur = ggml_rms_norm(ctx0, Qcur, hparams.f_norm_rms_eps);
  5315. Kcur = ggml_rms_norm(ctx0, Kcur, hparams.f_norm_rms_eps);
  5316. cb(Qcur, "Qcur_normed", il);
  5317. cb(Kcur, "Kcur_normed", il);
  5318. }
  5319. cur = build_attn(inp_attn,
  5320. model.layers[il].wo, model.layers[il].bo,
  5321. Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
  5322. cb(cur, "attn_out", il);
  5323. }
  5324. if (il == n_layer - 1 && inp_out_ids) {
  5325. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  5326. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  5327. }
  5328. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  5329. cb(ffn_inp, "ffn_inp", il);
  5330. // feed-forward network (non-MoE)
  5331. if (model.layers[il].ffn_gate_inp == nullptr) {
  5332. cur = build_norm(ffn_inp,
  5333. model.layers[il].ffn_norm, NULL,
  5334. LLM_NORM_RMS, il);
  5335. cb(cur, "ffn_norm", il);
  5336. cur = build_ffn(cur,
  5337. model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
  5338. model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
  5339. model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
  5340. NULL,
  5341. LLM_FFN_SILU, LLM_FFN_PAR, il);
  5342. cb(cur, "ffn_out", il);
  5343. } else {
  5344. ggml_tensor * ffn_inp_normed = build_norm(ffn_inp,
  5345. model.layers[il].ffn_norm, NULL,
  5346. LLM_NORM_RMS, il);
  5347. cb(cur, "ffn_norm", il);
  5348. ggml_tensor * moe_out = build_moe_ffn(ffn_inp_normed,
  5349. model.layers[il].ffn_gate_inp,
  5350. model.layers[il].ffn_up_exps,
  5351. model.layers[il].ffn_gate_exps,
  5352. model.layers[il].ffn_down_exps,
  5353. nullptr,
  5354. n_expert, n_expert_used,
  5355. LLM_FFN_SILU, false,
  5356. false, 0.0,
  5357. LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID,
  5358. il);
  5359. // Shared experts
  5360. ggml_tensor * shexp_out = build_ffn(ffn_inp_normed,
  5361. model.layers[il].ffn_up_shexp, NULL, NULL,
  5362. model.layers[il].ffn_gate_shexp, NULL, NULL,
  5363. model.layers[il].ffn_down_shexp, NULL, NULL,
  5364. NULL,
  5365. LLM_FFN_SILU, LLM_FFN_PAR, il);
  5366. cb(shexp_out, "ffn_moe_shexp", il);
  5367. cur = ggml_add(ctx0, moe_out, shexp_out);
  5368. cb(cur, "ffn_moe_out_merged", il);
  5369. }
  5370. cur = ggml_add(ctx0, cur, ffn_inp);
  5371. cb(cur, "ffn_out", il);
  5372. cur = build_cvec(cur, il);
  5373. cb(cur, "l_out", il);
  5374. // input for next layer
  5375. inpL = cur;
  5376. }
  5377. cur = inpL;
  5378. cur = build_norm(cur,
  5379. model.output_norm, NULL,
  5380. LLM_NORM_RMS, -1);
  5381. cb(cur, "result_norm", -1);
  5382. res->t_embd = cur;
  5383. // lm_head
  5384. cur = build_lora_mm(model.output, cur);
  5385. cb(cur, "result_output", -1);
  5386. res->t_logits = cur;
  5387. ggml_build_forward_expand(gf, cur);
  5388. }
  5389. };
  5390. struct llm_build_deci : public llm_graph_context {
  5391. llm_build_deci(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  5392. const int64_t n_embd_head = hparams.n_embd_head_v;
  5393. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  5394. GGML_ASSERT(n_embd_head == hparams.n_rot);
  5395. ggml_tensor * cur;
  5396. ggml_tensor * inpL;
  5397. inpL = build_inp_embd(model.tok_embd);
  5398. // inp_pos - contains the positions
  5399. ggml_tensor * inp_pos = build_inp_pos();
  5400. auto * inp_attn = build_attn_inp_kv();
  5401. const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
  5402. ggml_tensor * inp_out_ids = build_inp_out_ids();
  5403. for (int il = 0; il < n_layer; ++il) {
  5404. ggml_tensor * inpSA = inpL;
  5405. const int64_t n_head_kv = hparams.n_head_kv(il);
  5406. const int64_t n_head = hparams.n_head(il);
  5407. const int64_t n_ff = hparams.n_ff(il);
  5408. if (n_head == 0) {
  5409. // attention-free layer of Llama-3_1-Nemotron-51B
  5410. cur = inpL;
  5411. } else {
  5412. // norm
  5413. cur = build_norm(inpL,
  5414. model.layers[il].attn_norm, NULL,
  5415. LLM_NORM_RMS, il);
  5416. cb(cur, "attn_norm", il);
  5417. }
  5418. if (n_head > 0 && n_head_kv == 0) {
  5419. // "linear attention" of Llama-3_1-Nemotron-51B
  5420. cur = build_lora_mm(model.layers[il].wo, cur);
  5421. cb(cur, "wo", il);
  5422. } else if (n_head > 0) {
  5423. // self-attention
  5424. // rope freq factors for llama3; may return nullptr for llama2 and other models
  5425. ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
  5426. // compute Q and K and RoPE them
  5427. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  5428. cb(Qcur, "Qcur", il);
  5429. if (model.layers[il].bq) {
  5430. Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
  5431. cb(Qcur, "Qcur", il);
  5432. }
  5433. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  5434. cb(Kcur, "Kcur", il);
  5435. if (model.layers[il].bk) {
  5436. Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
  5437. cb(Kcur, "Kcur", il);
  5438. }
  5439. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  5440. cb(Vcur, "Vcur", il);
  5441. if (model.layers[il].bv) {
  5442. Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
  5443. cb(Vcur, "Vcur", il);
  5444. }
  5445. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  5446. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  5447. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  5448. Qcur = ggml_rope_ext(
  5449. ctx0, Qcur, inp_pos, rope_factors,
  5450. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  5451. ext_factor, attn_factor, beta_fast, beta_slow
  5452. );
  5453. Kcur = ggml_rope_ext(
  5454. ctx0, Kcur, inp_pos, rope_factors,
  5455. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  5456. ext_factor, attn_factor, beta_fast, beta_slow
  5457. );
  5458. cb(Qcur, "Qcur", il);
  5459. cb(Kcur, "Kcur", il);
  5460. cb(Vcur, "Vcur", il);
  5461. cur = build_attn(inp_attn,
  5462. model.layers[il].wo, model.layers[il].bo,
  5463. Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
  5464. }
  5465. if (il == n_layer - 1 && inp_out_ids) {
  5466. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  5467. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  5468. }
  5469. // FFN-free layer of Llama-3_1-Nemotron-Ultra-253B
  5470. if (n_ff == 0) {
  5471. continue;
  5472. }
  5473. // modified to support attention-free layer of Llama-3_1-Nemotron-51B
  5474. ggml_tensor * ffn_inp = cur;
  5475. if (n_head > 0) {
  5476. ffn_inp = ggml_add(ctx0, cur, inpSA);
  5477. cb(ffn_inp, "ffn_inp", il);
  5478. }
  5479. // feed-forward network
  5480. if (model.layers[il].ffn_gate_inp == nullptr) {
  5481. cur = build_norm(ffn_inp,
  5482. model.layers[il].ffn_norm, NULL,
  5483. LLM_NORM_RMS, il);
  5484. cb(cur, "ffn_norm", il);
  5485. cur = build_ffn(cur,
  5486. model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
  5487. model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
  5488. model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
  5489. NULL,
  5490. LLM_FFN_SILU, LLM_FFN_PAR, il);
  5491. cb(cur, "ffn_out", il);
  5492. }
  5493. cur = ggml_add(ctx0, cur, ffn_inp);
  5494. cb(cur, "ffn_out", il);
  5495. cur = build_cvec(cur, il);
  5496. cb(cur, "l_out", il);
  5497. // input for next layer
  5498. inpL = cur;
  5499. }
  5500. cur = inpL;
  5501. cur = build_norm(cur,
  5502. model.output_norm, NULL,
  5503. LLM_NORM_RMS, -1);
  5504. cb(cur, "result_norm", -1);
  5505. res->t_embd = cur;
  5506. // lm_head
  5507. cur = build_lora_mm(model.output, cur);
  5508. cb(cur, "result_output", -1);
  5509. res->t_logits = cur;
  5510. ggml_build_forward_expand(gf, cur);
  5511. }
  5512. };
  5513. struct llm_build_baichuan : public llm_graph_context {
  5514. llm_build_baichuan(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  5515. const int64_t n_embd_head = hparams.n_embd_head_v;
  5516. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  5517. GGML_ASSERT(n_embd_head == hparams.n_rot);
  5518. ggml_tensor * cur;
  5519. ggml_tensor * inpL;
  5520. inpL = build_inp_embd(model.tok_embd);
  5521. // inp_pos - contains the positions
  5522. ggml_tensor * inp_pos = model.type == LLM_TYPE_7B ? build_inp_pos() : nullptr;
  5523. auto * inp_attn = build_attn_inp_kv();
  5524. ggml_tensor * inp_out_ids = build_inp_out_ids();
  5525. for (int il = 0; il < n_layer; ++il) {
  5526. ggml_tensor * inpSA = inpL;
  5527. cur = build_norm(inpL,
  5528. model.layers[il].attn_norm, NULL,
  5529. LLM_NORM_RMS, il);
  5530. cb(cur, "attn_norm", il);
  5531. // self-attention
  5532. {
  5533. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  5534. cb(Qcur, "Qcur", il);
  5535. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  5536. cb(Kcur, "Kcur", il);
  5537. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  5538. cb(Vcur, "Vcur", il);
  5539. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  5540. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  5541. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
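// only the 7B variant applies RoPE here; the 13B variant skips it (its
// positional information presumably comes from ALiBi-style attention
// biases), which is also why inp_pos is built only for the 7B model above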
  5542. switch (model.type) {
  5543. case LLM_TYPE_7B:
  5544. Qcur = ggml_rope_ext(
  5545. ctx0, Qcur, inp_pos, nullptr,
  5546. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  5547. ext_factor, attn_factor, beta_fast, beta_slow
  5548. );
  5549. Kcur = ggml_rope_ext(
  5550. ctx0, Kcur, inp_pos, nullptr,
  5551. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  5552. ext_factor, attn_factor, beta_fast, beta_slow
  5553. );
  5554. break;
  5555. case LLM_TYPE_13B:
  5556. break;
  5557. default:
  5558. GGML_ABORT("fatal error");
  5559. }
  5560. cb(Qcur, "Qcur", il);
  5561. cb(Kcur, "Kcur", il);
  5562. cb(Vcur, "Vcur", il);
  5563. cur = build_attn(inp_attn,
  5564. model.layers[il].wo, NULL,
  5565. Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  5566. }
  5567. if (il == n_layer - 1 && inp_out_ids) {
  5568. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  5569. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  5570. }
  5571. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  5572. cb(ffn_inp, "ffn_inp", il);
  5573. // feed-forward network
  5574. {
  5575. cur = build_norm(ffn_inp,
  5576. model.layers[il].ffn_norm, NULL,
  5577. LLM_NORM_RMS, il);
  5578. cb(cur, "ffn_norm", il);
  5579. cur = build_ffn(cur,
  5580. model.layers[il].ffn_up, NULL, NULL,
  5581. model.layers[il].ffn_gate, NULL, NULL,
  5582. model.layers[il].ffn_down, NULL, NULL,
  5583. NULL,
  5584. LLM_FFN_SILU, LLM_FFN_PAR, il);
  5585. cb(cur, "ffn_out", il);
  5586. }
  5587. cur = ggml_add(ctx0, cur, ffn_inp);
  5588. cur = build_cvec(cur, il);
  5589. cb(cur, "l_out", il);
  5590. // input for next layer
  5591. inpL = cur;
  5592. }
  5593. cur = inpL;
  5594. cur = build_norm(cur,
  5595. model.output_norm, NULL,
  5596. LLM_NORM_RMS, -1);
  5597. cb(cur, "result_norm", -1);
  5598. res->t_embd = cur;
  5599. // lm_head
  5600. cur = build_lora_mm(model.output, cur);
  5601. cb(cur, "result_output", -1);
  5602. res->t_logits = cur;
  5603. ggml_build_forward_expand(gf, cur);
  5604. }
  5605. };
  5606. struct llm_build_xverse : public llm_graph_context {
  5607. llm_build_xverse(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  5608. const int64_t n_embd_head = hparams.n_embd_head_v;
  5609. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  5610. GGML_ASSERT(n_embd_head == hparams.n_rot);
  5611. ggml_tensor * cur;
  5612. ggml_tensor * inpL;
  5613. inpL = build_inp_embd(model.tok_embd);
  5614. // inp_pos - contains the positions
  5615. ggml_tensor * inp_pos = build_inp_pos();
  5616. auto * inp_attn = build_attn_inp_kv();
  5617. ggml_tensor * inp_out_ids = build_inp_out_ids();
  5618. for (int il = 0; il < n_layer; ++il) {
  5619. ggml_tensor * inpSA = inpL;
  5620. cur = build_norm(inpL,
  5621. model.layers[il].attn_norm, NULL,
  5622. LLM_NORM_RMS, il);
  5623. cb(cur, "attn_norm", il);
  5624. // self-attention
  5625. {
  5626. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  5627. cb(Qcur, "Qcur", il);
  5628. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  5629. cb(Kcur, "Kcur", il);
  5630. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  5631. cb(Vcur, "Vcur", il);
  5632. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  5633. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  5634. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  5635. Qcur = ggml_rope_ext(
  5636. ctx0, Qcur, inp_pos, nullptr,
  5637. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  5638. ext_factor, attn_factor, beta_fast, beta_slow
  5639. );
  5640. Kcur = ggml_rope_ext(
  5641. ctx0, Kcur, inp_pos, nullptr,
  5642. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  5643. ext_factor, attn_factor, beta_fast, beta_slow
  5644. );
  5645. cb(Qcur, "Qcur", il);
  5646. cb(Kcur, "Kcur", il);
  5647. cb(Vcur, "Vcur", il);
  5648. cur = build_attn(inp_attn,
  5649. model.layers[il].wo, NULL,
  5650. Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  5651. }
  5652. if (il == n_layer - 1 && inp_out_ids) {
  5653. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  5654. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  5655. }
  5656. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  5657. cb(ffn_inp, "ffn_inp", il);
  5658. // feed-forward network
  5659. {
  5660. cur = build_norm(ffn_inp,
  5661. model.layers[il].ffn_norm, NULL,
  5662. LLM_NORM_RMS, il);
  5663. cb(cur, "ffn_norm", il);
  5664. cur = build_ffn(cur,
  5665. model.layers[il].ffn_up, NULL, NULL,
  5666. model.layers[il].ffn_gate, NULL, NULL,
  5667. model.layers[il].ffn_down, NULL, NULL,
  5668. NULL,
  5669. LLM_FFN_SILU, LLM_FFN_PAR, il);
  5670. cb(cur, "ffn_out", il);
  5671. }
  5672. cur = ggml_add(ctx0, cur, ffn_inp);
  5673. cur = build_cvec(cur, il);
  5674. cb(cur, "l_out", il);
  5675. // input for next layer
  5676. inpL = cur;
  5677. }
  5678. cur = inpL;
  5679. cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
  5680. cb(cur, "result_norm", -1);
  5681. res->t_embd = cur;
  5682. // lm_head
  5683. cur = build_lora_mm(model.output, cur);
  5684. cb(cur, "result_output", -1);
  5685. res->t_logits = cur;
  5686. ggml_build_forward_expand(gf, cur);
  5687. }
  5688. };
  5689. struct llm_build_falcon : public llm_graph_context {
  5690. llm_build_falcon(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  5691. const int64_t n_embd_head = hparams.n_embd_head_v;
  5692. const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
  5693. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  5694. GGML_ASSERT(n_embd_head == hparams.n_rot);
  5695. ggml_tensor * cur;
  5696. ggml_tensor * inpL;
  5697. inpL = build_inp_embd(model.tok_embd);
  5698. // inp_pos - contains the positions
  5699. ggml_tensor * inp_pos = build_inp_pos();
  5700. auto * inp_attn = build_attn_inp_kv();
  5701. ggml_tensor * inp_out_ids = build_inp_out_ids();
  5702. for (int il = 0; il < n_layer; ++il) {
  5703. ggml_tensor * attn_norm;
  5704. attn_norm = build_norm(inpL,
  5705. model.layers[il].attn_norm,
  5706. model.layers[il].attn_norm_b,
  5707. LLM_NORM, il);
  5708. cb(attn_norm, "attn_norm", il);
  5709. // self-attention
  5710. {
  5711. if (model.layers[il].attn_norm_2) {
  5712. // Falcon-40B
  5713. cur = build_norm(inpL,
  5714. model.layers[il].attn_norm_2,
  5715. model.layers[il].attn_norm_2_b,
  5716. LLM_NORM, il);
  5717. cb(cur, "attn_norm_2", il);
  5718. } else {
  5719. cur = attn_norm;
  5720. }
  5721. cur = build_lora_mm(model.layers[il].wqkv, cur);
  5722. cb(cur, "wqkv", il);
  5723. ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
  5724. ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
  5725. ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
5726. // NeoX-style RoPE (selected via rope_type)
  5727. Qcur = ggml_rope_ext(
  5728. ctx0, Qcur, inp_pos, nullptr,
  5729. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  5730. ext_factor, attn_factor, beta_fast, beta_slow
  5731. );
  5732. Kcur = ggml_rope_ext(
  5733. ctx0, Kcur, inp_pos, nullptr,
  5734. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  5735. ext_factor, attn_factor, beta_fast, beta_slow
  5736. );
  5737. cb(Qcur, "Qcur", il);
  5738. cb(Kcur, "Kcur", il);
  5739. cb(Vcur, "Vcur", il);
  5740. cur = build_attn(inp_attn,
  5741. model.layers[il].wo, NULL,
  5742. Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  5743. }
  5744. if (il == n_layer - 1 && inp_out_ids) {
  5745. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  5746. inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
  5747. attn_norm = ggml_get_rows(ctx0, attn_norm, inp_out_ids);
  5748. }
  5749. ggml_tensor * ffn_inp = cur;
  5750. // feed forward
  5751. {
  5752. cur = build_ffn(attn_norm, // !! use the attn norm, not the result
  5753. model.layers[il].ffn_up, NULL, NULL,
  5754. NULL, NULL, NULL,
  5755. model.layers[il].ffn_down, NULL, NULL,
  5756. NULL,
  5757. LLM_FFN_GELU, LLM_FFN_SEQ, il);
  5758. cb(cur, "ffn_out", il);
  5759. }
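// Falcon uses a parallel attention/MLP block: the MLP reads the attention
// norm output rather than the attention result, and both the attention
// output (ffn_inp) and the layer input (inpL) are added back below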
  5760. cur = ggml_add(ctx0, cur, ffn_inp);
  5761. cur = ggml_add(ctx0, cur, inpL);
  5762. cur = build_cvec(cur, il);
  5763. cb(cur, "l_out", il);
  5764. // input for next layer
  5765. inpL = cur;
  5766. }
  5767. cur = inpL;
  5768. // norm
  5769. cur = build_norm(cur,
  5770. model.output_norm,
  5771. model.output_norm_b,
  5772. LLM_NORM, -1);
  5773. cb(cur, "result_norm", -1);
  5774. res->t_embd = cur;
  5775. cur = build_lora_mm(model.output, cur);
  5776. cb(cur, "result_output", -1);
  5777. res->t_logits = cur;
  5778. ggml_build_forward_expand(gf, cur);
  5779. }
  5780. };
  5781. struct llm_build_grok : public llm_graph_context {
  5782. llm_build_grok(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  5783. const int64_t n_embd_head = hparams.n_embd_head_v;
  5784. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  5785. GGML_ASSERT(n_embd_head == hparams.n_rot);
  5786. ggml_tensor * cur;
  5787. ggml_tensor * inpL;
  5788. inpL = build_inp_embd(model.tok_embd);
  5789. // multiply by embedding_multiplier_scale of 78.38367176906169
  5790. inpL = ggml_scale(ctx0, inpL, 78.38367176906169f);
  5791. // inp_pos - contains the positions
  5792. ggml_tensor * inp_pos = build_inp_pos();
  5793. auto * inp_attn = build_attn_inp_kv();
  5794. ggml_tensor * inp_out_ids = build_inp_out_ids();
  5795. for (int il = 0; il < n_layer; ++il) {
  5796. ggml_tensor * inpSA = inpL;
  5797. // norm
  5798. cur = build_norm(inpL,
  5799. model.layers[il].attn_norm, NULL,
  5800. LLM_NORM_RMS, il);
  5801. cb(cur, "attn_norm", il);
  5802. // self-attention
  5803. {
  5804. // compute Q and K and RoPE them
  5805. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  5806. cb(Qcur, "Qcur", il);
  5807. if (model.layers[il].bq) {
  5808. Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
  5809. cb(Qcur, "Qcur", il);
  5810. }
  5811. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  5812. cb(Kcur, "Kcur", il);
  5813. if (model.layers[il].bk) {
  5814. Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
  5815. cb(Kcur, "Kcur", il);
  5816. }
  5817. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  5818. cb(Vcur, "Vcur", il);
  5819. if (model.layers[il].bv) {
  5820. Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
  5821. cb(Vcur, "Vcur", il);
  5822. }
  5823. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  5824. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  5825. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  5826. Qcur = ggml_rope_ext(
  5827. ctx0, Qcur, inp_pos, nullptr,
  5828. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  5829. ext_factor, attn_factor, beta_fast, beta_slow
  5830. );
  5831. Kcur = ggml_rope_ext(
  5832. ctx0, Kcur, inp_pos, nullptr,
  5833. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  5834. ext_factor, attn_factor, beta_fast, beta_slow
  5835. );
  5836. cb(Qcur, "Qcur", il);
  5837. cb(Kcur, "Kcur", il);
  5838. cb(Vcur, "Vcur", il);
  5839. cur = build_attn(inp_attn,
  5840. model.layers[il].wo, model.layers[il].bo,
  5841. Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f, il);
  5842. }
  5843. if (il == n_layer - 1 && inp_out_ids) {
  5844. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  5845. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  5846. }
  5847. // Grok
  5848. // if attn_out_norm is present then apply it before adding the input
  5849. if (model.layers[il].attn_out_norm) {
  5850. cur = build_norm(cur,
  5851. model.layers[il].attn_out_norm, NULL,
  5852. LLM_NORM_RMS, il);
  5853. cb(cur, "attn_out_norm", il);
  5854. }
  5855. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  5856. cb(ffn_inp, "ffn_inp", il);
  5857. // feed-forward network
  5858. // MoE branch
  5859. cur = build_norm(ffn_inp,
  5860. model.layers[il].ffn_norm, NULL,
  5861. LLM_NORM_RMS, il);
  5862. cb(cur, "ffn_norm", il);
  5863. cur = build_moe_ffn(cur,
  5864. model.layers[il].ffn_gate_inp,
  5865. model.layers[il].ffn_up_exps,
  5866. model.layers[il].ffn_gate_exps,
  5867. model.layers[il].ffn_down_exps,
  5868. nullptr,
  5869. n_expert, n_expert_used,
  5870. LLM_FFN_GELU, true,
  5871. false, 0.0,
  5872. LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
  5873. il);
  5874. cb(cur, "ffn_moe_out", il);
  5875. // Grok
  5876. // if layer_out_norm is present then apply it before adding the input
  5877. // Idea: maybe ffn_out_norm is a better name
  5878. if (model.layers[il].layer_out_norm) {
  5879. cur = build_norm(cur,
  5880. model.layers[il].layer_out_norm, NULL,
  5881. LLM_NORM_RMS, il);
  5882. cb(cur, "layer_out_norm", il);
  5883. }
  5884. cur = ggml_add(ctx0, cur, ffn_inp);
  5885. cb(cur, "ffn_out", il);
  5886. cur = build_cvec(cur, il);
  5887. cb(cur, "l_out", il);
  5888. // input for next layer
  5889. inpL = cur;
  5890. }
  5891. cur = inpL;
  5892. cur = build_norm(cur,
  5893. model.output_norm, NULL,
  5894. LLM_NORM_RMS, -1);
  5895. cb(cur, "result_norm", -1);
  5896. res->t_embd = cur;
  5897. // lm_head
  5898. cur = build_lora_mm(model.output, cur);
  5899. // Grok
  5900. // multiply logits by output_multiplier_scale of 0.5773502691896257
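// (0.5773502691896257 ~= 1/sqrt(3))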
  5901. cur = ggml_scale(ctx0, cur, 0.5773502691896257f);
  5902. cb(cur, "result_output", -1);
  5903. res->t_logits = cur;
  5904. ggml_build_forward_expand(gf, cur);
  5905. }
  5906. };
  5907. struct llm_build_dbrx : public llm_graph_context {
  5908. llm_build_dbrx(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  5909. const int64_t n_embd_head = hparams.n_embd_head_v;
  5910. const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
  5911. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  5912. GGML_ASSERT(n_embd_head == hparams.n_rot);
  5913. ggml_tensor * cur;
  5914. ggml_tensor * inpL;
  5915. inpL = build_inp_embd(model.tok_embd);
  5916. // inp_pos - contains the positions
  5917. ggml_tensor * inp_pos = build_inp_pos();
  5918. auto * inp_attn = build_attn_inp_kv();
  5919. ggml_tensor * inp_out_ids = build_inp_out_ids();
  5920. for (int il = 0; il < n_layer; ++il) {
  5921. ggml_tensor * inpSA = inpL;
  5922. // norm
  5923. cur = build_norm(inpL,
  5924. model.layers[il].attn_norm, NULL,
  5925. LLM_NORM, il);
  5926. cb(cur, "attn_norm", il);
  5927. // self-attention
  5928. {
  5929. ggml_tensor * Qcur = nullptr;
  5930. ggml_tensor * Kcur = nullptr;
  5931. ggml_tensor * Vcur = nullptr;
  5932. cur = build_lora_mm(model.layers[il].wqkv, cur);
  5933. cb(cur, "wqkv", il);
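// clamp the fused QKV activations to +/- hparams.f_clamp_kqv before
// splitting them into Q/K/V views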
  5934. cur = ggml_clamp(ctx0, cur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
  5935. cb(cur, "wqkv_clamped", il);
  5936. Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
  5937. Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
  5938. Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
  5939. Qcur = ggml_rope_ext(
  5940. ctx0, Qcur, inp_pos, nullptr,
  5941. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  5942. ext_factor, attn_factor, beta_fast, beta_slow
  5943. );
  5944. Kcur = ggml_rope_ext(
  5945. ctx0, Kcur, inp_pos, nullptr,
  5946. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  5947. ext_factor, attn_factor, beta_fast, beta_slow
  5948. );
  5949. cb(Qcur, "Qcur", il);
  5950. cb(Kcur, "Kcur", il);
  5951. cb(Vcur, "Vcur", il);
  5952. cur = build_attn(inp_attn,
  5953. model.layers[il].wo, NULL,
  5954. Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  5955. }
  5956. if (il == n_layer - 1 && inp_out_ids) {
  5957. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  5958. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  5959. }
  5960. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  5961. cb(ffn_inp, "ffn_inp", il);
  5962. // feed-forward network
  5963. // MoE branch
  5964. cur = build_norm(ffn_inp,
  5965. model.layers[il].attn_out_norm, NULL,
  5966. LLM_NORM, il);
  5967. cb(cur, "attn_out_norm", il);
  5968. cur = build_moe_ffn(cur,
  5969. model.layers[il].ffn_gate_inp,
  5970. model.layers[il].ffn_up_exps,
  5971. model.layers[il].ffn_gate_exps,
  5972. model.layers[il].ffn_down_exps,
  5973. nullptr,
  5974. n_expert, n_expert_used,
  5975. LLM_FFN_SILU, true,
  5976. false, 0.0,
  5977. LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
  5978. il);
  5979. cb(cur, "ffn_moe_out", il);
  5980. cur = ggml_add(ctx0, cur, ffn_inp);
  5981. cb(cur, "ffn_out", il);
  5982. cur = build_cvec(cur, il);
  5983. cb(cur, "l_out", il);
  5984. // input for next layer
  5985. inpL = cur;
  5986. }
  5987. cur = inpL;
  5988. cur = build_norm(cur,
  5989. model.output_norm, NULL,
  5990. LLM_NORM, -1);
  5991. cb(cur, "result_norm", -1);
  5992. res->t_embd = cur;
  5993. // lm_head
  5994. cur = build_lora_mm(model.output, cur);
  5995. cb(cur, "result_output", -1);
  5996. res->t_logits = cur;
  5997. ggml_build_forward_expand(gf, cur);
  5998. }
  5999. };
  6000. struct llm_build_starcoder : public llm_graph_context {
  6001. llm_build_starcoder(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  6002. const int64_t n_embd_head = hparams.n_embd_head_v;
  6003. const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
  6004. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  6005. ggml_tensor * cur;
  6006. ggml_tensor * inpL;
  6007. inpL = build_inp_embd(model.tok_embd);
  6008. // inp_pos - contains the positions
  6009. ggml_tensor * inp_pos = build_inp_pos();
  6010. auto * inp_attn = build_attn_inp_kv();
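// StarCoder uses learned absolute position embeddings: look up the rows for
// the current positions and add them to the token embeddings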
  6011. ggml_tensor * pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos);
  6012. cb(pos, "pos_embd", -1);
  6013. inpL = ggml_add(ctx0, inpL, pos);
  6014. cb(inpL, "inpL", -1);
  6015. ggml_tensor * inp_out_ids = build_inp_out_ids();
  6016. for (int il = 0; il < n_layer; ++il) {
  6017. cur = build_norm(inpL,
  6018. model.layers[il].attn_norm,
  6019. model.layers[il].attn_norm_b,
  6020. LLM_NORM, il);
  6021. cb(cur, "attn_norm", il);
  6022. // self-attention
  6023. {
  6024. cur = build_lora_mm(model.layers[il].wqkv, cur);
  6025. cb(cur, "wqkv", il);
  6026. cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
  6027. cb(cur, "bqkv", il);
  6028. ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
  6029. ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
  6030. ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
  6031. cb(Qcur, "Qcur", il);
  6032. cb(Kcur, "Kcur", il);
  6033. cb(Vcur, "Vcur", il);
  6034. cur = build_attn(inp_attn,
  6035. model.layers[il].wo, model.layers[il].bo,
  6036. Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  6037. }
  6038. if (il == n_layer - 1 && inp_out_ids) {
  6039. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  6040. inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
  6041. }
  6042. // add the input
  6043. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
  6044. cb(ffn_inp, "ffn_inp", il);
  6045. // FF
  6046. {
  6047. cur = build_norm(ffn_inp,
  6048. model.layers[il].ffn_norm,
  6049. model.layers[il].ffn_norm_b,
  6050. LLM_NORM, il);
  6051. cb(cur, "ffn_norm", il);
  6052. cur = build_ffn(cur,
  6053. model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
  6054. NULL, NULL, NULL,
  6055. model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
  6056. NULL,
  6057. LLM_FFN_GELU, LLM_FFN_SEQ, il);
  6058. cb(cur, "ffn_out", il);
  6059. }
  6060. cur = ggml_add(ctx0, cur, ffn_inp);
  6061. cur = build_cvec(cur, il);
  6062. cb(cur, "l_out", il);
  6063. // input for next layer
  6064. inpL = cur;
  6065. }
  6066. cur = build_norm(inpL,
  6067. model.output_norm,
  6068. model.output_norm_b,
  6069. LLM_NORM, -1);
  6070. cb(cur, "result_norm", -1);
  6071. res->t_embd = cur;
  6072. cur = build_lora_mm(model.output, cur);
  6073. cb(cur, "result_output", -1);
  6074. res->t_logits = cur;
  6075. ggml_build_forward_expand(gf, cur);
  6076. }
  6077. };
  6078. struct llm_build_refact : public llm_graph_context {
  6079. llm_build_refact(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  6080. const int64_t n_embd_head = hparams.n_embd_head_v;
  6081. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  6082. ggml_tensor * cur;
  6083. ggml_tensor * inpL;
  6084. inpL = build_inp_embd(model.tok_embd);
  6085. auto * inp_attn = build_attn_inp_kv();
  6086. ggml_tensor * inp_out_ids = build_inp_out_ids();
  6087. for (int il = 0; il < n_layer; ++il) {
  6088. ggml_tensor * inpSA = inpL;
  6089. cur = build_norm(inpL,
  6090. model.layers[il].attn_norm, NULL,
  6091. LLM_NORM_RMS, il);
  6092. cb(cur, "attn_norm", il);
  6093. // self-attention
  6094. {
  6095. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  6096. cb(Qcur, "Qcur", il);
  6097. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  6098. cb(Kcur, "Kcur", il);
  6099. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  6100. cb(Vcur, "Vcur", il);
  6101. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  6102. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  6103. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  6104. cb(Qcur, "Qcur", il);
  6105. cb(Kcur, "Kcur", il);
  6106. cb(Vcur, "Vcur", il);
  6107. cur = build_attn(inp_attn,
  6108. model.layers[il].wo, NULL,
  6109. Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  6110. }
  6111. if (il == n_layer - 1 && inp_out_ids) {
  6112. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  6113. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  6114. }
  6115. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  6116. cb(ffn_inp, "ffn_inp", il);
  6117. // feed-forward network
  6118. {
  6119. cur = build_norm(ffn_inp,
  6120. model.layers[il].ffn_norm, NULL,
  6121. LLM_NORM_RMS, il);
  6122. cb(cur, "ffn_norm", il);
  6123. cur = build_ffn(cur,
  6124. model.layers[il].ffn_up, NULL, NULL,
  6125. model.layers[il].ffn_gate, NULL, NULL,
  6126. model.layers[il].ffn_down, NULL, NULL,
  6127. NULL,
  6128. LLM_FFN_SILU, LLM_FFN_PAR, il);
  6129. cb(cur, "ffn_out", il);
  6130. }
  6131. cur = ggml_add(ctx0, cur, ffn_inp);
  6132. cur = build_cvec(cur, il);
  6133. cb(cur, "l_out", il);
  6134. // input for next layer
  6135. inpL = cur;
  6136. }
  6137. cur = inpL;
  6138. cur = build_norm(cur,
  6139. model.output_norm, NULL,
  6140. LLM_NORM_RMS, -1);
  6141. cb(cur, "result_norm", -1);
  6142. res->t_embd = cur;
  6143. // lm_head
  6144. cur = build_lora_mm(model.output, cur);
  6145. cb(cur, "result_output", -1);
  6146. res->t_logits = cur;
  6147. ggml_build_forward_expand(gf, cur);
  6148. }
  6149. };
  6150. struct llm_build_bert : public llm_graph_context {
  6151. llm_build_bert(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  6152. const int64_t n_embd_head = hparams.n_embd_head_v;
  6153. const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
  6154. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  6155. ggml_tensor * cur;
  6156. ggml_tensor * inpL;
  6157. ggml_tensor * inp_pos = nullptr;
  6158. if (model.arch != LLM_ARCH_JINA_BERT_V2) {
  6159. inp_pos = build_inp_pos();
  6160. }
  6161. // construct input embeddings (token, type, position)
  6162. inpL = build_inp_embd(model.tok_embd);
  6163. // token types are hardcoded to zero ("Sentence A")
  6164. if (model.type_embd) {
  6165. ggml_tensor * type_row0 = ggml_view_1d(ctx0, model.type_embd, n_embd, 0);
  6166. inpL = ggml_add(ctx0, inpL, type_row0);
  6167. }
  6168. if (model.arch == LLM_ARCH_BERT) {
  6169. inpL = ggml_add(ctx0, ggml_get_rows(ctx0, model.pos_embd, inp_pos), inpL);
  6170. }
  6171. cb(inpL, "inp_embd", -1);
  6172. // embed layer norm
  6173. inpL = build_norm(inpL, model.tok_norm, model.tok_norm_b, LLM_NORM, -1);
  6174. cb(inpL, "inp_norm", -1);
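// encoder-style model: attention is non-causal over the whole batch and no
// KV cache is used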
  6175. auto * inp_attn = build_attn_inp_no_cache();
  6176. ggml_tensor * inp_out_ids = build_inp_out_ids();
  6177. for (int il = 0; il < n_layer; ++il) {
  6178. ggml_tensor * cur = inpL;
  6179. {
  6180. ggml_tensor * Qcur;
  6181. ggml_tensor * Kcur;
  6182. ggml_tensor * Vcur;
  6183. // self-attention
  6184. if (model.layers[il].wqkv) {
  6185. cur = build_lora_mm(model.layers[il].wqkv, cur);
  6186. cb(cur, "wqkv", il);
  6187. if (model.layers[il].bqkv) {
  6188. cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
  6189. cb(cur, "bqkv", il);
  6190. }
  6191. Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
  6192. Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
  6193. Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
  6194. } else {
  6195. Qcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wq, cur), model.layers[il].bq);
  6196. Kcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wk, cur), model.layers[il].bk);
  6197. Vcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wv, cur), model.layers[il].bv);
  6198. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  6199. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  6200. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  6201. }
  6202. if (model.layers[il].attn_q_norm) {
  6203. Qcur = build_norm(Qcur,
  6204. model.layers[il].attn_q_norm,
  6205. model.layers[il].attn_q_norm_b,
  6206. LLM_NORM, il);
  6207. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  6208. }
  6209. if (model.layers[il].attn_k_norm) {
  6210. Kcur = build_norm(Kcur,
  6211. model.layers[il].attn_k_norm,
  6212. model.layers[il].attn_k_norm_b,
  6213. LLM_NORM, il);
  6214. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  6215. }
  6216. // RoPE
  6217. if (model.arch == LLM_ARCH_NOMIC_BERT || model.arch == LLM_ARCH_NOMIC_BERT_MOE || model.arch == LLM_ARCH_JINA_BERT_V3) {
  6218. Qcur = ggml_rope_ext(
  6219. ctx0, Qcur, inp_pos, nullptr,
  6220. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  6221. ext_factor, attn_factor, beta_fast, beta_slow
  6222. );
  6223. Kcur = ggml_rope_ext(
  6224. ctx0, Kcur, inp_pos, nullptr,
  6225. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  6226. ext_factor, attn_factor, beta_fast, beta_slow
  6227. );
  6228. }
  6229. cb(Qcur, "Qcur", il);
  6230. cb(Kcur, "Kcur", il);
  6231. cb(Vcur, "Vcur", il);
  6232. cur = build_attn(inp_attn,
  6233. model.layers[il].wo, model.layers[il].bo,
  6234. Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  6235. cb(cur, "kqv_out", il);
  6236. }
  6237. if (il == n_layer - 1 && inp_out_ids) {
  6238. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  6239. inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
  6240. }
  6241. // re-add the layer input
  6242. cur = ggml_add(ctx0, cur, inpL);
  6243. // attention layer norm
  6244. cur = build_norm(cur, model.layers[il].attn_out_norm, model.layers[il].attn_out_norm_b, LLM_NORM, il);
  6245. if (model.layers[il].attn_norm_2 != nullptr) {
  6246. cur = ggml_add(ctx0, cur, inpL); // re-add the layer input
  6247. cur = build_norm(cur, model.layers[il].attn_norm_2, model.layers[il].attn_norm_2_b, LLM_NORM, il);
  6248. }
  6249. ggml_tensor * ffn_inp = cur;
  6250. cb(ffn_inp, "ffn_inp", il);
  6251. // feed-forward network
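// every hparams.moe_every_n_layers-th layer (offset 1) uses the MoE FFN;
// the remaining layers fall through to the dense FFN branches below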
  6252. if (hparams.moe_every_n_layers > 0 && il % hparams.moe_every_n_layers == 1) {
  6253. // MoE branch
  6254. cur = build_moe_ffn(cur,
  6255. model.layers[il].ffn_gate_inp,
  6256. model.layers[il].ffn_up_exps,
  6257. nullptr,
  6258. model.layers[il].ffn_down_exps,
  6259. nullptr,
  6260. hparams.n_expert,
  6261. hparams.n_expert_used,
  6262. LLM_FFN_GELU,
  6263. false, false,
  6264. 0.0f,
  6265. LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, il);
  6266. cb(cur, "ffn_moe_out", il);
  6267. } else if (model.arch == LLM_ARCH_BERT || model.arch == LLM_ARCH_NOMIC_BERT_MOE || model.arch == LLM_ARCH_JINA_BERT_V3) {
  6268. cur = build_ffn(cur,
  6269. model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
  6270. NULL, NULL, NULL,
  6271. model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
  6272. NULL,
  6273. LLM_FFN_GELU, LLM_FFN_SEQ, il);
  6274. cb(cur, "ffn_out", il);
  6275. } else if (model.arch == LLM_ARCH_JINA_BERT_V2) {
  6276. cur = build_ffn(cur,
  6277. model.layers[il].ffn_up, NULL, NULL,
  6278. model.layers[il].ffn_gate, NULL, NULL,
  6279. model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
  6280. NULL,
  6281. model.layers[il].ffn_gate ? LLM_FFN_GELU : LLM_FFN_GEGLU, LLM_FFN_PAR, il);
  6282. cb(cur, "ffn_out", il);
  6283. } else {
  6284. cur = build_ffn(cur,
  6285. model.layers[il].ffn_up, NULL, NULL,
  6286. model.layers[il].ffn_gate, NULL, NULL,
  6287. model.layers[il].ffn_down, NULL, NULL,
  6288. NULL,
  6289. LLM_FFN_SILU, LLM_FFN_PAR, il);
  6290. cb(cur, "ffn_out", il);
  6291. }
6292. // residual: the attention output bypasses the FFN block
  6293. cur = ggml_add(ctx0, cur, ffn_inp);
  6294. // output layer norm
  6295. cur = build_norm(cur, model.layers[il].layer_out_norm, model.layers[il].layer_out_norm_b, LLM_NORM, il);
  6296. // input for next layer
  6297. inpL = cur;
  6298. }
  6299. cur = inpL;
  6300. cb(cur, "result_embd", -1);
  6301. res->t_embd = cur;
  6302. ggml_build_forward_expand(gf, cur);
  6303. }
  6304. };
  6305. struct llm_build_neo_bert : public llm_graph_context {
  6306. llm_build_neo_bert(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  6307. const int64_t n_embd_head = hparams.n_embd_head_v;
  6308. const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
  6309. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  6310. ggml_tensor * cur;
  6311. ggml_tensor * inpL;
  6312. ggml_tensor * inp_pos = build_inp_pos();
  6313. // construct input embeddings (token, type, position)
  6314. inpL = build_inp_embd(model.tok_embd);
  6315. cb(inpL, "inp_embd", -1);
  6316. auto * inp_attn = build_attn_inp_no_cache();
  6317. ggml_tensor * inp_out_ids = build_inp_out_ids();
  6318. for (int il = 0; il < n_layer; ++il) {
  6319. ggml_tensor * cur = inpL;
  6320. // pre-norm
  6321. cur = build_norm(inpL,
  6322. model.layers[il].attn_norm, NULL,
  6323. LLM_NORM_RMS, il);
  6324. {
  6325. ggml_tensor * Qcur;
  6326. ggml_tensor * Kcur;
  6327. ggml_tensor * Vcur;
  6328. // self-attention
  6329. cur = build_lora_mm(model.layers[il].wqkv, cur);
  6330. cb(cur, "wqkv", il);
  6331. Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
  6332. Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
  6333. Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
  6334. // RoPE
  6335. Qcur = ggml_rope_ext(
  6336. ctx0, Qcur, inp_pos, nullptr,
  6337. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  6338. ext_factor, attn_factor, beta_fast, beta_slow
  6339. );
  6340. Kcur = ggml_rope_ext(
  6341. ctx0, Kcur, inp_pos, nullptr,
  6342. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  6343. ext_factor, attn_factor, beta_fast, beta_slow
  6344. );
  6345. cb(Qcur, "Qcur", il);
  6346. cb(Kcur, "Kcur", il);
  6347. cb(Vcur, "Vcur", il);
  6348. cur = build_attn(inp_attn,
  6349. model.layers[il].wo, nullptr,
  6350. Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  6351. cb(cur, "kqv_out", il);
  6352. }
  6353. if (il == n_layer - 1 && inp_out_ids) {
  6354. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  6355. inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
  6356. }
  6357. // re-add the layer input
  6358. cur = ggml_add(ctx0, cur, inpL);
  6359. ggml_tensor * ffn_inp = cur;
  6360. cb(ffn_inp, "ffn_inp", il);
  6361. // pre-norm
  6362. cur = build_norm(ffn_inp,
  6363. model.layers[il].ffn_norm, NULL,
  6364. LLM_NORM_RMS, il);
  6365. cb(cur, "ffn_norm", il);
  6366. // feed-forward network
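// with LLM_FFN_SWIGLU only up/down weights are passed; the gate half is
// presumably packed into ffn_up and split inside build_ffn, which is why no
// separate ffn_gate tensor appears here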
  6367. cur = build_ffn(cur,
  6368. model.layers[il].ffn_up,
  6369. NULL, NULL, NULL, NULL, NULL,
  6370. model.layers[il].ffn_down,
  6371. NULL, NULL, NULL,
  6372. LLM_FFN_SWIGLU, LLM_FFN_SEQ, il);
6373. // residual: the attention output bypasses the FFN block
  6374. cur = ggml_add(ctx0, cur, ffn_inp);
  6375. // input for next layer
  6376. inpL = cur;
  6377. }
  6378. cur = inpL;
  6379. cur = build_norm(cur,
  6380. model.output_norm_enc, NULL,
  6381. LLM_NORM_RMS, -1);
  6382. cb(cur, "result_embd", -1);
  6383. res->t_embd = cur;
  6384. ggml_build_forward_expand(gf, cur);
  6385. }
  6386. };
  6387. struct llm_build_bloom : public llm_graph_context {
  6388. llm_build_bloom(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  6389. const int64_t n_embd_head = hparams.n_embd_head_v;
  6390. const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
  6391. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  6392. ggml_tensor * cur;
  6393. ggml_tensor * inpL;
  6394. inpL = build_inp_embd(model.tok_embd);
  6395. auto * inp_attn = build_attn_inp_kv();
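// note: no inp_pos is built for BLOOM; positional information presumably
// enters through ALiBi biases inside the attention rather than explicit
// position embeddings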
  6396. inpL = build_norm(inpL,
  6397. model.tok_norm,
  6398. model.tok_norm_b,
  6399. LLM_NORM, -1);
  6400. cb(inpL, "inp_norm", -1);
  6401. ggml_tensor * inp_out_ids = build_inp_out_ids();
  6402. for (int il = 0; il < n_layer; ++il) {
  6403. cur = build_norm(inpL,
  6404. model.layers[il].attn_norm,
  6405. model.layers[il].attn_norm_b,
  6406. LLM_NORM, il);
  6407. cb(cur, "attn_norm", il);
  6408. // self-attention
  6409. {
  6410. cur = build_lora_mm(model.layers[il].wqkv, cur);
  6411. cb(cur, "wqkv", il);
  6412. cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
  6413. cb(cur, "bqkv", il);
  6414. ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
  6415. ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
  6416. ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
  6417. cb(Qcur, "Qcur", il);
  6418. cb(Kcur, "Kcur", il);
  6419. cb(Vcur, "Vcur", il);
  6420. cur = build_attn(inp_attn,
  6421. model.layers[il].wo, model.layers[il].bo,
  6422. Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  6423. }
  6424. if (il == n_layer - 1 && inp_out_ids) {
  6425. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  6426. inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
  6427. }
  6428. // Add the input
  6429. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
  6430. cb(ffn_inp, "ffn_inp", il);
  6431. // FF
  6432. {
  6433. cur = build_norm(ffn_inp,
  6434. model.layers[il].ffn_norm,
  6435. model.layers[il].ffn_norm_b,
  6436. LLM_NORM, il);
  6437. cb(cur, "ffn_norm", il);
  6438. cur = build_ffn(cur,
  6439. model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
  6440. NULL, NULL, NULL,
  6441. model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
  6442. NULL,
  6443. LLM_FFN_GELU, LLM_FFN_SEQ, il);
  6444. cb(cur, "ffn_out", il);
  6445. }
  6446. cur = ggml_add(ctx0, cur, ffn_inp);
  6447. cur = build_cvec(cur, il);
  6448. cb(cur, "l_out", il);
  6449. // input for next layer
  6450. inpL = cur;
  6451. }
  6452. cur = build_norm(inpL,
  6453. model.output_norm,
  6454. model.output_norm_b,
  6455. LLM_NORM, -1);
  6456. cb(cur, "result_norm", -1);
  6457. res->t_embd = cur;
  6458. cur = build_lora_mm(model.output, cur);
  6459. cb(cur, "result_output", -1);
  6460. res->t_logits = cur;
  6461. ggml_build_forward_expand(gf, cur);
  6462. }
  6463. };
  6464. struct llm_build_mpt : public llm_graph_context {
  6465. llm_build_mpt(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  6466. const int64_t n_embd_head = hparams.n_embd_head_v;
  6467. const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
  6468. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  6469. ggml_tensor * cur;
  6470. ggml_tensor * pos;
  6471. ggml_tensor * inpL;
  6472. inpL = build_inp_embd(model.tok_embd);
  6473. auto * inp_attn = build_attn_inp_kv();
  6474. if (model.pos_embd) {
  6475. // inp_pos - contains the positions
  6476. ggml_tensor * inp_pos = build_inp_pos();
  6477. pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos);
  6478. cb(pos, "pos_embd", -1);
  6479. inpL = ggml_add(ctx0, inpL, pos);
  6480. cb(inpL, "inpL", -1);
  6481. }
  6482. ggml_tensor * inp_out_ids = build_inp_out_ids();
  6483. for (int il = 0; il < n_layer; ++il) {
  6484. ggml_tensor * attn_norm;
  6485. attn_norm = build_norm(inpL,
  6486. model.layers[il].attn_norm,
  6487. model.layers[il].attn_norm_b,
  6488. LLM_NORM, il);
  6489. cb(attn_norm, "attn_norm", il);
  6490. // self-attention
  6491. {
  6492. cur = attn_norm;
  6493. cur = build_lora_mm(model.layers[il].wqkv, cur);
  6494. cb(cur, "wqkv", il);
6495. if (model.layers[il].bqkv) {
  6496. cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
  6497. cb(cur, "bqkv", il);
  6498. }
  6499. if (hparams.f_clamp_kqv > 0.0f) {
  6500. cur = ggml_clamp(ctx0, cur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
  6501. cb(cur, "wqkv_clamped", il);
  6502. }
  6503. ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
  6504. ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
  6505. ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
  6506. // Q/K Layernorm
  6507. if (model.layers[il].attn_q_norm) {
  6508. Qcur = build_norm(Qcur,
  6509. model.layers[il].attn_q_norm,
  6510. model.layers[il].attn_q_norm_b,
  6511. LLM_NORM, il);
  6512. Kcur = build_norm(Kcur,
  6513. model.layers[il].attn_k_norm,
  6514. model.layers[il].attn_k_norm_b,
  6515. LLM_NORM, il);
  6516. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  6517. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  6518. }
  6519. cb(Qcur, "Qcur", il);
  6520. cb(Kcur, "Kcur", il);
  6521. cb(Vcur, "Vcur", il);
  6522. cur = build_attn(inp_attn,
  6523. model.layers[il].wo, model.layers[il].bo,
  6524. Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  6525. }
  6526. if (il == n_layer - 1 && inp_out_ids) {
  6527. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  6528. inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
  6529. }
  6530. // Add the input
  6531. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
  6532. cb(ffn_inp, "ffn_inp", il);
  6533. // feed forward
  6534. {
  6535. cur = build_norm(ffn_inp,
  6536. model.layers[il].ffn_norm,
  6537. model.layers[il].ffn_norm_b,
  6538. LLM_NORM, il);
  6539. cb(cur, "ffn_norm", il);
  6540. cur = build_ffn(cur,
  6541. model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
  6542. NULL, NULL, NULL,
  6543. model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
  6544. model.layers[il].ffn_act,
  6545. LLM_FFN_GELU, LLM_FFN_SEQ, il);
  6546. cb(cur, "ffn_out", il);
  6547. }
  6548. cur = ggml_add(ctx0, cur, ffn_inp);
  6549. cur = build_cvec(cur, il);
  6550. cb(cur, "l_out", il);
  6551. // input for next layer
  6552. inpL = cur;
  6553. }
  6554. cur = inpL;
  6555. cur = build_norm(cur,
  6556. model.output_norm,
  6557. model.output_norm_b,
  6558. LLM_NORM, -1);
  6559. cb(cur, "result_norm", -1);
  6560. res->t_embd = cur;
  6561. cur = build_lora_mm(model.output, cur);
  6562. cb(cur, "result_output", -1);
  6563. res->t_logits = cur;
  6564. ggml_build_forward_expand(gf, cur);
  6565. }
  6566. };
  6567. struct llm_build_stablelm : public llm_graph_context {
  6568. llm_build_stablelm(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  6569. const int64_t n_embd_head = hparams.n_embd_head_v;
  6570. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  6571. ggml_tensor * cur;
  6572. ggml_tensor * inpL;
  6573. inpL = build_inp_embd(model.tok_embd);
  6574. // inp_pos - contains the positions
  6575. ggml_tensor * inp_pos = build_inp_pos();
  6576. auto * inp_attn = build_attn_inp_kv();
  6577. ggml_tensor * inp_out_ids = build_inp_out_ids();
  6578. for (int il = 0; il < n_layer; ++il) {
  6579. // norm
  6580. cur = build_norm(inpL,
  6581. model.layers[il].attn_norm,
  6582. model.layers[il].attn_norm_b,
  6583. LLM_NORM, il);
  6584. cb(cur, "attn_norm", il);
  6585. ggml_tensor * inpSA = cur;
  6586. // self-attention
  6587. {
  6588. // compute Q and K and RoPE them
  6589. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  6590. cb(Qcur, "Qcur", il);
  6591. if (model.layers[il].bq) {
  6592. Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
  6593. cb(Qcur, "Qcur", il);
  6594. }
  6595. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  6596. cb(Kcur, "Kcur", il);
  6597. if (model.layers[il].bk) {
  6598. Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
  6599. cb(Kcur, "Kcur", il);
  6600. }
  6601. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  6602. cb(Vcur, "Vcur", il);
  6603. if (model.layers[il].bv) {
  6604. Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
  6605. cb(Vcur, "Vcur", il);
  6606. }
  6607. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  6608. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  6609. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  6610. if (model.layers[il].attn_q_norm) {
  6611. Qcur = build_norm(Qcur,
  6612. model.layers[il].attn_q_norm,
  6613. NULL,
  6614. LLM_NORM, il);
  6615. cb(Qcur, "Qcur", il);
  6616. }
  6617. if (model.layers[il].attn_k_norm) {
  6618. Kcur = build_norm(Kcur,
  6619. model.layers[il].attn_k_norm,
  6620. NULL,
  6621. LLM_NORM, il);
  6622. cb(Kcur, "Kcur", il);
  6623. }
  6624. Qcur = ggml_rope_ext(
  6625. ctx0, Qcur, inp_pos, nullptr,
  6626. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  6627. ext_factor, attn_factor, beta_fast, beta_slow
  6628. );
  6629. Kcur = ggml_rope_ext(
  6630. ctx0, Kcur, inp_pos, nullptr,
  6631. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  6632. ext_factor, attn_factor, beta_fast, beta_slow
  6633. );
  6634. cb(Qcur, "Qcur", il);
  6635. cb(Kcur, "Kcur", il);
  6636. cb(Vcur, "Vcur", il);
  6637. cur = build_attn(inp_attn,
  6638. model.layers[il].wo, NULL,
  6639. Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  6640. }
  6641. if (il == n_layer - 1 && inp_out_ids) {
  6642. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  6643. inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
  6644. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  6645. }
  6646. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
  6647. cb(ffn_inp, "ffn_inp", il);
  6648. // feed-forward network
  6649. {
  6650. if (model.layers[il].ffn_norm) {
  6651. cur = build_norm(ffn_inp,
  6652. model.layers[il].ffn_norm,
  6653. model.layers[il].ffn_norm_b,
  6654. LLM_NORM, il);
  6655. cb(cur, "ffn_norm", il);
  6656. } else {
  6657. // parallel residual
  6658. cur = inpSA;
  6659. }
  6660. cur = build_ffn(cur,
  6661. model.layers[il].ffn_up, NULL, NULL,
  6662. model.layers[il].ffn_gate, NULL, NULL,
  6663. model.layers[il].ffn_down, NULL, NULL,
  6664. NULL,
  6665. LLM_FFN_SILU, LLM_FFN_PAR, il);
  6666. cb(cur, "ffn_out", il);
  6667. }
  6668. cur = ggml_add(ctx0, cur, ffn_inp);
  6669. cur = build_cvec(cur, il);
  6670. cb(cur, "l_out", il);
  6671. // input for next layer
  6672. inpL = cur;
  6673. }
  6674. cur = inpL;
  6675. cur = build_norm(cur,
  6676. model.output_norm,
  6677. model.output_norm_b,
  6678. LLM_NORM, -1);
  6679. cb(cur, "result_norm", -1);
  6680. res->t_embd = cur;
  6681. // lm_head
  6682. cur = build_lora_mm(model.output, cur);
  6683. cb(cur, "result_output", -1);
  6684. res->t_logits = cur;
  6685. ggml_build_forward_expand(gf, cur);
  6686. }
  6687. };
  6688. struct llm_build_qwen : public llm_graph_context {
  6689. llm_build_qwen(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  6690. const int64_t n_embd_head = hparams.n_embd_head_v;
  6691. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  6692. ggml_tensor * cur;
  6693. ggml_tensor * inpL;
  6694. inpL = build_inp_embd(model.tok_embd);
  6695. // inp_pos - contains the positions
  6696. ggml_tensor * inp_pos = build_inp_pos();
  6697. auto * inp_attn = build_attn_inp_kv();
  6698. ggml_tensor * inp_out_ids = build_inp_out_ids();
  6699. for (int il = 0; il < n_layer; ++il) {
  6700. ggml_tensor * inpSA = inpL;
  6701. cur = build_norm(inpL,
  6702. model.layers[il].attn_norm, NULL,
  6703. LLM_NORM_RMS, il);
  6704. cb(cur, "attn_norm", il);
  6705. // self-attention
  6706. {
  6707. cur = build_lora_mm(model.layers[il].wqkv, cur);
  6708. cb(cur, "wqkv", il);
  6709. cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
  6710. cb(cur, "bqkv", il);
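// fused QKV layout: Q at offset 0, K at n_embd, V at 2*n_embd -- the
// offsets imply Q, K and V each span n_embd here, i.e. full multi-head
// attention with no GQA for this architecture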
  6711. ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
  6712. ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
  6713. ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 2*sizeof(float)*(n_embd));
6714. // NeoX-style RoPE (selected via rope_type)
  6715. Qcur = ggml_rope_ext(
  6716. ctx0, Qcur, inp_pos, nullptr,
  6717. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  6718. ext_factor, attn_factor, beta_fast, beta_slow
  6719. );
  6720. Kcur = ggml_rope_ext(
  6721. ctx0, Kcur, inp_pos, nullptr,
  6722. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  6723. ext_factor, attn_factor, beta_fast, beta_slow
  6724. );
  6725. cb(Qcur, "Qcur", il);
  6726. cb(Kcur, "Kcur", il);
  6727. cb(Vcur, "Vcur", il);
  6728. cur = build_attn(inp_attn,
  6729. model.layers[il].wo, NULL,
  6730. Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  6731. }
  6732. if (il == n_layer - 1 && inp_out_ids) {
  6733. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  6734. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  6735. }
  6736. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  6737. cb(ffn_inp, "ffn_inp", il);
6738. // feed-forward network
  6739. {
  6740. cur = build_norm(ffn_inp,
  6741. model.layers[il].ffn_norm, NULL,
  6742. LLM_NORM_RMS, il);
  6743. cb(cur, "ffn_norm", il);
  6744. cur = build_ffn(cur,
  6745. model.layers[il].ffn_up, NULL, NULL,
  6746. model.layers[il].ffn_gate, NULL, NULL,
  6747. model.layers[il].ffn_down, NULL, NULL,
  6748. NULL,
  6749. LLM_FFN_SILU, LLM_FFN_PAR, il);
  6750. cb(cur, "ffn_out", il);
  6751. }
  6752. cur = ggml_add(ctx0, cur, ffn_inp);
  6753. cur = build_cvec(cur, il);
  6754. cb(cur, "l_out", il);
  6755. // input for next layer
  6756. inpL = cur;
  6757. }
  6758. cur = inpL;
  6759. cur = build_norm(cur,
  6760. model.output_norm, NULL,
  6761. LLM_NORM_RMS, -1);
  6762. cb(cur, "result_norm", -1);
  6763. res->t_embd = cur;
  6764. // lm_head
  6765. cur = build_lora_mm(model.output, cur);
  6766. cb(cur, "result_output", -1);
  6767. res->t_logits = cur;
  6768. ggml_build_forward_expand(gf, cur);
  6769. }
  6770. };
  6771. struct llm_build_qwen2 : public llm_graph_context {
  6772. llm_build_qwen2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  6773. const int64_t n_embd_head = hparams.n_embd_head_v;
  6774. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  6775. GGML_ASSERT(n_embd_head == hparams.n_rot);
  6776. ggml_tensor * cur;
  6777. ggml_tensor * inpL;
  6778. inpL = build_inp_embd(model.tok_embd);
  6779. // inp_pos - contains the positions
  6780. ggml_tensor * inp_pos = build_inp_pos();
  6781. auto * inp_attn = build_attn_inp_kv();
  6782. ggml_tensor * inp_out_ids = build_inp_out_ids();
  6783. for (int il = 0; il < n_layer; ++il) {
  6784. ggml_tensor * inpSA = inpL;
  6785. // norm
  6786. cur = build_norm(inpL,
  6787. model.layers[il].attn_norm, NULL,
  6788. LLM_NORM_RMS, il);
  6789. cb(cur, "attn_norm", il);
  6790. // self-attention
  6791. {
  6792. // compute Q and K and RoPE them
  6793. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  6794. Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
  6795. cb(Qcur, "Qcur", il);
  6796. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  6797. Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
  6798. cb(Kcur, "Kcur", il);
  6799. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  6800. Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
  6801. cb(Vcur, "Vcur", il);
  6802. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  6803. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  6804. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  6805. Qcur = ggml_rope_ext(
  6806. ctx0, Qcur, inp_pos, nullptr,
  6807. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  6808. ext_factor, attn_factor, beta_fast, beta_slow
  6809. );
  6810. Kcur = ggml_rope_ext(
  6811. ctx0, Kcur, inp_pos, nullptr,
  6812. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  6813. ext_factor, attn_factor, beta_fast, beta_slow
  6814. );
  6815. cb(Qcur, "Qcur", il);
  6816. cb(Kcur, "Kcur", il);
  6817. cb(Vcur, "Vcur", il);
  6818. cur = build_attn(inp_attn,
  6819. model.layers[il].wo, model.layers[il].bo,
  6820. Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  6821. }
  6822. if (il == n_layer - 1 && inp_out_ids) {
  6823. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  6824. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  6825. }
  6826. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  6827. cb(ffn_inp, "ffn_inp", il);
  6828. // feed-forward network
  6829. cur = build_norm(ffn_inp,
  6830. model.layers[il].ffn_norm, NULL,
  6831. LLM_NORM_RMS, il);
  6832. cb(cur, "ffn_norm", il);
  6833. cur = build_ffn(cur,
  6834. model.layers[il].ffn_up, NULL, NULL,
  6835. model.layers[il].ffn_gate, NULL, NULL,
  6836. model.layers[il].ffn_down, NULL, NULL,
  6837. NULL,
  6838. LLM_FFN_SILU, LLM_FFN_PAR, il);
  6839. cb(cur, "ffn_out", il);
  6840. cur = ggml_add(ctx0, cur, ffn_inp);
  6841. cur = build_cvec(cur, il);
  6842. cb(cur, "l_out", il);
  6843. // input for next layer
  6844. inpL = cur;
  6845. }
  6846. cur = inpL;
  6847. cur = build_norm(cur,
  6848. model.output_norm, NULL,
  6849. LLM_NORM_RMS, -1);
  6850. cb(cur, "result_norm", -1);
  6851. res->t_embd = cur;
  6852. // lm_head
  6853. cur = build_lora_mm(model.output, cur);
  6854. if (model.output_b != nullptr) {
  6855. cur = ggml_add(ctx0, cur, model.output_b);
  6856. }
  6857. cb(cur, "result_output", -1);
  6858. res->t_logits = cur;
  6859. ggml_build_forward_expand(gf, cur);
  6860. }
  6861. };
  6862. struct llm_build_dream : public llm_graph_context {
  6863. llm_build_dream(const llama_model & model, const llm_graph_params & params) :
  6864. llm_graph_context(params) {
// copied from qwen2
  6866. const int64_t n_embd_head = hparams.n_embd_head_v;
  6867. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  6868. GGML_ASSERT(n_embd_head == hparams.n_rot);
  6869. ggml_tensor * cur;
  6870. ggml_tensor * inpL;
  6871. inpL = build_inp_embd(model.tok_embd);
  6872. // inp_pos - contains the positions
  6873. ggml_tensor * inp_pos = build_inp_pos();
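// like LLaDA below, Dream builds attention without a KV cache (non-causal, diffusion-style decoding)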
  6874. auto * inp_attn = build_attn_inp_no_cache();
  6875. ggml_tensor * inp_out_ids = build_inp_out_ids();
  6876. for (int il = 0; il < n_layer; ++il) {
  6877. ggml_tensor * inpSA = inpL;
  6878. // norm
  6879. cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
  6880. cb(cur, "attn_norm", il);
  6881. // self-attention
  6882. {
  6883. // compute Q and K and RoPE them
  6884. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  6885. Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
  6886. cb(Qcur, "Qcur", il);
  6887. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  6888. Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
  6889. cb(Kcur, "Kcur", il);
  6890. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  6891. Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
  6892. cb(Vcur, "Vcur", il);
  6893. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  6894. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  6895. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  6896. Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  6897. ext_factor, attn_factor, beta_fast, beta_slow);
  6898. Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  6899. ext_factor, attn_factor, beta_fast, beta_slow);
  6900. cb(Qcur, "Qcur", il);
  6901. cb(Kcur, "Kcur", il);
  6902. cb(Vcur, "Vcur", il);
  6903. cur = build_attn(inp_attn,
  6904. model.layers[il].wo, model.layers[il].bo,
  6905. Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
  6906. }
  6907. if (il == n_layer - 1 && inp_out_ids) {
  6908. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  6909. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  6910. }
  6911. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  6912. cb(ffn_inp, "ffn_inp", il);
  6913. // feed-forward network
  6914. cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il);
  6915. cb(cur, "ffn_norm", il);
  6916. cur = build_ffn(cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL,
  6917. model.layers[il].ffn_down, NULL, NULL, NULL, LLM_FFN_SILU, LLM_FFN_PAR, il);
  6918. cb(cur, "ffn_out", il);
  6919. cur = ggml_add(ctx0, cur, ffn_inp);
  6920. cur = build_cvec(cur, il);
  6921. cb(cur, "l_out", il);
  6922. // input for next layer
  6923. inpL = cur;
  6924. }
  6925. cur = inpL;
  6926. cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
  6927. cb(cur, "result_norm", -1);
  6928. res->t_embd = cur;
  6929. // lm_head
  6930. cur = build_lora_mm(model.output, cur);
  6931. cb(cur, "result_output", -1);
  6932. res->t_logits = cur;
  6933. ggml_build_forward_expand(gf, cur);
  6934. }
  6935. };
  6936. struct llm_build_llada : public llm_graph_context {
  6937. llm_build_llada(const llama_model & model, const llm_graph_params & params) :
  6938. llm_graph_context(params) {
  6939. // LLaDA is similar to LLaMA but uses non-causal attention for diffusion
  6940. const int64_t n_embd_head = hparams.n_embd_head_v;
  6941. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  6942. GGML_ASSERT(n_embd_head == hparams.n_rot);
  6943. ggml_tensor * cur;
  6944. ggml_tensor * inpL;
  6945. inpL = build_inp_embd(model.tok_embd);
  6946. // inp_pos - contains the positions
  6947. ggml_tensor * inp_pos = build_inp_pos();
  6948. // Non-causal attention for diffusion
  6949. auto * inp_attn = build_attn_inp_no_cache();
  6950. ggml_tensor * inp_out_ids = build_inp_out_ids();
  6951. for (int il = 0; il < n_layer; ++il) {
  6952. ggml_tensor * inpSA = inpL;
  6953. // norm
  6954. cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
  6955. cb(cur, "attn_norm", il);
  6956. // self-attention
  6957. {
  6958. // compute separate Q, K, V projections without bias, matching LLaDALlamaBlock
  6959. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  6960. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  6961. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  6962. cb(Qcur, "Qcur", il);
  6963. cb(Kcur, "Kcur", il);
  6964. cb(Vcur, "Vcur", il);
  6965. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  6966. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  6967. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  6968. Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  6969. ext_factor, attn_factor, beta_fast, beta_slow);
  6970. Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  6971. ext_factor, attn_factor, beta_fast, beta_slow);
  6972. cb(Qcur, "Qcur", il);
  6973. cb(Kcur, "Kcur", il);
  6974. cb(Vcur, "Vcur", il);
  6975. cur = build_attn(inp_attn,
  6976. model.layers[il].wo, NULL,
  6977. Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
  6978. }
  6979. if (il == n_layer - 1 && inp_out_ids) {
  6980. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  6981. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  6982. }
  6983. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  6984. cb(ffn_inp, "ffn_inp", il);
  6985. // feed-forward network
  6986. cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il);
  6987. cb(cur, "ffn_norm", il);
  6988. cur = build_ffn(cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL,
  6989. model.layers[il].ffn_down, NULL, NULL, NULL, LLM_FFN_SILU, LLM_FFN_PAR, il);
  6990. cb(cur, "ffn_out", il);
  6991. cur = ggml_add(ctx0, cur, ffn_inp);
  6992. cur = build_cvec(cur, il);
  6993. cb(cur, "l_out", il);
  6994. // input for next layer
  6995. inpL = cur;
  6996. }
  6997. cur = inpL;
  6998. cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
  6999. cb(cur, "result_norm", -1);
  7000. res->t_embd = cur;
  7001. // lm_head
  7002. cur = build_lora_mm(model.output, cur);
  7003. cb(cur, "result_output", -1);
  7004. res->t_logits = cur;
  7005. ggml_build_forward_expand(gf, cur);
  7006. }
  7007. };
  7008. struct llm_build_qwen2vl : public llm_graph_context {
  7009. llm_build_qwen2vl(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  7010. const int64_t n_embd_head = hparams.n_embd_head_v;
  7011. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  7012. GGML_ASSERT(n_embd_head == hparams.n_rot);
  7013. ggml_tensor * cur;
  7014. ggml_tensor * inpL;
  7015. inpL = build_inp_embd(model.tok_embd);
  7016. // inp_pos - contains the positions
  7017. ggml_tensor * inp_pos = build_inp_pos();
  7018. auto * inp_attn = build_attn_inp_kv();
  7019. int sections[4];
  7020. std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections);
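// rope_sections holds the per-axis section sizes consumed by ggml_rope_multi below (multi-section / M-RoPE positions for mixed text+image inputs)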
  7021. ggml_tensor * inp_out_ids = build_inp_out_ids();
  7022. for (int il = 0; il < n_layer; ++il) {
  7023. ggml_tensor * inpSA = inpL;
  7024. // norm
  7025. cur = build_norm(inpL,
  7026. model.layers[il].attn_norm, NULL,
  7027. LLM_NORM_RMS, il);
  7028. cb(cur, "attn_norm", il);
  7029. // self-attention
  7030. {
  7031. // compute Q and K and RoPE them
  7032. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  7033. Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
  7034. cb(Qcur, "Qcur", il);
  7035. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  7036. Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
  7037. cb(Kcur, "Kcur", il);
  7038. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  7039. Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
  7040. cb(Vcur, "Vcur", il);
  7041. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  7042. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  7043. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  7044. Qcur = ggml_rope_multi(
  7045. ctx0, Qcur, inp_pos, nullptr,
  7046. n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
  7047. ext_factor, attn_factor, beta_fast, beta_slow
  7048. );
  7049. Kcur = ggml_rope_multi(
  7050. ctx0, Kcur, inp_pos, nullptr,
  7051. n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
  7052. ext_factor, attn_factor, beta_fast, beta_slow
  7053. );
  7054. cb(Qcur, "Qcur", il);
  7055. cb(Kcur, "Kcur", il);
  7056. cb(Vcur, "Vcur", il);
  7057. cur = build_attn(inp_attn,
  7058. model.layers[il].wo, model.layers[il].bo,
  7059. Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  7060. }
  7061. if (il == n_layer - 1 && inp_out_ids) {
  7062. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  7063. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  7064. }
  7065. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  7066. cb(ffn_inp, "ffn_inp", il);
  7067. // feed-forward network
  7068. cur = build_norm(ffn_inp,
  7069. model.layers[il].ffn_norm, NULL,
  7070. LLM_NORM_RMS, il);
  7071. cb(cur, "ffn_norm", il);
  7072. cur = build_ffn(cur,
  7073. model.layers[il].ffn_up, NULL, NULL,
  7074. model.layers[il].ffn_gate, NULL, NULL,
  7075. model.layers[il].ffn_down, NULL, NULL,
  7076. NULL,
  7077. LLM_FFN_SILU, LLM_FFN_PAR, il);
  7078. cb(cur, "ffn_out", il);
  7079. cur = ggml_add(ctx0, cur, ffn_inp);
  7080. cur = build_cvec(cur, il);
  7081. cb(cur, "l_out", il);
  7082. // input for next layer
  7083. inpL = cur;
  7084. }
  7085. cur = inpL;
  7086. cur = build_norm(cur,
  7087. model.output_norm, NULL,
  7088. LLM_NORM_RMS, -1);
  7089. cb(cur, "result_norm", -1);
  7090. res->t_embd = cur;
  7091. // lm_head
  7092. cur = build_lora_mm(model.output, cur);
  7093. cb(cur, "result_output", -1);
  7094. res->t_logits = cur;
  7095. ggml_build_forward_expand(gf, cur);
  7096. }
  7097. };
  7098. struct llm_build_qwen2moe : public llm_graph_context {
  7099. llm_build_qwen2moe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  7100. const int64_t n_embd_head = hparams.n_embd_head_v;
  7101. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  7102. GGML_ASSERT(n_embd_head == hparams.n_rot);
  7103. ggml_tensor * cur;
  7104. ggml_tensor * inpL;
  7105. inpL = build_inp_embd(model.tok_embd);
  7106. // inp_pos - contains the positions
  7107. ggml_tensor * inp_pos = build_inp_pos();
  7108. auto * inp_attn = build_attn_inp_kv();
  7109. ggml_tensor * inp_out_ids = build_inp_out_ids();
  7110. for (int il = 0; il < n_layer; ++il) {
  7111. ggml_tensor * inpSA = inpL;
  7112. // norm
  7113. cur = build_norm(inpL,
  7114. model.layers[il].attn_norm, NULL,
  7115. LLM_NORM_RMS, il);
  7116. cb(cur, "attn_norm", il);
// self-attention
  7118. {
  7119. // compute Q and K and RoPE them
  7120. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  7121. cb(Qcur, "Qcur", il);
  7122. if (model.layers[il].bq) {
  7123. Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
  7124. cb(Qcur, "Qcur", il);
  7125. }
  7126. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  7127. cb(Kcur, "Kcur", il);
  7128. if (model.layers[il].bk) {
  7129. Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
  7130. cb(Kcur, "Kcur", il);
  7131. }
  7132. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  7133. cb(Vcur, "Vcur", il);
  7134. if (model.layers[il].bv) {
  7135. Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
  7136. cb(Vcur, "Vcur", il);
  7137. }
  7138. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  7139. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  7140. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  7141. Qcur = ggml_rope_ext(
  7142. ctx0, Qcur, inp_pos, nullptr,
  7143. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  7144. ext_factor, attn_factor, beta_fast, beta_slow
  7145. );
  7146. Kcur = ggml_rope_ext(
  7147. ctx0, Kcur, inp_pos, nullptr,
  7148. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  7149. ext_factor, attn_factor, beta_fast, beta_slow
  7150. );
  7151. cb(Qcur, "Qcur", il);
  7152. cb(Kcur, "Kcur", il);
  7153. cb(Vcur, "Vcur", il);
  7154. cur = build_attn(inp_attn,
  7155. model.layers[il].wo, model.layers[il].bo,
  7156. Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  7157. }
  7158. if (il == n_layer - 1 && inp_out_ids) {
  7159. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  7160. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  7161. }
  7162. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  7163. cb(ffn_inp, "ffn_inp", il);
  7164. // MoE branch
  7165. cur = build_norm(ffn_inp,
  7166. model.layers[il].ffn_norm, NULL,
  7167. LLM_NORM_RMS, il);
  7168. cb(cur, "ffn_norm", il);
  7169. ggml_tensor * moe_out =
  7170. build_moe_ffn(cur,
  7171. model.layers[il].ffn_gate_inp,
  7172. model.layers[il].ffn_up_exps,
  7173. model.layers[il].ffn_gate_exps,
  7174. model.layers[il].ffn_down_exps,
  7175. nullptr,
  7176. n_expert, n_expert_used,
  7177. LLM_FFN_SILU, false,
  7178. false, 0.0,
  7179. LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
  7180. il);
  7181. cb(moe_out, "ffn_moe_out", il);
  7182. // FFN shared expert
  7183. {
  7184. ggml_tensor * cur_gate_inp = build_lora_mm(model.layers[il].ffn_gate_inp_shexp, cur);
  7185. cb(cur_gate_inp, "ffn_shexp_gate_inp", il);
// sigmoid: silu(x) = x*sigmoid(x), so silu(x)/x recovers sigmoid(x)
  7187. ggml_tensor * cur_gate = ggml_div(ctx0, ggml_silu(ctx0, cur_gate_inp), cur_gate_inp);
  7188. cb(cur_gate, "ffn_shexp_gate", il);
  7189. ggml_tensor * cur_ffn = build_ffn(cur,
  7190. model.layers[il].ffn_up_shexp, NULL, NULL,
  7191. model.layers[il].ffn_gate_shexp, NULL, NULL,
  7192. model.layers[il].ffn_down_shexp, NULL, NULL,
  7193. NULL,
  7194. LLM_FFN_SILU, LLM_FFN_PAR, il);
  7195. cb(cur_ffn, "ffn_shexp", il);
  7196. ggml_tensor * ffn_shexp_out = ggml_mul(ctx0, cur_ffn, cur_gate);
  7197. cb(ffn_shexp_out, "ffn_shexp_out", il);
  7198. moe_out = ggml_add(ctx0, moe_out, ffn_shexp_out);
  7199. cb(moe_out, "ffn_out", il);
  7200. cur = moe_out;
  7201. }
  7202. cur = ggml_add(ctx0, cur, ffn_inp);
  7203. cur = build_cvec(cur, il);
  7204. cb(cur, "l_out", il);
  7205. // input for next layer
  7206. inpL = cur;
  7207. }
  7208. cur = inpL;
  7209. cur = build_norm(cur,
  7210. model.output_norm, NULL,
  7211. LLM_NORM_RMS, -1);
  7212. cb(cur, "result_norm", -1);
  7213. res->t_embd = cur;
  7214. // lm_head
  7215. cur = build_lora_mm(model.output, cur);
  7216. cb(cur, "result_output", -1);
  7217. res->t_logits = cur;
  7218. ggml_build_forward_expand(gf, cur);
  7219. }
  7220. };
  7221. struct llm_build_qwen3 : public llm_graph_context {
  7222. llm_build_qwen3(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  7223. const int64_t n_embd_head = hparams.n_embd_head_v;
  7224. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  7225. GGML_ASSERT(n_embd_head == hparams.n_rot);
  7226. ggml_tensor * cur;
  7227. ggml_tensor * inpL;
  7228. inpL = build_inp_embd(model.tok_embd);
  7229. // inp_pos - contains the positions
  7230. ggml_tensor * inp_pos = build_inp_pos();
  7231. auto * inp_attn = build_attn_inp_kv();
  7232. ggml_tensor * inp_out_ids = build_inp_out_ids();
  7233. for (int il = 0; il < n_layer; ++il) {
  7234. ggml_tensor * inpSA = inpL;
  7235. // norm
  7236. cur = build_norm(inpL,
  7237. model.layers[il].attn_norm, NULL,
  7238. LLM_NORM_RMS, il);
  7239. cb(cur, "attn_norm", il);
  7240. // self-attention
  7241. {
  7242. // compute Q and K and RoPE them
  7243. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  7244. cb(Qcur, "Qcur", il);
  7245. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  7246. cb(Kcur, "Kcur", il);
  7247. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  7248. cb(Vcur, "Vcur", il);
  7249. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  7250. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  7251. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
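// Qwen3 applies RMS norm to the Q and K heads (QK norm) before RoPE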
  7252. Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
  7253. cb(Qcur, "Qcur_normed", il);
  7254. Qcur = ggml_rope_ext(
  7255. ctx0, Qcur, inp_pos, nullptr,
  7256. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  7257. ext_factor, attn_factor, beta_fast, beta_slow
  7258. );
  7259. Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
  7260. cb(Kcur, "Kcur_normed", il);
  7261. Kcur = ggml_rope_ext(
  7262. ctx0, Kcur, inp_pos, nullptr,
  7263. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  7264. ext_factor, attn_factor, beta_fast, beta_slow
  7265. );
  7266. cb(Qcur, "Qcur", il);
  7267. cb(Kcur, "Kcur", il);
  7268. cb(Vcur, "Vcur", il);
  7269. cur = build_attn(inp_attn,
  7270. model.layers[il].wo, model.layers[il].bo,
  7271. Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  7272. }
  7273. if (il == n_layer - 1 && inp_out_ids) {
  7274. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  7275. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  7276. }
  7277. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  7278. cb(ffn_inp, "ffn_inp", il);
  7279. // feed-forward network
  7280. cur = build_norm(ffn_inp,
  7281. model.layers[il].ffn_norm, NULL,
  7282. LLM_NORM_RMS, il);
  7283. cb(cur, "ffn_norm", il);
  7284. cur = build_ffn(cur,
  7285. model.layers[il].ffn_up, NULL, NULL,
  7286. model.layers[il].ffn_gate, NULL, NULL,
  7287. model.layers[il].ffn_down, NULL, NULL,
  7288. NULL,
  7289. LLM_FFN_SILU, LLM_FFN_PAR, il);
  7290. cb(cur, "ffn_out", il);
  7291. cur = ggml_add(ctx0, cur, ffn_inp);
  7292. cur = build_cvec(cur, il);
  7293. cb(cur, "l_out", il);
  7294. // input for next layer
  7295. inpL = cur;
  7296. }
  7297. cur = inpL;
  7298. cur = build_norm(cur,
  7299. model.output_norm, NULL,
  7300. LLM_NORM_RMS, -1);
  7301. cb(cur, "result_norm", -1);
  7302. res->t_embd = cur;
  7303. // lm_head
  7304. cur = build_lora_mm(model.output, cur);
  7305. cb(cur, "result_output", -1);
  7306. res->t_logits = cur;
  7307. ggml_build_forward_expand(gf, cur);
  7308. }
  7309. };
  7310. struct llm_build_qwen3moe : public llm_graph_context {
  7311. llm_build_qwen3moe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  7312. const int64_t n_embd_head = hparams.n_embd_head_v;
  7313. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  7314. GGML_ASSERT(n_embd_head == hparams.n_rot);
  7315. ggml_tensor * cur;
  7316. ggml_tensor * inpL;
  7317. inpL = build_inp_embd(model.tok_embd);
  7318. // inp_pos - contains the positions
  7319. ggml_tensor * inp_pos = build_inp_pos();
  7320. auto * inp_attn = build_attn_inp_kv();
  7321. ggml_tensor * inp_out_ids = build_inp_out_ids();
  7322. for (int il = 0; il < n_layer; ++il) {
  7323. ggml_tensor * inpSA = inpL;
  7324. // norm
  7325. cur = build_norm(inpL,
  7326. model.layers[il].attn_norm, NULL,
  7327. LLM_NORM_RMS, il);
  7328. cb(cur, "attn_norm", il);
// self-attention
  7330. {
  7331. // compute Q and K and RoPE them
  7332. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  7333. cb(Qcur, "Qcur", il);
  7334. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  7335. cb(Kcur, "Kcur", il);
  7336. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  7337. cb(Vcur, "Vcur", il);
  7338. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  7339. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  7340. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  7341. Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
  7342. cb(Qcur, "Qcur_normed", il);
  7343. Qcur = ggml_rope_ext(
  7344. ctx0, Qcur, inp_pos, nullptr,
  7345. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  7346. ext_factor, attn_factor, beta_fast, beta_slow
  7347. );
  7348. Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
  7349. cb(Kcur, "Kcur_normed", il);
  7350. Kcur = ggml_rope_ext(
  7351. ctx0, Kcur, inp_pos, nullptr,
  7352. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  7353. ext_factor, attn_factor, beta_fast, beta_slow
  7354. );
  7355. cb(Qcur, "Qcur", il);
  7356. cb(Kcur, "Kcur", il);
  7357. cb(Vcur, "Vcur", il);
  7358. cur = build_attn(inp_attn,
  7359. model.layers[il].wo, model.layers[il].bo,
  7360. Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  7361. }
  7362. if (il == n_layer - 1 && inp_out_ids) {
  7363. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  7364. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  7365. }
  7366. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  7367. cb(ffn_inp, "ffn_inp", il);
  7368. // MoE branch
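// unlike qwen2moe there is no shared expert; the `true` flag appears to request renormalization of the selected experts' routing weights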
  7369. cur = build_norm(ffn_inp,
  7370. model.layers[il].ffn_norm, NULL,
  7371. LLM_NORM_RMS, il);
  7372. cb(cur, "ffn_norm", il);
  7373. ggml_tensor * moe_out =
  7374. build_moe_ffn(cur,
  7375. model.layers[il].ffn_gate_inp,
  7376. model.layers[il].ffn_up_exps,
  7377. model.layers[il].ffn_gate_exps,
  7378. model.layers[il].ffn_down_exps,
  7379. nullptr,
  7380. n_expert, n_expert_used,
  7381. LLM_FFN_SILU, true,
  7382. false, 0.0,
  7383. LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
  7384. il);
  7385. cb(moe_out, "ffn_moe_out", il);
  7386. cur = moe_out;
  7387. cur = ggml_add(ctx0, cur, ffn_inp);
  7388. cur = build_cvec(cur, il);
  7389. cb(cur, "l_out", il);
  7390. // input for next layer
  7391. inpL = cur;
  7392. }
  7393. cur = inpL;
  7394. cur = build_norm(cur,
  7395. model.output_norm, NULL,
  7396. LLM_NORM_RMS, -1);
  7397. cb(cur, "result_norm", -1);
  7398. res->t_embd = cur;
  7399. // lm_head
  7400. cur = build_lora_mm(model.output, cur);
  7401. cb(cur, "result_output", -1);
  7402. res->t_logits = cur;
  7403. ggml_build_forward_expand(gf, cur);
  7404. }
  7405. };
  7406. struct llm_build_phi2 : public llm_graph_context {
  7407. llm_build_phi2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  7408. const int64_t n_embd_head = hparams.n_embd_head_v;
  7409. const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
  7410. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  7411. ggml_tensor * cur;
  7412. ggml_tensor * attn_norm_output;
  7413. ggml_tensor * ffn_output;
  7414. ggml_tensor * inpL;
  7415. inpL = build_inp_embd(model.tok_embd);
  7416. // inp_pos - contains the positions
  7417. ggml_tensor * inp_pos = build_inp_pos();
  7418. auto * inp_attn = build_attn_inp_kv();
  7419. ggml_tensor * inp_out_ids = build_inp_out_ids();
  7420. for (int il = 0; il < n_layer; ++il) {
  7421. attn_norm_output = build_norm(inpL,
  7422. model.layers[il].attn_norm,
  7423. model.layers[il].attn_norm_b,
  7424. LLM_NORM, il);
  7425. cb(attn_norm_output, "attn_norm", il);
  7426. // self-attention
  7427. {
  7428. ggml_tensor * Qcur = nullptr;
  7429. ggml_tensor * Kcur = nullptr;
  7430. ggml_tensor * Vcur = nullptr;
  7431. if (model.layers[il].wqkv) {
  7432. cur = build_lora_mm(model.layers[il].wqkv, attn_norm_output);
  7433. cb(cur, "wqkv", il);
  7434. cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
  7435. cb(cur, "bqkv", il);
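// slice the fused QKV activation into Q, K and V views at float offsets 0, n_embd and n_embd + n_embd_gqa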
  7436. Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
  7437. Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
  7438. Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
  7439. } else {
  7440. Qcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wq, attn_norm_output), model.layers[il].bq);
  7441. Kcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wk, attn_norm_output), model.layers[il].bk);
  7442. Vcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wv, attn_norm_output), model.layers[il].bv);
  7443. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  7444. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  7445. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  7446. }
  7447. Qcur = ggml_rope_ext(
  7448. ctx0, Qcur, inp_pos, nullptr,
  7449. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  7450. ext_factor, attn_factor, beta_fast, beta_slow
  7451. );
  7452. Kcur = ggml_rope_ext(
  7453. ctx0, Kcur, inp_pos, nullptr,
  7454. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  7455. ext_factor, attn_factor, beta_fast, beta_slow
  7456. );
  7457. cb(Qcur, "Qcur", il);
  7458. cb(Kcur, "Kcur", il);
  7459. cb(Vcur, "Vcur", il);
  7460. // with phi2, we scale the Q to avoid precision issues
  7461. // ref: https://github.com/ml-explore/mlx-examples/blob/08e862336ade809bc37d1035f94b359e7d1a5152/phi2/phi2.py#L64-L66
  7462. Qcur = ggml_scale(ctx0, Qcur, 1.0f/sqrtf(float(n_embd_head)));
  7463. cur = build_attn(inp_attn,
  7464. model.layers[il].wo, model.layers[il].bo,
  7465. Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f, il);
  7466. }
  7467. if (il == n_layer - 1 && inp_out_ids) {
  7468. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  7469. inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
  7470. attn_norm_output = ggml_get_rows(ctx0, attn_norm_output, inp_out_ids);
  7471. }
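// phi2 uses a parallel block: the FFN below reads attn_norm_output rather than the attention output, and both branches are summed with the layer input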
  7472. // FF
  7473. {
  7474. ffn_output = build_ffn(attn_norm_output,
  7475. model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
  7476. NULL, NULL, NULL,
  7477. model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
  7478. NULL,
  7479. LLM_FFN_GELU, LLM_FFN_SEQ, il);
  7480. cb(ffn_output, "ffn_out", il);
  7481. }
  7482. cur = ggml_add(ctx0, cur, ffn_output);
  7483. cur = ggml_add(ctx0, cur, inpL);
  7484. cur = build_cvec(cur, il);
  7485. cb(cur, "l_out", il);
  7486. // input for next layer
  7487. inpL = cur;
  7488. }
  7489. cur = build_norm(inpL,
  7490. model.output_norm,
  7491. model.output_norm_b,
  7492. LLM_NORM, -1);
  7493. cb(cur, "result_norm", -1);
  7494. res->t_embd = cur;
  7495. cur = build_lora_mm(model.output, cur);
  7496. cb(cur, "result_output_no_bias", -1);
  7497. cur = ggml_add(ctx0, cur, model.output_b);
  7498. cb(cur, "result_output", -1);
  7499. res->t_logits = cur;
  7500. ggml_build_forward_expand(gf, cur);
  7501. }
  7502. };
  7503. template<bool iswa>
  7504. struct llm_build_phi3 : public llm_graph_context {
  7505. llm_build_phi3(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  7506. const int64_t n_embd_head = hparams.n_embd_head_v;
  7507. const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
  7508. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  7509. ggml_tensor * cur;
  7510. ggml_tensor * inpL;
  7511. inpL = build_inp_embd(model.tok_embd);
  7512. // inp_pos - contains the positions
  7513. ggml_tensor * inp_pos = build_inp_pos();
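// the iswa template parameter selects the sliding-window KV attention input (build_attn_inp_kv_iswa) instead of the regular KV attention input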
  7514. using inp_attn_type = std::conditional_t<iswa, llm_graph_input_attn_kv_iswa, llm_graph_input_attn_kv>;
  7515. inp_attn_type * inp_attn = nullptr;
  7516. if constexpr (iswa) {
  7517. inp_attn = build_attn_inp_kv_iswa();
  7518. } else {
  7519. inp_attn = build_attn_inp_kv();
  7520. }
  7521. ggml_tensor * inp_out_ids = build_inp_out_ids();
  7522. for (int il = 0; il < n_layer; ++il) {
  7523. auto * residual = inpL;
  7524. // self-attention
  7525. {
  7526. // rope freq factors for 128k context
  7527. ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
  7528. ggml_tensor* attn_norm_output = build_norm(inpL,
  7529. model.layers[il].attn_norm,
  7530. model.layers[il].attn_norm_b,
  7531. LLM_NORM_RMS, il);
  7532. cb(attn_norm_output, "attn_norm", il);
  7533. ggml_tensor * Qcur = nullptr;
  7534. ggml_tensor * Kcur = nullptr;
  7535. ggml_tensor * Vcur = nullptr;
  7536. if (model.layers[il].wqkv) {
  7537. cur = build_lora_mm(model.layers[il].wqkv, attn_norm_output);
  7538. cb(cur, "wqkv", il);
  7539. Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head * sizeof(float), cur->nb[1], 0 * sizeof(float) * (n_embd));
  7540. Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head * sizeof(float), cur->nb[1], 1 * sizeof(float) * (n_embd));
  7541. Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head * sizeof(float), cur->nb[1], 1 * sizeof(float) * (n_embd + n_embd_gqa));
  7542. } else {
  7543. Qcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wq, attn_norm_output), model.layers[il].bq);
  7544. Kcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wk, attn_norm_output), model.layers[il].bk);
  7545. Vcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wv, attn_norm_output), model.layers[il].bv);
  7546. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  7547. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  7548. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  7549. }
  7550. Qcur = ggml_rope_ext(
  7551. ctx0, Qcur, inp_pos, rope_factors,
  7552. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  7553. ext_factor, attn_factor, beta_fast, beta_slow
  7554. );
  7555. Kcur = ggml_rope_ext(
  7556. ctx0, Kcur, inp_pos, rope_factors,
  7557. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  7558. ext_factor, attn_factor, beta_fast, beta_slow
  7559. );
  7560. cb(Qcur, "Qcur", il);
  7561. cb(Kcur, "Kcur", il);
  7562. cb(Vcur, "Vcur", il);
  7563. Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head)));
  7564. cb(Qcur, "Qcur", il);
  7565. cur = build_attn(inp_attn,
  7566. model.layers[il].wo, model.layers[il].bo,
  7567. Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f, il);
  7568. }
  7569. if (il == n_layer - 1 && inp_out_ids) {
  7570. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  7571. residual = ggml_get_rows(ctx0, residual, inp_out_ids);
  7572. }
  7573. cur = ggml_add(ctx0, cur, residual);
  7574. residual = cur;
  7575. cur = build_norm(cur,
  7576. model.layers[il].ffn_norm, model.layers[il].ffn_norm_b,
  7577. LLM_NORM_RMS, il);
  7578. cb(cur, "ffn_norm", il);
  7579. // feed-forward network
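// dense branch: ffn_up is expected to hold the fused gate+up projection (there is no separate ffn_gate), which LLM_FFN_SWIGLU splits internally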
  7580. if (model.layers[il].ffn_gate_inp == nullptr) {
  7581. cur = build_ffn(cur,
  7582. model.layers[il].ffn_up, NULL, NULL,
  7583. NULL, NULL, NULL,
  7584. model.layers[il].ffn_down, NULL, NULL,
  7585. NULL,
  7586. LLM_FFN_SWIGLU, LLM_FFN_SEQ, il);
  7587. cb(cur, "ffn_out", il);
  7588. } else {
  7589. // MoE branch
  7590. cur = build_moe_ffn(cur,
  7591. model.layers[il].ffn_gate_inp,
  7592. model.layers[il].ffn_up_exps,
  7593. model.layers[il].ffn_gate_exps,
  7594. model.layers[il].ffn_down_exps,
  7595. nullptr,
  7596. n_expert, n_expert_used,
  7597. LLM_FFN_SILU, true,
  7598. false, 0.0,
  7599. LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
  7600. il);
  7601. cb(cur, "ffn_moe_out", il);
  7602. }
  7603. cur = ggml_add(ctx0, residual, cur);
  7604. cur = build_cvec(cur, il);
  7605. cb(cur, "l_out", il);
  7606. // input for next layer
  7607. inpL = cur;
  7608. }
  7609. cur = build_norm(inpL,
  7610. model.output_norm,
  7611. model.output_norm_b,
  7612. LLM_NORM_RMS, -1);
  7613. cb(cur, "result_norm", -1);
  7614. res->t_embd = cur;
  7615. cur = build_lora_mm(model.output, cur);
  7616. if (model.output_b != nullptr) {
  7617. cb(cur, "result_output_no_bias", -1);
  7618. cur = ggml_add(ctx0, cur, model.output_b);
  7619. }
  7620. cb(cur, "result_output", -1);
  7621. res->t_logits = cur;
  7622. ggml_build_forward_expand(gf, cur);
  7623. }
  7624. };
  7625. struct llm_build_plamo : public llm_graph_context {
  7626. llm_build_plamo(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  7627. const int64_t n_embd_head = hparams.n_embd_head_v;
  7628. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  7629. GGML_ASSERT(n_embd_head == hparams.n_rot);
  7630. ggml_tensor * cur;
  7631. ggml_tensor * inpL;
  7632. inpL = build_inp_embd(model.tok_embd);
  7633. // inp_pos - contains the positions
  7634. ggml_tensor * inp_pos = build_inp_pos();
  7635. auto * inp_attn = build_attn_inp_kv();
  7636. ggml_tensor * inp_out_ids = build_inp_out_ids();
  7637. for (int il = 0; il < n_layer; ++il) {
  7638. // norm
  7639. cur = build_norm(inpL,
  7640. model.layers[il].attn_norm, NULL,
  7641. LLM_NORM_RMS, il);
  7642. cb(cur, "attn_norm", il);
  7643. ggml_tensor * sa_inp = cur;
  7644. // self-attention
  7645. {
  7646. // compute Q and K and RoPE them
  7647. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  7648. cb(Qcur, "Qcur", il);
  7649. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  7650. cb(Kcur, "Kcur", il);
  7651. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  7652. cb(Vcur, "Vcur", il);
  7653. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  7654. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  7655. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  7656. Qcur = ggml_rope_ext(
  7657. ctx0, Qcur, inp_pos, nullptr,
  7658. n_embd_head, rope_type, n_ctx_orig, freq_base, freq_scale,
  7659. ext_factor, attn_factor, beta_fast, beta_slow
  7660. );
  7661. Kcur = ggml_rope_ext(
  7662. ctx0, Kcur, inp_pos, nullptr,
  7663. n_embd_head, rope_type, n_ctx_orig, freq_base, freq_scale,
  7664. ext_factor, attn_factor, beta_fast, beta_slow
  7665. );
  7666. cb(Qcur, "Qcur", il);
  7667. cb(Kcur, "Kcur", il);
  7668. cb(Vcur, "Vcur", il);
  7669. cur = build_attn(inp_attn,
  7670. model.layers[il].wo, NULL,
  7671. Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  7672. }
  7673. if (il == n_layer - 1 && inp_out_ids) {
  7674. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  7675. sa_inp = ggml_get_rows(ctx0, sa_inp, inp_out_ids);
  7676. inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
  7677. }
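// PLaMo computes attention and the FFN in parallel from the same normed input: sa_inp feeds the FFN while sa_out keeps the attention output, and both are added to the residual inpL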
  7678. ggml_tensor * sa_out = cur;
  7679. cur = sa_inp;
  7680. // feed-forward network
  7681. {
  7682. cur = build_ffn(cur,
  7683. model.layers[il].ffn_up, NULL, NULL,
  7684. model.layers[il].ffn_gate, NULL, NULL,
  7685. model.layers[il].ffn_down, NULL, NULL,
  7686. NULL,
  7687. LLM_FFN_SILU, LLM_FFN_PAR, il);
  7688. cb(cur, "ffn_out", il);
  7689. }
  7690. cur = ggml_add(ctx0, cur, sa_out);
  7691. cur = ggml_add(ctx0, cur, inpL);
  7692. cur = build_cvec(cur, il);
  7693. cb(cur, "l_out", il);
  7694. // input for next layer
  7695. inpL = cur;
  7696. }
  7697. cur = inpL;
  7698. cur = build_norm(cur,
  7699. model.output_norm, NULL,
  7700. LLM_NORM_RMS, -1);
  7701. cb(cur, "result_norm", -1);
  7702. res->t_embd = cur;
  7703. // lm_head
  7704. cur = build_lora_mm(model.output, cur);
  7705. cb(cur, "result_output", -1);
  7706. res->t_logits = cur;
  7707. ggml_build_forward_expand(gf, cur);
  7708. }
  7709. };
  7710. struct llm_build_gpt2 : public llm_graph_context {
  7711. llm_build_gpt2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  7712. const int64_t n_embd_head = hparams.n_embd_head_v;
  7713. const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
  7714. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  7715. ggml_tensor * cur;
  7716. ggml_tensor * pos;
  7717. ggml_tensor * inpL;
  7718. inpL = build_inp_embd(model.tok_embd);
  7719. // inp_pos - contains the positions
  7720. ggml_tensor * inp_pos = build_inp_pos();
  7721. auto * inp_attn = build_attn_inp_kv();
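// GPT-2 uses learned absolute position embeddings, added to the token embeddings before the first layer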
  7722. pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos);
  7723. cb(pos, "pos_embd", -1);
  7724. inpL = ggml_add(ctx0, inpL, pos);
  7725. cb(inpL, "inpL", -1);
  7726. ggml_tensor * inp_out_ids = build_inp_out_ids();
  7727. for (int il = 0; il < n_layer; ++il) {
  7728. cur = build_norm(inpL,
  7729. model.layers[il].attn_norm,
  7730. model.layers[il].attn_norm_b,
  7731. LLM_NORM, il);
  7732. cb(cur, "attn_norm", il);
  7733. // self-attention
  7734. {
  7735. cur = build_lora_mm(model.layers[il].wqkv, cur);
  7736. cb(cur, "wqkv", il);
  7737. cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
  7738. cb(cur, "bqkv", il);
  7739. ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
  7740. ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
  7741. ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
  7742. cb(Qcur, "Qcur", il);
  7743. cb(Kcur, "Kcur", il);
  7744. cb(Vcur, "Vcur", il);
  7745. cur = build_attn(inp_attn,
  7746. model.layers[il].wo, model.layers[il].bo,
  7747. Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  7748. }
  7749. if (il == n_layer - 1 && inp_out_ids) {
  7750. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  7751. inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
  7752. }
  7753. // add the input
  7754. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
  7755. cb(ffn_inp, "ffn_inp", il);
  7756. // FF
  7757. {
  7758. cur = build_norm(ffn_inp,
  7759. model.layers[il].ffn_norm,
  7760. model.layers[il].ffn_norm_b,
  7761. LLM_NORM, il);
  7762. cb(cur, "ffn_norm", il);
  7763. cur = build_ffn(cur,
  7764. model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
  7765. NULL, NULL, NULL,
  7766. model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
  7767. NULL,
  7768. LLM_FFN_GELU, LLM_FFN_SEQ, il);
  7769. cb(cur, "ffn_out", il);
  7770. }
  7771. cur = ggml_add(ctx0, cur, ffn_inp);
  7772. cur = build_cvec(cur, il);
  7773. cb(cur, "l_out", il);
  7774. // input for next layer
  7775. inpL = cur;
  7776. }
  7777. cur = build_norm(inpL,
  7778. model.output_norm,
  7779. model.output_norm_b,
  7780. LLM_NORM, -1);
  7781. cb(cur, "result_norm", -1);
  7782. res->t_embd = cur;
  7783. cur = build_lora_mm(model.output, cur);
  7784. cb(cur, "result_output", -1);
  7785. res->t_logits = cur;
  7786. ggml_build_forward_expand(gf, cur);
  7787. }
  7788. };
  7789. struct llm_build_codeshell : public llm_graph_context {
  7790. llm_build_codeshell(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  7791. const int64_t n_embd_head = hparams.n_embd_head_v;
  7792. const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
  7793. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  7794. GGML_ASSERT(n_embd_head == hparams.n_rot);
  7795. ggml_tensor * cur;
  7796. ggml_tensor * inpL;
  7797. inpL = build_inp_embd(model.tok_embd);
  7798. // inp_pos - contains the positions
  7799. ggml_tensor * inp_pos = build_inp_pos();
  7800. auto * inp_attn = build_attn_inp_kv();
  7801. ggml_tensor * inp_out_ids = build_inp_out_ids();
  7802. for (int il = 0; il < n_layer; ++il) {
  7803. cur = build_norm(inpL,
  7804. model.layers[il].attn_norm,
  7805. model.layers[il].attn_norm_b,
  7806. LLM_NORM, il);
  7807. cb(cur, "attn_norm", il);
  7808. // self-attention
  7809. {
  7810. cur = build_lora_mm(model.layers[il].wqkv, cur);
  7811. cb(cur, "wqkv", il);
  7812. cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
  7813. cb(cur, "bqkv", il);
  7814. ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
  7815. ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
  7816. ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
  7817. Qcur = ggml_rope_ext(
  7818. ctx0, Qcur, inp_pos, nullptr,
  7819. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  7820. ext_factor, attn_factor, beta_fast, beta_slow
  7821. );
  7822. Kcur = ggml_rope_ext(
  7823. ctx0, Kcur, inp_pos, nullptr,
  7824. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  7825. ext_factor, attn_factor, beta_fast, beta_slow
  7826. );
  7827. cb(Qcur, "Qcur", il);
  7828. cb(Kcur, "Kcur", il);
  7829. cb(Vcur, "Vcur", il);
  7830. cur = build_attn(inp_attn,
  7831. model.layers[il].wo, model.layers[il].bo,
  7832. Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  7833. }
  7834. if (il == n_layer - 1 && inp_out_ids) {
  7835. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  7836. inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
  7837. }
  7838. // add the input
  7839. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
  7840. cb(ffn_inp, "ffn_inp", il);
  7841. // FF
  7842. {
  7843. cur = build_norm(ffn_inp,
  7844. model.layers[il].ffn_norm,
  7845. model.layers[il].ffn_norm_b,
  7846. LLM_NORM, il);
  7847. cb(cur, "ffn_norm", il);
  7848. cur = build_ffn(cur,
  7849. model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
  7850. NULL, NULL, NULL,
  7851. model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
  7852. NULL,
  7853. LLM_FFN_GELU, LLM_FFN_SEQ, il);
  7854. cb(cur, "ffn_out", il);
  7855. }
  7856. cur = ggml_add(ctx0, cur, ffn_inp);
  7857. cur = build_cvec(cur, il);
  7858. cb(cur, "l_out", il);
  7859. // input for next layer
  7860. inpL = cur;
  7861. }
  7862. cur = build_norm(inpL,
  7863. model.output_norm,
  7864. model.output_norm_b,
  7865. LLM_NORM, -1);
  7866. cb(cur, "result_norm", -1);
  7867. res->t_embd = cur;
  7868. cur = build_lora_mm(model.output, cur);
  7869. cb(cur, "result_output", -1);
  7870. res->t_logits = cur;
  7871. ggml_build_forward_expand(gf, cur);
  7872. }
  7873. };
  7874. struct llm_build_orion : public llm_graph_context {
  7875. llm_build_orion(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  7876. const int64_t n_embd_head = hparams.n_embd_head_v;
  7877. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  7878. GGML_ASSERT(n_embd_head == hparams.n_rot);
  7879. ggml_tensor * cur;
  7880. ggml_tensor * inpL;
  7881. inpL = build_inp_embd(model.tok_embd);
  7882. // inp_pos - contains the positions
  7883. ggml_tensor * inp_pos = build_inp_pos();
  7884. auto * inp_attn = build_attn_inp_kv();
  7885. ggml_tensor * inp_out_ids = build_inp_out_ids();
  7886. for (int il = 0; il < n_layer; ++il) {
  7887. ggml_tensor * inpSA = inpL;
  7888. // norm
  7889. cur = build_norm(inpL,
  7890. model.layers[il].attn_norm, model.layers[il].attn_norm_b,
  7891. LLM_NORM, il);
  7892. cb(cur, "attn_norm", il);
  7893. // self-attention
  7894. {
  7895. // compute Q and K and RoPE them
  7896. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  7897. cb(Qcur, "Qcur", il);
  7898. // if (model.layers[il].bq) {
  7899. // Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
  7900. // cb(Qcur, "Qcur", il);
  7901. // }
  7902. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  7903. cb(Kcur, "Kcur", il);
  7904. // if (model.layers[il].bk) {
  7905. // Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
  7906. // cb(Kcur, "Kcur", il);
  7907. // }
  7908. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  7909. cb(Vcur, "Vcur", il);
  7910. // if (model.layers[il].bv) {
  7911. // Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
  7912. // cb(Vcur, "Vcur", il);
  7913. // }
  7914. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  7915. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  7916. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  7917. Qcur = ggml_rope_ext(
  7918. ctx0, Qcur, inp_pos, nullptr,
  7919. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  7920. ext_factor, attn_factor, beta_fast, beta_slow
  7921. );
  7922. Kcur = ggml_rope_ext(
  7923. ctx0, Kcur, inp_pos, nullptr,
  7924. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  7925. ext_factor, attn_factor, beta_fast, beta_slow
  7926. );
  7927. cb(Qcur, "Qcur", il);
  7928. cb(Kcur, "Kcur", il);
  7929. cb(Vcur, "Vcur", il);
  7930. cur = build_attn(inp_attn,
  7931. model.layers[il].wo, NULL,
  7932. Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  7933. }
  7934. if (il == n_layer - 1 && inp_out_ids) {
  7935. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  7936. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  7937. }
  7938. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  7939. cb(ffn_inp, "ffn_inp", il);
  7940. // feed-forward network
  7941. cur = build_norm(ffn_inp,
  7942. model.layers[il].ffn_norm, model.layers[il].ffn_norm_b,
  7943. LLM_NORM, il);
  7944. cb(cur, "ffn_norm", il);
  7945. cur = build_ffn(cur,
  7946. model.layers[il].ffn_up, NULL, NULL,
  7947. model.layers[il].ffn_gate, NULL, NULL,
  7948. model.layers[il].ffn_down, NULL, NULL,
  7949. NULL,
  7950. LLM_FFN_SILU, LLM_FFN_PAR, il);
  7951. cb(cur, "ffn_out", il);
  7952. cur = ggml_add(ctx0, cur, ffn_inp);
  7953. cur = build_cvec(cur, il);
  7954. cb(cur, "l_out", il);
  7955. // input for next layer
  7956. inpL = cur;
  7957. }
  7958. cur = inpL;
  7959. cur = build_norm(cur,
  7960. model.output_norm, model.output_norm_b,
  7961. LLM_NORM, -1);
  7962. cb(cur, "result_norm", -1);
  7963. res->t_embd = cur;
  7964. // lm_head
  7965. cur = build_lora_mm(model.output, cur);
  7966. cb(cur, "result_output", -1);
  7967. res->t_logits = cur;
  7968. ggml_build_forward_expand(gf, cur);
  7969. }
  7970. };
  7971. struct llm_build_internlm2 : public llm_graph_context {
  7972. llm_build_internlm2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  7973. const int64_t n_embd_head = hparams.n_embd_head_v;
  7974. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  7975. GGML_ASSERT(n_embd_head == hparams.n_rot);
  7976. ggml_tensor * cur;
  7977. ggml_tensor * inpL;
  7978. inpL = build_inp_embd(model.tok_embd);
  7979. // inp_pos - contains the positions
  7980. ggml_tensor * inp_pos = build_inp_pos();
  7981. auto * inp_attn = build_attn_inp_kv();
  7982. ggml_tensor * inp_out_ids = build_inp_out_ids();
  7983. for (int il = 0; il < n_layer; ++il) {
  7984. ggml_tensor * inpSA = inpL;
  7985. // norm
  7986. cur = build_norm(inpL,
  7987. model.layers[il].attn_norm, NULL,
  7988. LLM_NORM_RMS, il);
  7989. cb(cur, "attn_norm", il);
  7990. // self-attention
  7991. {
  7992. // compute Q and K and RoPE them
  7993. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  7994. cb(Qcur, "Qcur", il);
  7995. if (model.layers[il].bq) {
  7996. Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
  7997. cb(Qcur, "Qcur", il);
  7998. }
  7999. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  8000. cb(Kcur, "Kcur", il);
  8001. if (model.layers[il].bk) {
  8002. Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
  8003. cb(Kcur, "Kcur", il);
  8004. }
  8005. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  8006. cb(Vcur, "Vcur", il);
  8007. if (model.layers[il].bv) {
  8008. Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
  8009. cb(Vcur, "Vcur", il);
  8010. }
  8011. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  8012. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  8013. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  8014. Qcur = ggml_rope_ext(
  8015. ctx0, Qcur, inp_pos, nullptr,
  8016. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  8017. ext_factor, attn_factor, beta_fast, beta_slow
  8018. );
  8019. Kcur = ggml_rope_ext(
  8020. ctx0, Kcur, inp_pos, nullptr,
  8021. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  8022. ext_factor, attn_factor, beta_fast, beta_slow
  8023. );
  8024. cb(Qcur, "Qcur", il);
  8025. cb(Kcur, "Kcur", il);
  8026. cb(Vcur, "Vcur", il);
  8027. cur = build_attn(inp_attn,
  8028. model.layers[il].wo, model.layers[il].bo,
  8029. Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  8030. }
  8031. if (il == n_layer - 1 && inp_out_ids) {
  8032. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  8033. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  8034. }
  8035. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  8036. cb(ffn_inp, "ffn_inp", il);
  8037. // feed-forward network
  8038. cur = build_norm(ffn_inp,
  8039. model.layers[il].ffn_norm, NULL,
  8040. LLM_NORM_RMS, il);
  8041. cb(cur, "ffn_norm", il);
  8042. cur = build_ffn(cur,
  8043. model.layers[il].ffn_up, NULL, NULL,
  8044. model.layers[il].ffn_gate, NULL, NULL,
  8045. model.layers[il].ffn_down, NULL, NULL,
  8046. NULL,
  8047. LLM_FFN_SILU, LLM_FFN_PAR, il);
  8048. cb(cur, "ffn_out", il);
  8049. cur = ggml_add(ctx0, cur, ffn_inp);
  8050. cur = build_cvec(cur, il);
  8051. cb(cur, "l_out", il);
  8052. // input for next layer
  8053. inpL = cur;
  8054. }
  8055. cur = inpL;
  8056. cur = build_norm(cur,
  8057. model.output_norm, NULL,
  8058. LLM_NORM_RMS, -1);
  8059. cb(cur, "result_norm", -1);
  8060. res->t_embd = cur;
  8061. // lm_head
  8062. cur = build_lora_mm(model.output, cur);
  8063. cb(cur, "result_output", -1);
  8064. res->t_logits = cur;
  8065. ggml_build_forward_expand(gf, cur);
  8066. }
  8067. };
  8068. struct llm_build_minicpm3 : public llm_graph_context {
  8069. llm_build_minicpm3(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  8070. //TODO: if the model varies, these parameters need to be read from the model
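// scale_embd scales the input embeddings, scale_depth scales the residual branches (see scale_res below), and n_embd_base sets the lm_head scaling (see scale_lmhead below)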
  8071. const int64_t n_embd_base = 256;
  8072. const float scale_embd = 12.0f;
  8073. const float scale_depth = 1.4f;
  8074. const float kq_scale = 1.0f / sqrtf(float(hparams.n_embd_head_k));
  8075. const uint32_t n_embd_head_qk_rope = hparams.n_rot;
  8076. const uint32_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
  8077. const uint32_t kv_lora_rank = hparams.n_lora_kv;
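// MLA-style low-rank attention: Q and KV pass through low-rank bottlenecks (wq_a/wq_b, wkv_a_mqa/wkv_b), and each head dimension is split into a RoPE part (n_rot) and a non-RoPE "nope" part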
  8078. ggml_tensor * cur;
  8079. ggml_tensor * inpL;
  8080. inpL = build_inp_embd(model.tok_embd);
  8081. // scale the input embeddings
  8082. inpL = ggml_scale(ctx0, inpL, scale_embd);
  8083. cb(inpL, "inp_scaled", -1);
  8084. // inp_pos - contains the positions
  8085. ggml_tensor * inp_pos = build_inp_pos();
  8086. auto * inp_attn = build_attn_inp_kv();
  8087. ggml_tensor * inp_out_ids = build_inp_out_ids();
  8088. for (int il = 0; il < n_layer; ++il) {
  8089. ggml_tensor * inpSA = inpL;
  8090. ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
  8091. // norm
  8092. cur = build_norm(inpL,
  8093. model.layers[il].attn_norm, NULL,
  8094. LLM_NORM_RMS, il);
  8095. cb(cur, "attn_norm", il);
// self-attention
  8097. {
  8098. ggml_tensor * q = NULL;
  8099. // {n_embd, q_lora_rank} * {n_embd, n_tokens} -> {q_lora_rank, n_tokens}
  8100. q = ggml_mul_mat(ctx0, model.layers[il].wq_a, cur);
  8101. cb(q, "q", il);
  8102. q = build_norm(q,
  8103. model.layers[il].attn_q_a_norm, NULL,
  8104. LLM_NORM_RMS, il);
  8105. cb(q, "q", il);
  8106. // {q_lora_rank, n_head * hparams.n_embd_head_k} * {q_lora_rank, n_tokens} -> {n_head * hparams.n_embd_head_k, n_tokens}
  8107. q = ggml_mul_mat(ctx0, model.layers[il].wq_b, q);
  8108. cb(q, "q", il);
  8109. // split into {n_head * n_embd_head_qk_nope, n_tokens}
  8110. ggml_tensor * q_nope = ggml_view_3d(ctx0, q, n_embd_head_qk_nope, n_head, n_tokens,
  8111. ggml_row_size(q->type, hparams.n_embd_head_k),
  8112. ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
  8113. 0);
  8114. cb(q_nope, "q_nope", il);
  8115. // and {n_head * n_embd_head_qk_rope, n_tokens}
  8116. ggml_tensor * q_pe = ggml_view_3d(ctx0, q, n_embd_head_qk_rope, n_head, n_tokens,
  8117. ggml_row_size(q->type, hparams.n_embd_head_k),
  8118. ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
  8119. ggml_row_size(q->type, n_embd_head_qk_nope));
  8120. cb(q_pe, "q_pe", il);
// {n_embd, kv_lora_rank + n_embd_head_qk_rope} * {n_embd, n_tokens} -> {kv_lora_rank + n_embd_head_qk_rope, n_tokens}
ggml_tensor * kv_pe_compressed = ggml_mul_mat(ctx0, model.layers[il].wkv_a_mqa, cur);
cb(kv_pe_compressed, "kv_pe_compressed", il);
// split into {kv_lora_rank, n_tokens}
ggml_tensor * kv_compressed = ggml_view_2d(ctx0, kv_pe_compressed, kv_lora_rank, n_tokens,
kv_pe_compressed->nb[1],
0);
cb(kv_compressed, "kv_compressed", il);
// and {n_embd_head_qk_rope, n_tokens}
ggml_tensor * k_pe = ggml_view_3d(ctx0, kv_pe_compressed, n_embd_head_qk_rope, 1, n_tokens,
kv_pe_compressed->nb[1],
kv_pe_compressed->nb[1],
ggml_row_size(kv_pe_compressed->type, kv_lora_rank));
cb(k_pe, "k_pe", il);
  8135. kv_compressed = build_norm(kv_compressed,
  8136. model.layers[il].attn_kv_a_norm, NULL,
  8137. LLM_NORM_RMS, il);
  8138. cb(kv_compressed, "kv_compressed", il);
  8139. // {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)} * {kv_lora_rank, n_tokens} -> {n_head * (n_embd_head_qk_nope + n_embd_head_v), n_tokens}
  8140. ggml_tensor * kv = ggml_mul_mat(ctx0, model.layers[il].wkv_b, kv_compressed);
  8141. cb(kv, "kv", il);
  8142. // split into {n_head * n_embd_head_qk_nope, n_tokens}
  8143. ggml_tensor * k_nope = ggml_view_3d(ctx0, kv, n_embd_head_qk_nope, n_head, n_tokens,
  8144. ggml_row_size(kv->type, n_embd_head_qk_nope + hparams.n_embd_head_v),
  8145. ggml_row_size(kv->type, n_head * (n_embd_head_qk_nope + hparams.n_embd_head_v)),
  8146. 0);
  8147. cb(k_nope, "k_nope", il);
  8148. // and {n_head * n_embd_head_v, n_tokens}
  8149. ggml_tensor * v_states = ggml_view_3d(ctx0, kv, hparams.n_embd_head_v, n_head, n_tokens,
  8150. ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)),
  8151. ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)*n_head),
  8152. ggml_row_size(kv->type, (n_embd_head_qk_nope)));
  8153. cb(v_states, "v_states", il);
  8154. v_states = ggml_cont(ctx0, v_states);
  8155. cb(v_states, "v_states", il);
  8156. q_pe = ggml_rope_ext(
  8157. ctx0, q_pe, inp_pos, rope_factors,
  8158. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  8159. ext_factor, attn_factor, beta_fast, beta_slow
  8160. );
  8161. cb(q_pe, "q_pe", il);
  8162. // shared RoPE key
  8163. k_pe = ggml_rope_ext(
  8164. ctx0, k_pe, inp_pos, rope_factors,
  8165. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  8166. ext_factor, attn_factor, beta_fast, beta_slow
  8167. );
  8168. cb(k_pe, "k_pe", il);
  8169. ggml_tensor * q_states = ggml_concat(ctx0, q_nope, q_pe, 0);
  8170. cb(q_states, "q_states", il);
  8171. ggml_tensor * k_states = ggml_concat(ctx0, k_nope, ggml_repeat(ctx0, k_pe, q_pe), 0);
  8172. cb(k_states, "k_states", il);
  8173. cur = build_attn(inp_attn,
  8174. model.layers[il].wo, NULL,
  8175. q_states, k_states, v_states, nullptr, nullptr, nullptr, kq_scale, il);
  8176. }
  8177. if (il == n_layer - 1 && inp_out_ids) {
  8178. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  8179. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  8180. }
  8181. // scale_res - scale the hidden states for residual connection
  8182. const float scale_res = scale_depth/sqrtf(float(n_layer)); // TODO: is this correct?
  8183. cur = ggml_scale(ctx0, cur, scale_res);
  8184. cb(cur, "hidden_scaled", il);
  8185. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  8186. cb(ffn_inp, "ffn_inp", il);
  8187. // feed-forward network
  8188. {
  8189. cur = build_norm(ffn_inp,
  8190. model.layers[il].ffn_norm, NULL,
  8191. LLM_NORM_RMS, il);
  8192. cb(cur, "ffn_norm", il);
  8193. cur = build_ffn(cur,
  8194. model.layers[il].ffn_up, NULL, NULL,
  8195. model.layers[il].ffn_gate, NULL, NULL,
  8196. model.layers[il].ffn_down, NULL, NULL,
  8197. NULL,
  8198. LLM_FFN_SILU, LLM_FFN_PAR, il);
  8199. cb(cur, "ffn_out", il);
  8200. }
  8201. // scale the hidden states for residual connection
  8202. cur = ggml_scale(ctx0, cur, scale_res);
  8203. cb(cur, "hidden_scaled_ffn", il);
  8204. cur = ggml_add(ctx0, cur, ffn_inp);
  8205. cur = build_cvec(cur, il);
  8206. cb(cur, "l_out", il);
  8207. // input for next layer
  8208. inpL = cur;
  8209. }
  8210. cur = inpL;
  8211. cur = build_norm(cur,
  8212. model.output_norm, NULL,
  8213. LLM_NORM_RMS, -1);
  8214. cb(cur, "result_norm", -1);
  8215. res->t_embd = cur;
  8216. // lm_head scaling
  8217. const float scale_lmhead = float(n_embd_base)/float(n_embd);
  8218. cur = ggml_scale(ctx0, cur, scale_lmhead);
  8219. cb(cur, "lmhead_scaling", -1);
  8220. // lm_head
  8221. cur = build_lora_mm(model.output, cur);
  8222. cb(cur, "result_output", -1);
  8223. res->t_logits = cur;
  8224. ggml_build_forward_expand(gf, cur);
  8225. }
  8226. };
  8227. struct llm_build_gemma : public llm_graph_context {
  8228. llm_build_gemma(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  8229. const int64_t n_embd_head = hparams.n_embd_head_v;
  8230. ggml_tensor * cur;
  8231. ggml_tensor * inpL;
  8232. inpL = build_inp_embd(model.tok_embd);
  8233. inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd));
  8234. cb(inpL, "inp_scaled", -1);
  8235. // inp_pos - contains the positions
  8236. ggml_tensor * inp_pos = build_inp_pos();
  8237. auto * inp_attn = build_attn_inp_kv();
  8238. ggml_tensor * inp_out_ids = build_inp_out_ids();
  8239. for (int il = 0; il < n_layer; ++il) {
  8240. // norm
  8241. cur = build_norm(inpL,
  8242. model.layers[il].attn_norm, NULL,
  8243. LLM_NORM_RMS, il);
  8244. cb(cur, "attn_norm", il);
  8245. // self-attention
  8246. {
  8247. // compute Q and K and RoPE them
  8248. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  8249. cb(Qcur, "Qcur", il);
  8250. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  8251. cb(Kcur, "Kcur", il);
  8252. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  8253. cb(Vcur, "Vcur", il);
  8254. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  8255. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  8256. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  8257. Qcur = ggml_rope_ext(
  8258. ctx0, Qcur, inp_pos, nullptr,
  8259. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  8260. ext_factor, attn_factor, beta_fast, beta_slow);
  8261. Kcur = ggml_rope_ext(
  8262. ctx0, Kcur, inp_pos, nullptr,
  8263. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  8264. ext_factor, attn_factor, beta_fast, beta_slow);
  8265. cb(Qcur, "Qcur", il);
  8266. cb(Kcur, "Kcur", il);
  8267. cb(Vcur, "Vcur", il);
  8268. Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head)));
  8269. cb(Qcur, "Qcur_scaled", il);
  8270. cur = build_attn(inp_attn,
  8271. model.layers[il].wo, NULL,
  8272. Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f, il);
  8273. }
  8274. if (il == n_layer - 1 && inp_out_ids) {
  8275. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  8276. inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
  8277. }
  8278. ggml_tensor * sa_out = ggml_add(ctx0, cur, inpL);
  8279. cb(sa_out, "sa_out", il);
  8280. cur = build_norm(sa_out,
  8281. model.layers[il].ffn_norm, NULL,
  8282. LLM_NORM_RMS, il);
  8283. cb(cur, "ffn_norm", il);
  8284. // feed-forward network
  8285. {
  8286. cur = build_ffn(cur,
  8287. model.layers[il].ffn_up, NULL, NULL,
  8288. model.layers[il].ffn_gate, NULL, NULL,
  8289. model.layers[il].ffn_down, NULL, NULL,
  8290. NULL,
  8291. LLM_FFN_GELU, LLM_FFN_PAR, il);
  8292. cb(cur, "ffn_out", il);
  8293. }
  8294. cur = ggml_add(ctx0, cur, sa_out);
  8295. cur = build_cvec(cur, il);
  8296. cb(cur, "l_out", il);
  8297. // input for next layer
  8298. inpL = cur;
  8299. }
  8300. cur = inpL;
  8301. cur = build_norm(cur,
  8302. model.output_norm, NULL,
  8303. LLM_NORM_RMS, -1);
  8304. cb(cur, "result_norm", -1);
  8305. res->t_embd = cur;
  8306. // lm_head
  8307. cur = build_lora_mm(model.output, cur);
  8308. cb(cur, "result_output", -1);
  8309. res->t_logits = cur;
  8310. ggml_build_forward_expand(gf, cur);
  8311. }
  8312. };
  8313. struct llm_build_gemma2_iswa : public llm_graph_context {
  8314. llm_build_gemma2_iswa(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  8315. const int64_t n_embd_head = hparams.n_embd_head_k;
  8316. ggml_tensor * cur;
  8317. ggml_tensor * inpL;
  8318. inpL = build_inp_embd(model.tok_embd);
  8319. inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd));
  8320. cb(inpL, "inp_scaled", -1);
  8321. // inp_pos - contains the positions
  8322. ggml_tensor * inp_pos = build_inp_pos();
  8323. auto * inp_attn = build_attn_inp_kv_iswa();
  8324. ggml_tensor * inp_out_ids = build_inp_out_ids();
  8325. for (int il = 0; il < n_layer; ++il) {
  8326. // norm
  8327. cur = build_norm(inpL,
  8328. model.layers[il].attn_norm, NULL,
  8329. LLM_NORM_RMS, il);
  8330. cb(cur, "attn_norm", il);
  8331. // self-attention
  8332. {
  8333. // compute Q and K and RoPE them
  8334. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  8335. cb(Qcur, "Qcur", il);
  8336. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  8337. cb(Kcur, "Kcur", il);
  8338. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  8339. cb(Vcur, "Vcur", il);
  8340. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  8341. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  8342. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  8343. Qcur = ggml_rope_ext(
  8344. ctx0, Qcur, inp_pos, nullptr,
  8345. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  8346. ext_factor, attn_factor, beta_fast, beta_slow);
  8347. Kcur = ggml_rope_ext(
  8348. ctx0, Kcur, inp_pos, nullptr,
  8349. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  8350. ext_factor, attn_factor, beta_fast, beta_slow);
  8351. cb(Qcur, "Qcur", il);
  8352. cb(Kcur, "Kcur", il);
  8353. cb(Vcur, "Vcur", il);
  8354. Qcur = ggml_scale(ctx0, Qcur, hparams.f_attention_scale);
  8355. cur = build_attn(inp_attn,
  8356. model.layers[il].wo, NULL,
  8357. Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f, il);
  8358. }
  8359. if (il == n_layer - 1 && inp_out_ids) {
  8360. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  8361. inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
  8362. }
  8363. cur = build_norm(cur,
  8364. model.layers[il].attn_post_norm, NULL,
  8365. LLM_NORM_RMS, il);
  8366. cb(cur, "attn_post_norm", il);
  8367. ggml_tensor * sa_out = ggml_add(ctx0, cur, inpL);
  8368. cb(sa_out, "sa_out", il);
  8369. cur = build_norm(sa_out,
  8370. model.layers[il].ffn_norm, NULL,
  8371. LLM_NORM_RMS, il);
  8372. cb(cur, "ffn_norm", il);
  8373. // feed-forward network
  8374. {
  8375. cur = build_ffn(cur,
  8376. model.layers[il].ffn_up, NULL, NULL,
  8377. model.layers[il].ffn_gate, NULL, NULL,
  8378. model.layers[il].ffn_down, NULL, NULL,
  8379. NULL,
  8380. LLM_FFN_GELU, LLM_FFN_PAR, il);
  8381. cb(cur, "ffn_out", il);
  8382. }
  8383. cur = build_norm(cur,
  8384. model.layers[il].ffn_post_norm, NULL,
  8385. LLM_NORM_RMS, -1);
  8386. cb(cur, "ffn_post_norm", -1);
  8387. cur = ggml_add(ctx0, cur, sa_out);
  8388. cur = build_cvec(cur, il);
  8389. cb(cur, "l_out", il);
  8390. // input for next layer
  8391. inpL = cur;
  8392. }
  8393. cur = inpL;
  8394. cur = build_norm(cur,
  8395. model.output_norm, NULL,
  8396. LLM_NORM_RMS, -1);
  8397. cb(cur, "result_norm", -1);
  8398. res->t_embd = cur;
  8399. // lm_head
  8400. cur = build_lora_mm(model.output, cur);
  8401. // final logit soft-capping
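// i.e. logits = softcap * tanh(logits / softcap),
// keeping every final logit within (-softcap, +softcap)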
  8402. cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_final_logit_softcapping);
  8403. cur = ggml_tanh(ctx0, cur);
  8404. cur = ggml_scale(ctx0, cur, hparams.f_final_logit_softcapping);
  8405. cb(cur, "result_output", -1);
  8406. res->t_logits = cur;
  8407. ggml_build_forward_expand(gf, cur);
  8408. }
  8409. };
  8410. struct llm_build_gemma3_iswa : public llm_graph_context {
  8411. llm_build_gemma3_iswa(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  8412. const int64_t n_embd_head = hparams.n_embd_head_k;
  8413. ggml_tensor * cur;
  8414. ggml_tensor * inpL;
  8415. inpL = build_inp_embd(model.tok_embd);
8416. // important: do not scale raw (non-token) embedding inputs (i.e. encoded image embeddings)
  8417. if (ubatch.token) {
  8418. inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd));
  8419. cb(inpL, "inp_scaled", -1);
  8420. }
  8421. // inp_pos - contains the positions
  8422. ggml_tensor * inp_pos = build_inp_pos();
  8423. // TODO: is causal == true correct? might need some changes
  8424. auto * inp_attn = build_attn_inp_kv_iswa();
  8425. ggml_tensor * inp_out_ids = build_inp_out_ids();
  8426. for (int il = 0; il < n_layer; ++il) {
  8427. const float freq_base_l = model.get_rope_freq_base (cparams, il);
  8428. const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
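// note: sliding-window (local) and full-attention (global) layers may use
//       different RoPE frequency base/scale values, hence the per-layer lookup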
  8429. // norm
  8430. cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
  8431. cb(cur, "attn_norm", il);
  8432. // self-attention
  8433. {
  8434. // compute Q and K and RoPE them
  8435. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  8436. cb(Qcur, "Qcur", il);
  8437. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  8438. cb(Kcur, "Kcur", il);
  8439. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  8440. cb(Vcur, "Vcur", il);
  8441. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  8442. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  8443. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  8444. Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
  8445. cb(Qcur, "Qcur_normed", il);
  8446. Qcur = ggml_rope_ext(
  8447. ctx0, Qcur, inp_pos, nullptr,
  8448. n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
  8449. ext_factor, attn_factor, beta_fast, beta_slow);
  8450. Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
  8451. cb(Kcur, "Kcur_normed", il);
  8452. Kcur = ggml_rope_ext(
  8453. ctx0, Kcur, inp_pos, nullptr,
  8454. n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
  8455. ext_factor, attn_factor, beta_fast, beta_slow);
  8456. cb(Qcur, "Qcur", il);
  8457. cb(Kcur, "Kcur", il);
  8458. cb(Vcur, "Vcur", il);
  8459. // ref: https://github.com/google/gemma_pytorch/blob/014acb7ac4563a5f77c76d7ff98f31b568c16508/gemma/model.py#L315
  8460. Qcur = ggml_scale(ctx0, Qcur, hparams.f_attention_scale);
  8461. cur = build_attn(inp_attn,
  8462. model.layers[il].wo, NULL,
  8463. Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f, il);
  8464. }
  8465. if (il == n_layer - 1 && inp_out_ids) {
  8466. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  8467. inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
  8468. }
  8469. cur = build_norm(cur,
  8470. model.layers[il].attn_post_norm, NULL,
  8471. LLM_NORM_RMS, il);
  8472. cb(cur, "attn_post_norm", il);
  8473. ggml_tensor * sa_out = ggml_add(ctx0, cur, inpL);
  8474. cb(sa_out, "sa_out", il);
  8475. cur = build_norm(sa_out,
  8476. model.layers[il].ffn_norm, NULL,
  8477. LLM_NORM_RMS, il);
  8478. cb(cur, "ffn_norm", il);
  8479. // feed-forward network
  8480. {
  8481. cur = build_ffn(cur,
  8482. model.layers[il].ffn_up, NULL, NULL,
  8483. model.layers[il].ffn_gate, NULL, NULL,
  8484. model.layers[il].ffn_down, NULL, NULL,
  8485. NULL,
  8486. LLM_FFN_GELU, LLM_FFN_PAR, il);
  8487. cb(cur, "ffn_out", il);
  8488. }
  8489. cur = build_norm(cur,
  8490. model.layers[il].ffn_post_norm, NULL,
  8491. LLM_NORM_RMS, -1);
  8492. cb(cur, "ffn_post_norm", -1);
  8493. cur = ggml_add(ctx0, cur, sa_out);
  8494. cur = build_cvec(cur, il);
  8495. cb(cur, "l_out", il);
  8496. // input for next layer
  8497. inpL = cur;
  8498. }
  8499. cur = inpL;
  8500. cur = build_norm(cur,
  8501. model.output_norm, NULL,
  8502. LLM_NORM_RMS, -1);
  8503. cb(cur, "result_norm", -1);
  8504. res->t_embd = cur;
  8505. // lm_head
  8506. cur = build_lora_mm(model.output, cur);
  8507. cb(cur, "result_output", -1);
  8508. res->t_logits = cur;
  8509. ggml_build_forward_expand(gf, cur);
  8510. }
  8511. };
  8512. struct llm_build_gemma3n_iswa : public llm_graph_context {
  8513. const llama_model & model;
  8514. const int64_t n_embd_head;
  8515. const int64_t n_embd_altup;
  8516. const int64_t n_altup;
  8517. const int i_altup_act;
  8518. const int n_layer_sparsity = 10; // number of layers using activation sparsity
  8519. const float f_sparsity_std_mul = 1.6448533535003662f; // std_multiplier = normal_dist.icdf(0.95)
  8520. llm_build_gemma3n_iswa(const llama_model & model, const llm_graph_params & params)
  8521. : llm_graph_context(params),
  8522. model(model),
  8523. n_embd_head(model.hparams.n_embd_head_k),
  8524. n_embd_altup(model.hparams.n_embd_altup),
  8525. n_altup(model.hparams.n_altup),
  8526. i_altup_act(model.hparams.i_altup_act) {
  8527. ggml_tensor * cur;
  8528. ggml_tensor * inpL;
  8529. inpL = build_inp_embd(model.tok_embd);
8530. // important: do not scale raw (non-token) embedding inputs (i.e. encoded image embeddings)
  8531. if (ubatch.token) {
  8532. inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd));
  8533. cb(inpL, "inp_scaled", -1);
  8534. }
  8535. // inp_pos - contains the positions
  8536. ggml_tensor * inp_pos = build_inp_pos();
  8537. // TODO: is causal == true correct? might need some changes
  8538. auto * inp_attn = build_attn_inp_kv_iswa();
  8539. // inp_per_layer shape: [n_embd_altup, n_tokens, n_layer]
  8540. ggml_tensor * inp_per_layer = project_per_layer_inputs(inpL, get_per_layer_inputs());
8541. // inpL currently holds only 1 altup stream; project it to the remaining altups
8542. // these "added" altup streams are concatenated along the last dim of inpL
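// the projected streams are rescaled so each keeps the same per-token L2
// magnitude as the original input stream (target_magnitude / new_magnitude)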
  8543. {
  8544. ggml_tensor * target_magnitude = calc_magnitude(inpL);
  8545. ggml_tensor * inp_repeated = ggml_repeat_4d(ctx0, inpL, n_embd, n_tokens, n_altup - 1, 1);
  8546. ggml_tensor * altup_added = ggml_mul_mat(ctx0, model.altup_proj, inp_repeated); // shape: [n_embd, n_tokens, n_altup - 1]
  8547. ggml_tensor * new_magnitude = calc_magnitude(altup_added);
  8548. altup_added = ggml_div(ctx0,
  8549. ggml_mul(ctx0, altup_added, target_magnitude),
  8550. new_magnitude);
  8551. inpL = ggml_concat(ctx0, inpL, altup_added, 2); // shape: [n_embd, n_tokens, n_altup]
  8552. cb(inpL, "inp_stacked", -1);
  8553. }
  8554. // inpL now has shape: [n_embd, n_tokens, n_altup]
  8555. // inp_per_layer now has shape: [n_embd_altup, n_tokens, n_layer]
  8556. for (int il = 0; il < n_layer; ++il) {
8557. // this block is written to closely resemble Gemma3p5DecoderLayer in the Python code
  8558. const float freq_base_l = model.get_rope_freq_base (cparams, il);
  8559. const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
  8560. ggml_tensor * cur = inpL; // [n_embd, n_tokens, n_altup]
  8561. ggml_tensor * predictions = altup_predict(cur, il); // [n_embd, n_tokens, n_altup]
  8562. // predicted value will go through self-attention and laurel
  8563. ggml_tensor * active_prediction = view_2d_slice(predictions, i_altup_act); // [n_embd, n_tokens]
  8564. cur = active_prediction;
  8565. cb(cur, "active_prediction", il);
  8566. // norm
  8567. cur = build_norm(cur, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
  8568. cb(cur, "attn_norm", il);
  8569. // laurel
  8570. ggml_tensor * laurel_out = laurel(cur, il); // [n_embd, n_tokens]
  8571. // self-attention
  8572. if (hparams.has_kv(il)) {
  8573. // compute Q and K and RoPE them
  8574. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  8575. cb(Qcur, "Qcur", il);
  8576. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  8577. cb(Kcur, "Kcur", il);
  8578. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  8579. cb(Vcur, "Vcur", il);
  8580. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  8581. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  8582. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  8583. Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
  8584. Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
  8585. Vcur = ggml_rms_norm(ctx0, Vcur, hparams.f_norm_rms_eps);
  8586. cb(Qcur, "Qcur_normed", il);
  8587. cb(Kcur, "Kcur_normed", il);
  8588. cb(Vcur, "Vcur_normed", il);
  8589. Qcur = ggml_rope_ext(
  8590. ctx0, Qcur, inp_pos, nullptr,
  8591. n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
  8592. ext_factor, attn_factor, beta_fast, beta_slow);
  8593. Kcur = ggml_rope_ext(
  8594. ctx0, Kcur, inp_pos, nullptr,
  8595. n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
  8596. ext_factor, attn_factor, beta_fast, beta_slow);
  8597. cb(Qcur, "Qcur_pos", il);
  8598. cb(Kcur, "Kcur_pos", il);
  8599. cur = build_attn(inp_attn,
  8600. model.layers[il].wo, NULL,
  8601. Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, hparams.f_attention_scale, il);
  8602. } else {
  8603. // reuse KV cache of earlier layers
  8604. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  8605. cb(Qcur, "Qcur", il);
  8606. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  8607. Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
  8608. cb(Qcur, "Qcur_normed", il);
  8609. Qcur = ggml_rope_ext(
  8610. ctx0, Qcur, inp_pos, nullptr,
  8611. n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
  8612. ext_factor, attn_factor, beta_fast, beta_slow);
  8613. cb(Qcur, "Qcur_pos", il);
  8614. cur = build_attn(inp_attn,
  8615. model.layers[il].wo, NULL,
  8616. Qcur, nullptr, nullptr, nullptr, nullptr, nullptr, hparams.f_attention_scale, il);
  8617. }
  8618. cur = build_norm(cur,
  8619. model.layers[il].attn_post_norm, NULL,
  8620. LLM_NORM_RMS, il);
  8621. cb(cur, "attn_post_norm", il);
  8622. cur = ggml_add(ctx0, cur, active_prediction); // [n_embd, n_tokens]
  8623. cb(cur, "attn_gated", il);
  8624. ggml_tensor * attn_laurel = ggml_scale(ctx0,
  8625. ggml_add(ctx0, cur, laurel_out),
  8626. 1.0f / sqrtf(2.0f)); // [n_embd, n_tokens]
  8627. cb(attn_laurel, "attn_laurel", il);
  8628. cur = build_norm(attn_laurel,
  8629. model.layers[il].ffn_norm, NULL,
  8630. LLM_NORM_RMS, il);
  8631. cb(cur, "ffn_norm", il);
  8632. // feed-forward network
  8633. {
  8634. ggml_tensor * up_proj = build_lora_mm(model.layers[il].ffn_up, cur);
  8635. ggml_tensor * gate_proj = build_lora_mm(model.layers[il].ffn_gate, cur);
  8636. if (il < n_layer_sparsity) {
  8637. // apply activation sparsity
  8638. gate_proj = gaussian_topk(gate_proj);
  8639. }
  8640. gate_proj = ggml_gelu(ctx0, gate_proj);
  8641. cur = ggml_mul(ctx0, up_proj, gate_proj);
  8642. cur = build_lora_mm(model.layers[il].ffn_down, cur);
  8643. cb(cur, "ffn_out", il);
  8644. }
  8645. cur = build_norm(cur,
  8646. model.layers[il].ffn_post_norm, NULL,
  8647. LLM_NORM_RMS, -1);
  8648. cb(cur, "ffn_post_norm", il);
  8649. ggml_tensor * attn_ffw_laurel_gated = ggml_add(ctx0, cur, attn_laurel); // [n_embd, n_tokens]
  8650. cb(attn_ffw_laurel_gated, "attn_ffw_laurel_gated", il);
  8651. ggml_tensor * corrected = altup_correct(predictions, attn_ffw_laurel_gated, il); // [n_embd, n_tokens, n_altup]
  8652. ggml_tensor * first_prediction; // [n_embd, n_tokens]
  8653. {
  8654. first_prediction = view_2d_slice(corrected, i_altup_act); // [n_embd, n_tokens]
  8655. first_prediction = ggml_mul(ctx0, first_prediction, model.layers[il].altup_correct_scale);
  8656. first_prediction = build_lora_mm(model.layers[il].per_layer_inp_gate, first_prediction);
  8657. first_prediction = ggml_gelu(ctx0, first_prediction); // [n_embd_altup, n_tokens]
  8658. cb(first_prediction, "first_prediction_gated", il);
  8659. ggml_tensor * inp_this_layer = view_2d_slice(inp_per_layer, il); // [n_embd_altup, n_tokens]
  8660. first_prediction = ggml_mul(ctx0, first_prediction, inp_this_layer); // [n_embd_altup, n_tokens]
  8661. cb(first_prediction, "first_prediction_scaled", il);
  8662. first_prediction = build_lora_mm(model.layers[il].per_layer_proj, first_prediction); // [n_embd, n_tokens]
  8663. first_prediction = build_norm(first_prediction,
  8664. model.layers[il].per_layer_post_norm, NULL,
  8665. LLM_NORM_RMS, il);
  8666. cb(first_prediction, "first_prediction_out", il);
  8667. }
  8668. // equivalent to python code: corrected_predictions[1:] += first_prediction
  8669. {
  8670. ggml_tensor * slice_first = view_2d_slice(corrected, 0);
  8671. ggml_tensor * slice_rest = ggml_view_3d(ctx0, corrected, n_embd, n_tokens, n_altup - 1,
  8672. ggml_row_size(corrected->type, n_embd),
  8673. ggml_row_size(corrected->type, n_embd*n_tokens),
  8674. n_embd*n_tokens*ggml_element_size(corrected));
  8675. ggml_tensor * tmp = ggml_add(ctx0, slice_rest, first_prediction); // [n_embd, n_tokens, n_altup - 1]
  8676. corrected = ggml_concat(ctx0, slice_first, tmp, 2); // [n_embd, n_tokens, n_altup]
  8677. }
  8678. cur = corrected; // [n_embd, n_tokens, n_altup]
  8679. cur = build_cvec(cur, il);
  8680. cb(cur, "l_out", il);
  8681. // input for next layer
  8682. inpL = cur;
  8683. }
  8684. cur = inpL; // [n_embd, n_tokens, n_altup]
8685. // cur now holds multiple altup streams; merge them back into a single one
  8686. {
  8687. ggml_tensor * target_magnitude = calc_magnitude(view_2d_slice(cur, i_altup_act)); // [n_embd, n_tokens]
  8688. // do a view to skip the first slice (active altup)
  8689. ggml_tensor * alt_slice = ggml_view_3d(ctx0, cur, n_embd, n_tokens, n_altup - 1,
  8690. ggml_row_size(cur->type, n_embd),
  8691. ggml_row_size(cur->type, n_embd*n_tokens),
  8692. n_embd*n_tokens*ggml_element_size(cur));
  8693. ggml_tensor * altup_unembd = ggml_mul_mat(ctx0, model.altup_unembd_proj, alt_slice); // shape: [n_embd, n_tokens, n_altup - 1]
  8694. ggml_tensor * new_magnitude = calc_magnitude(altup_unembd);
  8695. altup_unembd = ggml_div(ctx0,
  8696. ggml_mul(ctx0, altup_unembd, target_magnitude),
  8697. new_magnitude);
  8698. cb(altup_unembd, "altup_unembd", -1);
  8699. // equivalent to torch.mean(hidden_states, dim=0)
  8700. cur = view_2d_slice(cur, 0); // [n_embd, n_tokens]
  8701. for (int i = 0; i < n_altup - 1; ++i) {
  8702. cur = ggml_add(ctx0, cur, view_2d_slice(altup_unembd, i));
  8703. }
  8704. cur = ggml_scale(ctx0, cur, 1.0f / float(n_altup)); // [n_embd, n_tokens]
  8705. cb(cur, "unembd_merged", -1);
  8706. }
  8707. // cur now has shape: [n_embd, n_tokens]
  8708. // TODO: move this to right after the last KV layer
  8709. {
  8710. // skip computing output for unused tokens
  8711. ggml_tensor * inp_out_ids = build_inp_out_ids();
  8712. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  8713. }
  8714. cur = build_norm(cur,
  8715. model.output_norm, NULL,
  8716. LLM_NORM_RMS, -1);
  8717. cb(cur, "result_norm", -1);
  8718. res->t_embd = cur;
  8719. cur = build_lora_mm(model.output, cur);
  8720. {
  8721. // final logit soft-capping
  8722. cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_final_logit_softcapping);
  8723. cur = ggml_tanh(ctx0, cur);
  8724. cur = ggml_scale(ctx0, cur, hparams.f_final_logit_softcapping);
  8725. }
  8726. cb(cur, "result_output", -1);
  8727. res->t_logits = cur;
  8728. ggml_build_forward_expand(gf, cur);
  8729. }
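// L2 magnitude along the first dim: sqrt(sum_i x_i^2); the result has ne[0] == 1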
  8730. ggml_tensor * calc_magnitude(ggml_tensor * x) {
  8731. return ggml_sqrt(ctx0, ggml_sum_rows(ctx0, ggml_sqr(ctx0, x)));
  8732. }
8733. // get a 2D slice view from a 3D tensor; idx indexes the 3rd dim
  8734. ggml_tensor * view_2d_slice(ggml_tensor * x, int idx) {
  8735. GGML_ASSERT(idx < (int)x->ne[2]);
  8736. return ggml_view_2d(ctx0, x, x->ne[0], x->ne[1],
  8737. ggml_row_size(x->type, x->ne[0]),
  8738. idx * x->ne[0] * x->ne[1] * ggml_element_size(x));
  8739. }
  8740. // equivalent to get_per_layer_inputs() in python code
  8741. // output shape: [n_embd_altup, n_layer, n_tokens]
  8742. ggml_tensor * get_per_layer_inputs() {
  8743. auto inp = std::make_unique<llm_graph_input_embd>();
  8744. ggml_tensor * inp_per_layer;
  8745. if (ubatch.token) {
  8746. inp->tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ubatch.n_tokens);
  8747. ggml_set_input(inp->tokens);
  8748. res->t_tokens = inp->tokens;
  8749. inp_per_layer = ggml_get_rows(ctx0, model.tok_embd_per_layer, inp->tokens);
  8750. inp_per_layer = ggml_reshape_3d(ctx0, inp_per_layer, n_embd_altup, n_layer, n_tokens);
  8751. inp_per_layer = ggml_scale(ctx0, inp_per_layer, sqrtf((float)n_embd_altup));
  8752. cb(inp_per_layer, "inp_per_layer_selected", -1);
  8753. } else {
  8754. GGML_ABORT("TODO: support embd input");
  8755. }
  8756. res->add_input(std::move(inp));
  8757. return inp_per_layer;
  8758. }
  8759. // equivalent to project_per_layer_inputs() in python code
  8760. // this calculates the per-layer inputs, so the final tensor shape will have n_layer as the last dim
  8761. // output shape: [n_embd_altup, n_tokens, n_layer]
  8762. ggml_tensor * project_per_layer_inputs(ggml_tensor * inputs_embeds, ggml_tensor * inp_per_layer) {
  8763. const float per_layer_projection_scale = 1.0f / sqrtf((float)n_embd);
  8764. const float per_layer_input_scale = 1.0f / sqrtf(2.0f);
  8765. ggml_tensor * per_layer_proj = ggml_mul_mat(ctx0, model.per_layer_model_proj, inputs_embeds);
  8766. per_layer_proj = ggml_scale(ctx0, per_layer_proj, per_layer_projection_scale);
  8767. per_layer_proj = ggml_reshape_3d(ctx0, per_layer_proj, n_embd_altup, n_layer, n_tokens);
  8768. per_layer_proj = build_norm(per_layer_proj,
  8769. model.per_layer_proj_norm, NULL,
  8770. LLM_NORM_RMS, -1); // [n_embd_altup, n_layer, n_tokens]
  8771. cb(per_layer_proj, "per_layer_proj", -1);
  8772. inp_per_layer = ggml_add(ctx0, inp_per_layer, per_layer_proj);
  8773. inp_per_layer = ggml_scale(ctx0, inp_per_layer, per_layer_input_scale);
  8774. cb(inp_per_layer, "inp_per_layer", -1);
  8775. // permute to shape: [n_embd_altup, n_tokens, n_layer]
  8776. inp_per_layer = ggml_cont(ctx0, ggml_permute(ctx0, inp_per_layer, 0, 2, 1, 3));
  8777. return inp_per_layer;
  8778. }
8779. // input cur shape: [n_embd, n_tokens]
8780. // output shape: [n_embd, n_tokens]
  8781. ggml_tensor * laurel(ggml_tensor * cur, int il) {
  8782. ggml_tensor * tmp = cur;
  8783. tmp = build_lora_mm(model.layers[il].laurel_l, tmp);
  8784. tmp = build_lora_mm(model.layers[il].laurel_r, tmp);
  8785. tmp = build_norm(tmp, model.layers[il].laurel_post_norm, NULL, LLM_NORM_RMS, il);
  8786. tmp = ggml_add(ctx0, tmp, cur);
  8787. cb(tmp, "laurel_out", il);
  8788. return tmp;
  8789. }
  8790. // input x shape: [n_embd, n_tokens]
  8791. // output shape: [n_embd, n_tokens]
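// rough sketch of the math (treating each row of x as approximately normal):
//   cutoff = mean(x) + f_sparsity_std_mul * std(x)   // ~95th percentile
//   y      = relu(x - cutoff)                        // keep only the top ~5% of activations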
  8792. ggml_tensor * gaussian_topk(ggml_tensor * x) {
  8793. ggml_tensor * mean = ggml_mean(ctx0, x);
  8794. ggml_tensor * std = ggml_sqrt(ctx0, ggml_scale(ctx0,
  8795. ggml_sum_rows(ctx0, ggml_sqr(ctx0, ggml_sub(ctx0, x, mean))),
  8796. 1.0f / (float)(x->ne[0] - 1)
  8797. ));
  8798. ggml_tensor * cutoff_x = ggml_add(ctx0, mean, ggml_scale(ctx0, std, f_sparsity_std_mul));
  8799. return ggml_relu(ctx0, ggml_sub(ctx0, x, cutoff_x));
  8800. }
  8801. //
  8802. // altup functions
  8803. //
  8804. // equivalent to compute_router_modalities() in python code
  8805. // input x shape: [n_embd, n_tokens]
  8806. // output shape: [n_altup, n_tokens]
  8807. ggml_tensor * altup_compute_router_modalities(ggml_tensor * x, int il) {
  8808. ggml_tensor * router_inputs = build_norm(x,
  8809. model.layers[il].altup_router_norm, NULL,
  8810. LLM_NORM_RMS, il);
  8811. // router_input_scale
  8812. router_inputs = ggml_scale(ctx0, router_inputs, 1.0f / (float)n_embd);
  8813. ggml_tensor * output = ggml_mul_mat(ctx0, model.layers[il].altup_router, router_inputs);
  8814. return ggml_tanh(ctx0, output); // [n_altup, n_tokens]
  8815. }
  8816. // input cur shape: [n_embd, n_tokens, n_altup]
  8817. // output shape: [n_embd, n_tokens, n_altup]
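// sketch: a small router maps the active stream to an n_altup x n_altup mixing
// matrix per token; each predicted stream is a linear mix of all current
// streams, with the original streams added back as a residual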
  8818. ggml_tensor * altup_predict(ggml_tensor * cur, int il) {
  8819. ggml_tensor * activated = view_2d_slice(cur, i_altup_act); // [n_embd, n_tokens]
  8820. ggml_tensor * modalities = altup_compute_router_modalities(activated, il); // [n_altup, n_tokens]
  8821. cb(modalities, "modalities", il);
  8822. ggml_tensor * all_coefs = build_lora_mm(model.layers[il].altup_predict_coef, modalities);
  8823. cb(all_coefs, "all_coefs", il);
8824. // the first dim now has n_altup^2 elements; reshape it to 2D (so the tensor becomes 3D)
  8825. all_coefs = ggml_reshape_3d(ctx0, all_coefs, n_altup, n_altup, n_tokens);
  8826. // permute to [n_altup, n_embd, n_tokens]
  8827. ggml_tensor * cur_permuted = ggml_cont(ctx0, ggml_permute(ctx0, cur, 1, 2, 0, 3));
  8828. ggml_tensor * predictions = ggml_mul_mat(ctx0, cur_permuted, all_coefs); // [n_altup, n_embd, n_tokens]
  8829. // final shape must be the same as cur: [n_embd, n_tokens, n_altup]
  8830. predictions = ggml_cont(ctx0, ggml_permute(ctx0, predictions, 0, 2, 1, 3));
  8831. predictions = ggml_add(ctx0, predictions, cur);
  8832. cb(predictions, "predictions", il);
  8833. return predictions;
  8834. }
  8835. // input predictions shape: [n_embd, n_tokens, n_altup]
  8836. // input activated shape: [n_embd, n_tokens]
  8837. // output shape: [n_embd, n_tokens, n_altup]
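// sketch: innovation = activated - active_prediction; each stream i is then
// corrected as predictions[i] + (coef[i] + 1) * innovation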
  8838. ggml_tensor * altup_correct(ggml_tensor * predictions, ggml_tensor * activated, int il) {
  8839. ggml_tensor * modalities = altup_compute_router_modalities(activated, il); // [n_altup, n_tokens]
  8840. cb(modalities, "modalities", il);
  8841. ggml_tensor * active_prediction = view_2d_slice(predictions, i_altup_act);
  8842. ggml_tensor * innovation = ggml_sub(ctx0, activated, active_prediction); // [n_embd, n_tokens]
  8843. cb(innovation, "innovation", il);
  8844. ggml_tensor * all_coefs = build_lora_mm(model.layers[il].altup_correct_coef, modalities); // [n_altup, n_tokens]
  8845. all_coefs = ggml_scale_bias(ctx0, all_coefs, 1.0f, 1.0f); // + 1.0
  8846. cb(all_coefs, "all_coefs", il);
  8847. all_coefs = ggml_transpose(ctx0, all_coefs); // [n_tokens, n_altup]
  8848. all_coefs = ggml_cont_3d(ctx0, all_coefs, 1, n_tokens, n_altup); // [1, n_tokens, n_altup]
  8849. innovation = ggml_repeat_4d(ctx0, innovation, n_embd, n_tokens, n_altup, 1);
  8850. ggml_tensor * corrected = ggml_mul(ctx0, innovation, all_coefs); // [n_embd, n_tokens, n_altup]
  8851. corrected = ggml_add(ctx0, corrected, predictions); // [n_embd, n_tokens, n_altup]
  8852. cb(corrected, "corrected", il);
  8853. return corrected;
  8854. }
  8855. };
  8856. struct llm_build_gemma_embedding_iswa : public llm_graph_context {
  8857. llm_build_gemma_embedding_iswa(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  8858. const int64_t n_embd_head = hparams.n_embd_head_k;
  8859. ggml_tensor * cur;
  8860. ggml_tensor * inpL;
  8861. inpL = build_inp_embd(model.tok_embd);
8862. // important: do not scale raw (non-token) embedding inputs (i.e. encoded image embeddings)
  8863. if (ubatch.token) {
  8864. inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd));
  8865. cb(inpL, "inp_scaled", -1);
  8866. }
  8867. // inp_pos - contains the positions
  8868. ggml_tensor * inp_pos = build_inp_pos();
  8869. // TODO: support cacheless iSWA embeddings [TAG_NO_CACHE_ISWA]
  8870. auto * inp_attn = build_attn_inp_kv_iswa();
  8871. ggml_tensor * inp_out_ids = build_inp_out_ids();
  8872. for (int il = 0; il < n_layer; ++il) {
  8873. const float freq_base_l = model.get_rope_freq_base (cparams, il);
  8874. const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
  8875. // norm
  8876. cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
  8877. cb(cur, "attn_norm", il);
  8878. // self-attention
  8879. {
  8880. // compute Q and K and RoPE them
  8881. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  8882. cb(Qcur, "Qcur", il);
  8883. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  8884. cb(Kcur, "Kcur", il);
  8885. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  8886. cb(Vcur, "Vcur", il);
  8887. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  8888. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  8889. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  8890. Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
  8891. cb(Qcur, "Qcur_normed", il);
  8892. Qcur = ggml_rope_ext(
  8893. ctx0, Qcur, inp_pos, nullptr,
  8894. n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
  8895. ext_factor, attn_factor, beta_fast, beta_slow);
  8896. Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
  8897. cb(Kcur, "Kcur_normed", il);
  8898. Kcur = ggml_rope_ext(
  8899. ctx0, Kcur, inp_pos, nullptr,
  8900. n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
  8901. ext_factor, attn_factor, beta_fast, beta_slow);
  8902. cb(Qcur, "Qcur", il);
  8903. cb(Kcur, "Kcur", il);
  8904. cb(Vcur, "Vcur", il);
  8905. // ref: https://github.com/google/gemma_pytorch/blob/014acb7ac4563a5f77c76d7ff98f31b568c16508/gemma/model.py#L315
  8906. Qcur = ggml_scale(ctx0, Qcur, hparams.f_attention_scale);
  8907. cur = build_attn(inp_attn,
  8908. model.layers[il].wo, NULL,
  8909. Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f, il);
  8910. }
  8911. if (il == n_layer - 1 && inp_out_ids) {
  8912. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  8913. inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
  8914. }
  8915. cur = build_norm(cur,
  8916. model.layers[il].attn_post_norm, NULL,
  8917. LLM_NORM_RMS, il);
  8918. cb(cur, "attn_post_norm", il);
  8919. ggml_tensor * sa_out = ggml_add(ctx0, cur, inpL);
  8920. cb(sa_out, "sa_out", il);
  8921. cur = build_norm(sa_out,
  8922. model.layers[il].ffn_norm, NULL,
  8923. LLM_NORM_RMS, il);
  8924. cb(cur, "ffn_norm", il);
  8925. // feed-forward network
  8926. {
  8927. cur = build_ffn(cur,
  8928. model.layers[il].ffn_up, NULL, NULL,
  8929. model.layers[il].ffn_gate, NULL, NULL,
  8930. model.layers[il].ffn_down, NULL, NULL,
  8931. NULL,
  8932. LLM_FFN_GELU, LLM_FFN_PAR, il);
  8933. cb(cur, "ffn_out", il);
  8934. }
  8935. cur = build_norm(cur,
  8936. model.layers[il].ffn_post_norm, NULL,
  8937. LLM_NORM_RMS, -1);
  8938. cb(cur, "ffn_post_norm", -1);
  8939. cur = ggml_add(ctx0, cur, sa_out);
  8940. cur = build_cvec(cur, il);
  8941. cb(cur, "l_out", il);
  8942. // input for next layer
  8943. inpL = cur;
  8944. }
  8945. cur = inpL;
  8946. cur = build_norm(cur,
  8947. model.output_norm, NULL,
  8948. LLM_NORM_RMS, -1);
  8949. cb(cur, "result_norm", -1);
  8950. res->t_embd = cur;
  8951. ggml_build_forward_expand(gf, cur);
  8952. }
  8953. };
  8954. // TODO: move up next to build_starcoder
  8955. struct llm_build_starcoder2 : public llm_graph_context {
  8956. llm_build_starcoder2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  8957. const int64_t n_embd_head = hparams.n_embd_head_v;
  8958. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  8959. GGML_ASSERT(n_embd_head == hparams.n_rot);
  8960. ggml_tensor * cur;
  8961. ggml_tensor * inpL;
  8962. inpL = build_inp_embd(model.tok_embd);
  8963. // inp_pos - contains the positions
  8964. ggml_tensor * inp_pos = build_inp_pos();
  8965. auto * inp_attn = build_attn_inp_kv();
  8966. ggml_tensor * inp_out_ids = build_inp_out_ids();
  8967. for (int il = 0; il < n_layer; ++il) {
  8968. ggml_tensor * inpSA = inpL;
  8969. // norm
  8970. cur = build_norm(inpL,
  8971. model.layers[il].attn_norm, model.layers[il].attn_norm_b,
  8972. LLM_NORM, il);
  8973. cb(cur, "attn_norm", il);
  8974. // self-attention
  8975. {
  8976. // compute Q and K and RoPE them
  8977. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  8978. cb(Qcur, "Qcur", il);
  8979. if (model.layers[il].bq) {
  8980. Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
  8981. cb(Qcur, "Qcur", il);
  8982. }
  8983. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  8984. cb(Kcur, "Kcur", il);
  8985. if (model.layers[il].bk) {
  8986. Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
  8987. cb(Kcur, "Kcur", il);
  8988. }
  8989. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  8990. cb(Vcur, "Vcur", il);
  8991. if (model.layers[il].bv) {
  8992. Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
  8993. cb(Vcur, "Vcur", il);
  8994. }
  8995. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  8996. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  8997. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  8998. Qcur = ggml_rope_ext(
  8999. ctx0, Qcur, inp_pos, nullptr,
  9000. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  9001. ext_factor, attn_factor, beta_fast, beta_slow
  9002. );
  9003. Kcur = ggml_rope_ext(
  9004. ctx0, Kcur, inp_pos, nullptr,
  9005. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  9006. ext_factor, attn_factor, beta_fast, beta_slow
  9007. );
  9008. cb(Qcur, "Qcur", il);
  9009. cb(Kcur, "Kcur", il);
  9010. cb(Vcur, "Vcur", il);
  9011. cur = build_attn(inp_attn,
  9012. model.layers[il].wo, model.layers[il].bo,
  9013. Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  9014. }
  9015. if (il == n_layer - 1 && inp_out_ids) {
  9016. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  9017. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  9018. }
  9019. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  9020. cb(ffn_inp, "ffn_inp", il);
  9021. // feed-forward network
  9022. cur = build_norm(ffn_inp,
  9023. model.layers[il].ffn_norm, model.layers[il].ffn_norm_b,
  9024. LLM_NORM, il);
  9025. cb(cur, "ffn_norm", il);
  9026. cur = build_ffn(cur,
  9027. model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
  9028. NULL, NULL, NULL,
  9029. model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
  9030. NULL,
  9031. LLM_FFN_GELU, LLM_FFN_SEQ, il);
  9032. cb(cur, "ffn_out", il);
  9033. cur = ggml_add(ctx0, cur, ffn_inp);
  9034. cur = build_cvec(cur, il);
  9035. cb(cur, "l_out", il);
  9036. // input for next layer
  9037. inpL = cur;
  9038. }
  9039. cur = inpL;
  9040. cur = build_norm(cur,
  9041. model.output_norm, model.output_norm_b,
  9042. LLM_NORM, -1);
  9043. cb(cur, "result_norm", -1);
  9044. res->t_embd = cur;
  9045. // lm_head
  9046. cur = build_lora_mm(model.output, cur);
  9047. cb(cur, "result_output", -1);
  9048. res->t_logits = cur;
  9049. ggml_build_forward_expand(gf, cur);
  9050. }
  9051. };
  9052. struct llm_graph_context_mamba : public llm_graph_context {
  9053. llm_graph_context_mamba(const llm_graph_params & params) : llm_graph_context(params) {}
  9054. ggml_tensor * build_mamba_layer(
  9055. llm_graph_input_rs * inp,
  9056. ggml_tensor * cur,
  9057. const llama_model & model,
  9058. const llama_ubatch & ubatch,
  9059. int il) {
  9060. const auto * mctx_cur = inp->mctx;
  9061. const auto kv_head = mctx_cur->get_head();
  9062. const auto & layer = model.layers[il];
  9063. const int64_t d_conv = hparams.ssm_d_conv;
  9064. const int64_t d_inner = hparams.ssm_d_inner;
  9065. const int64_t d_state = hparams.ssm_d_state;
  9066. const int64_t dt_rank = hparams.ssm_dt_rank;
  9067. const int64_t n_head = d_inner;
  9068. const int64_t head_dim = 1;
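// note: Mamba-1 is expressed here as n_head == d_inner heads of size 1,
//       so the same ggml_ssm_scan path can be shared with Mamba-2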
  9069. const int64_t n_seqs = ubatch.n_seqs;
9070. // Some Mamba arch variants (e.g. FalconMamba) apply RMS norm on the Dt, B and C projections
  9071. const bool ssm_dt_b_c_rms = hparams.ssm_dt_b_c_rms;
  9072. const int64_t n_seq_tokens = ubatch.n_seq_tokens;
  9073. GGML_ASSERT(n_seqs != 0);
  9074. GGML_ASSERT(ubatch.equal_seqs());
  9075. GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs);
  9076. ggml_tensor * conv_states_all = mctx_cur->get_r_l(il);
  9077. ggml_tensor * ssm_states_all = mctx_cur->get_s_l(il);
  9078. ggml_tensor * conv = build_rs(inp, conv_states_all, hparams.n_embd_r(), n_seqs);
  9079. conv = ggml_reshape_3d(ctx0, conv, d_conv - 1, d_inner, n_seqs);
  9080. // {n_embd, n_tokens} => {n_embd, n_seq_tokens, n_seqs}
  9081. cur = ggml_reshape_3d(ctx0, cur, cur->ne[0], n_seq_tokens, n_seqs);
  9082. // {n_embd, 2*d_inner} @ {n_embd, n_seq_tokens, n_seqs} => {2*d_inner, n_seq_tokens, n_seqs}
  9083. ggml_tensor * xz = build_lora_mm(layer.ssm_in, cur);
  9084. // split the above in two
  9085. // => {d_inner, n_seq_tokens, n_seqs}
  9086. ggml_tensor * x = ggml_view_3d(ctx0, xz, d_inner, xz->ne[1], xz->ne[2], xz->nb[1], xz->nb[2], 0);
  9087. ggml_tensor * z = ggml_view_3d(ctx0, xz, d_inner, xz->ne[1], xz->ne[2], xz->nb[1], xz->nb[2], d_inner*ggml_element_size(xz));
  9088. // conv
  9089. {
  9090. // => {d_conv - 1 + n_seq_tokens, d_inner, n_seqs}
  9091. ggml_tensor * conv_x = ggml_concat(ctx0, conv, ggml_transpose(ctx0, x), 0);
  9092. // copy last (d_conv - 1) columns back into the state cache
  9093. ggml_tensor * last_conv = ggml_view_3d(ctx0, conv_x, d_conv - 1, d_inner, n_seqs, conv_x->nb[1], conv_x->nb[2], n_seq_tokens*(conv_x->nb[0]));
  9094. ggml_build_forward_expand(gf,
  9095. ggml_cpy(ctx0, last_conv,
  9096. ggml_view_1d(ctx0, conv_states_all,
  9097. (d_conv - 1)*(d_inner)*(n_seqs),
  9098. kv_head*(d_conv - 1)*(d_inner)*ggml_element_size(conv_states_all))));
  9099. // 1D convolution
  9100. // The equivalent is to make a self-overlapping view of conv_x
  9101. // over d_conv columns at each stride in the 3rd dimension,
  9102. // then element-wise multiply that with the conv1d weight,
  9103. // then sum the elements of each row,
  9104. // (the last two steps are a dot product over rows (also doable with mul_mat))
  9105. // then permute away the ne[0] dimension,
  9106. // and then you're left with the resulting x tensor.
  9107. // For simultaneous sequences, all sequences need to have the same length.
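// rough sketch for one channel c and output column t (window width d_conv):
//   x[c, t] = sum_{j = 0 .. d_conv-1} conv_x[c, t + j] * conv1d_weight[c, j]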
  9108. x = ggml_ssm_conv(ctx0, conv_x, layer.ssm_conv1d);
  9109. // bias
  9110. x = ggml_add(ctx0, x, layer.ssm_conv1d_b);
  9111. x = ggml_silu(ctx0, x);
  9112. }
  9113. // ssm
  9114. {
  9115. // {d_inner, dt_rank + 2*d_state} @ {d_inner, n_seq_tokens, n_seqs} => {dt_rank + 2*d_state, n_seq_tokens, n_seqs}
  9116. ggml_tensor * x_db = build_lora_mm(layer.ssm_x, x);
  9117. // split
  9118. ggml_tensor * dt = ggml_view_3d(ctx0, x_db, dt_rank, n_seq_tokens, n_seqs, x_db->nb[1], x_db->nb[2], 0);
  9119. ggml_tensor * B = ggml_view_4d(ctx0, x_db, d_state, /* n_group */ 1, n_seq_tokens, n_seqs, d_state*x_db->nb[0], x_db->nb[1], x_db->nb[2], ggml_element_size(x_db)*dt_rank);
  9120. ggml_tensor * C = ggml_view_4d(ctx0, x_db, d_state, /* n_group */ 1, n_seq_tokens, n_seqs, d_state*x_db->nb[0], x_db->nb[1], x_db->nb[2], ggml_element_size(x_db)*(dt_rank+d_state));
  9121. // Some Mamba variants (e.g. FalconMamba, Jamba) apply RMS norm in B, C & Dt layers
  9122. if (ssm_dt_b_c_rms || (layer.ssm_dt_norm && layer.ssm_b_norm && layer.ssm_c_norm)) {
  9123. dt = build_norm(dt, layer.ssm_dt_norm, NULL, LLM_NORM_RMS, il);
  9124. B = build_norm(B, layer.ssm_b_norm, NULL, LLM_NORM_RMS, il);
  9125. C = build_norm(C, layer.ssm_c_norm, NULL, LLM_NORM_RMS, il);
  9126. }
  9127. // {dt_rank, d_inner} @ {dt_rank, n_seq_tokens, n_seqs} => {d_inner, n_seq_tokens, n_seqs}
  9128. dt = build_lora_mm(layer.ssm_dt, dt);
  9129. dt = ggml_add(ctx0, dt, layer.ssm_dt_b);
  9130. cur = x;
  9131. x = ggml_reshape_4d(ctx0, x, head_dim, n_head, n_seq_tokens, n_seqs);
  9132. ggml_tensor * A = layer.ssm_a;
  9133. // use the states and the indices provided by build_recurrent_state
  9134. // (this is necessary in order to properly use the states before they are overwritten,
9135. // while avoiding making unnecessary copies of the states)
  9136. auto get_ssm_rows = [&](ggml_context * ctx, ggml_tensor * states, ggml_tensor * ids) {
  9137. ggml_tensor * ssm = ggml_reshape_4d(ctx, states, d_state, head_dim, n_head, mctx_cur->get_size());
  9138. // Custom operator to optimize the parallel associative scan
  9139. // as described in the Annex D of the Mamba paper.
  9140. // => {d_inner, n_seq_tokens, n_seqs} and {d_state, d_inner, n_seqs}
  9141. return ggml_ssm_scan(ctx, ssm, x, dt, A, B, C, ids);
  9142. };
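// rough sketch of the recurrence evaluated by ggml_ssm_scan, per head and state dim
// (assuming the usual selective-scan discretization; dt is passed through a softplus inside the op):
//   h_t = exp(dt_t * A) * h_{t-1} + dt_t * B_t * x_t
//   y_t = C_t . h_t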
  9143. ggml_tensor * y_ssm = build_rs(inp, ssm_states_all, hparams.n_embd_s(), ubatch.n_seqs, get_ssm_rows);
  9144. // store last states
  9145. ggml_build_forward_expand(gf,
  9146. ggml_cpy(ctx0,
  9147. ggml_view_1d(ctx0, y_ssm, d_state*d_inner*n_seqs, x->nb[3]*x->ne[3]),
  9148. ggml_view_1d(ctx0, ssm_states_all, d_state*d_inner*n_seqs, kv_head*d_state*d_inner*ggml_element_size(ssm_states_all))));
  9149. ggml_tensor * y = ggml_view_3d(ctx0, y_ssm, d_inner, n_seq_tokens, n_seqs, x->nb[2], x->nb[3], 0);
  9150. // TODO: skip computing output earlier for unused tokens
  9151. y = ggml_add(ctx0, y, ggml_mul(ctx0, cur, layer.ssm_d));
  9152. y = ggml_swiglu_split(ctx0, ggml_cont(ctx0, z), y);
  9153. // {d_inner, n_embd} @ {d_inner, n_seq_tokens, n_seqs} => {n_embd, n_seq_tokens, n_seqs}
  9154. cur = build_lora_mm(layer.ssm_out, y);
  9155. }
  9156. // {n_embd, n_seq_tokens, n_seqs} => {n_embd, n_tokens}
  9157. cur = ggml_reshape_2d(ctx0, cur, cur->ne[0], n_seq_tokens * n_seqs);
  9158. return cur;
  9159. }
  9160. ggml_tensor * build_mamba2_layer(
  9161. llm_graph_input_rs * inp,
  9162. ggml_tensor * cur,
  9163. const llama_model & model,
  9164. const llama_ubatch & ubatch,
  9165. int il) const {
  9166. const auto * mctx_cur = inp->mctx;
  9167. const auto kv_head = mctx_cur->get_head();
  9168. const int64_t d_conv = hparams.ssm_d_conv;
  9169. const int64_t d_inner = hparams.ssm_d_inner;
  9170. const int64_t d_state = hparams.ssm_d_state;
  9171. const int64_t n_head = hparams.ssm_dt_rank;
  9172. const int64_t head_dim = d_inner / n_head;
  9173. const int64_t n_group = hparams.ssm_n_group;
  9174. const int64_t n_seqs = ubatch.n_seqs;
  9175. const int64_t n_seq_tokens = ubatch.n_seq_tokens;
  9176. GGML_ASSERT(n_seqs != 0);
  9177. GGML_ASSERT(ubatch.equal_seqs());
  9178. GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs);
  9179. ggml_tensor * conv_states_all = mctx_cur->get_r_l(il);
  9180. ggml_tensor * ssm_states_all = mctx_cur->get_s_l(il);
  9181. ggml_tensor * conv = build_rs(inp, conv_states_all, hparams.n_embd_r(), n_seqs);
  9182. conv = ggml_reshape_3d(ctx0, conv, d_conv - 1, d_inner + 2*n_group*d_state, n_seqs);
  9183. // {n_embd, n_tokens} => {n_embd, n_seq_tokens, n_seqs}
  9184. cur = ggml_reshape_3d(ctx0, cur, cur->ne[0], n_seq_tokens, n_seqs);
  9185. // d_in_proj = 2 * self.d_inner + 2 * self.ngroups * self.d_state + self.nheads
  9186. // {n_embd, d_in_proj} @ {n_embd, n_seq_tokens, n_seqs} => {d_in_proj, n_seq_tokens, n_seqs}
  9187. ggml_tensor * zxBCdt = build_lora_mm(model.layers[il].ssm_in, cur);
  9188. // split the above in three
  9189. ggml_tensor * z = ggml_view_4d(ctx0, zxBCdt, head_dim, n_head, n_seq_tokens, n_seqs, head_dim*zxBCdt->nb[0], zxBCdt->nb[1], zxBCdt->nb[2], 0);
  9190. ggml_tensor * xBC = ggml_view_3d(ctx0, zxBCdt, d_inner + 2*n_group*d_state, n_seq_tokens, n_seqs, zxBCdt->nb[1], zxBCdt->nb[2], d_inner*ggml_element_size(zxBCdt));
  9191. ggml_tensor * dt = ggml_view_3d(ctx0, zxBCdt, n_head, n_seq_tokens, n_seqs, zxBCdt->nb[1], zxBCdt->nb[2], (2*d_inner + 2*n_group*d_state)*ggml_element_size(zxBCdt));
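// layout of zxBCdt along dim 0 (matches d_in_proj above):
//   [0, d_inner)                               -> z
//   [d_inner, 2*d_inner + 2*n_group*d_state)   -> xBC (x | B | C)
//   [2*d_inner + 2*n_group*d_state, d_in_proj) -> dt (one value per head)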
  9192. // conv
  9193. {
  9194. // => {d_conv - 1 + n_seq_tokens, d_inner + 2*n_group*d_state, n_seqs}
  9195. ggml_tensor * conv_x = ggml_concat(ctx0, conv, ggml_transpose(ctx0, xBC), 0);
  9196. // copy last (d_conv - 1) columns back into the state cache
  9197. ggml_tensor * last_conv = ggml_view_3d(ctx0, conv_x, d_conv - 1, d_inner + 2*n_group*d_state, n_seqs, conv_x->nb[1], conv_x->nb[2], n_seq_tokens*(conv_x->nb[0]));
  9198. ggml_build_forward_expand(gf,
  9199. ggml_cpy(ctx0, last_conv,
  9200. ggml_view_1d(ctx0, conv_states_all,
  9201. (d_conv - 1)*(d_inner + 2*n_group*d_state)*(n_seqs),
  9202. kv_head*(d_conv - 1)*(d_inner + 2*n_group*d_state)*ggml_element_size(conv_states_all))));
  9203. // 1D convolution
  9204. // The equivalent is to make a self-overlapping view of conv_x
  9205. // over d_conv columns at each stride in the 3rd dimension,
  9206. // then element-wise multiply that with the conv1d weight,
  9207. // then sum the elements of each row,
  9208. // (the last two steps are a dot product over rows (also doable with mul_mat))
  9209. // then permute away the ne[0] dimension,
  9210. // and then you're left with the resulting x tensor.
  9211. // For simultaneous sequences, all sequences need to have the same length.
  9212. xBC = ggml_ssm_conv(ctx0, conv_x, model.layers[il].ssm_conv1d);
  9213. // bias
  9214. xBC = ggml_add(ctx0, xBC, model.layers[il].ssm_conv1d_b);
  9215. xBC = ggml_silu(ctx0, xBC);
  9216. }
  9217. // ssm
  9218. {
  9219. // These correspond to V K Q in SSM/attention duality
  9220. ggml_tensor * x = ggml_view_4d(ctx0, xBC, head_dim, n_head, n_seq_tokens, n_seqs, head_dim*xBC->nb[0], xBC->nb[1], xBC->nb[2], 0);
  9221. ggml_tensor * B = ggml_view_4d(ctx0, xBC, d_state, n_group, n_seq_tokens, n_seqs, d_state*xBC->nb[0], xBC->nb[1], xBC->nb[2], d_inner*ggml_element_size(xBC));
  9222. ggml_tensor * C = ggml_view_4d(ctx0, xBC, d_state, n_group, n_seq_tokens, n_seqs, d_state*xBC->nb[0], xBC->nb[1], xBC->nb[2], (d_inner + n_group*d_state)*ggml_element_size(xBC));
  9223. // {n_head, n_seq_tokens, n_seqs}
  9224. dt = ggml_add(ctx0, ggml_cont(ctx0, dt), model.layers[il].ssm_dt_b);
  9225. ggml_tensor * A = model.layers[il].ssm_a;
  9226. // use the states and the indices provided by build_recurrent_state
  9227. // (this is necessary in order to properly use the states before they are overwritten,
9228. // while avoiding making unnecessary copies of the states)
  9229. auto get_ssm_rows = [&](ggml_context * ctx, ggml_tensor * states, ggml_tensor * ids) {
  9230. ggml_tensor * ssm = ggml_reshape_4d(ctx, states, d_state, head_dim, n_head, mctx_cur->get_size());
  9231. // TODO: use semistructured matrices to implement state-space duality
  9232. // => {d_inner, n_seq_tokens, n_seqs} and {d_state, d_inner, n_seqs}
  9233. return ggml_ssm_scan(ctx, ssm, x, dt, A, B, C, ids);
  9234. };
  9235. ggml_tensor * y_ssm = build_rs(inp, ssm_states_all, hparams.n_embd_s(), ubatch.n_seqs, get_ssm_rows);
  9236. // store last states
  9237. ggml_build_forward_expand(gf,
  9238. ggml_cpy(ctx0,
  9239. ggml_view_1d(ctx0, y_ssm, d_state*d_inner*n_seqs, ggml_nelements(x)*x->nb[0]),
  9240. ggml_view_1d(ctx0, ssm_states_all, d_state*d_inner*n_seqs, kv_head*d_state*d_inner*ggml_element_size(ssm_states_all))));
  9241. ggml_tensor * y = ggml_view_4d(ctx0, y_ssm, head_dim, n_head, n_seq_tokens, n_seqs, x->nb[1], n_head*x->nb[1], n_seq_tokens*n_head*x->nb[1], 0);
  9242. // TODO: skip computing output earlier for unused tokens
  9243. y = ggml_add(ctx0, y, ggml_mul(ctx0, x, model.layers[il].ssm_d));
  9244. y = ggml_swiglu_split(ctx0, ggml_cont(ctx0, z), y);
  9245. // grouped RMS norm
  9246. if (model.layers[il].ssm_norm) {
  9247. y = ggml_reshape_4d(ctx0, y, d_inner / n_group, n_group, n_seq_tokens, n_seqs);
  9248. y = build_norm(y, model.layers[il].ssm_norm, NULL, LLM_NORM_RMS, il);
  9249. }
  9250. y = ggml_reshape_3d(ctx0, y, d_inner, n_seq_tokens, n_seqs);
  9251. // {d_inner, n_embd} @ {d_inner, n_seq_tokens, n_seqs} => {n_embd, n_seq_tokens, n_seqs}
  9252. cur = build_lora_mm(model.layers[il].ssm_out, y);
  9253. }
  9254. // {n_embd, n_seq_tokens, n_seqs} => {n_embd, n_tokens}
  9255. cur = ggml_reshape_2d(ctx0, cur, cur->ne[0], n_seq_tokens * n_seqs);
  9256. cb(cur, "mamba_out", il);
  9257. return cur;
  9258. }
  9259. };
  9260. struct llm_build_mamba : public llm_graph_context_mamba {
  9261. llm_build_mamba(const llama_model & model, const llm_graph_params & params) : llm_graph_context_mamba(params) {
  9262. ggml_tensor * cur;
  9263. ggml_tensor * inpL;
  9264. // {n_embd, n_tokens}
  9265. inpL = build_inp_embd(model.tok_embd);
  9266. auto * rs_inp = build_rs_inp();
  9267. ggml_tensor * inp_out_ids = build_inp_out_ids();
  9268. for (int il = 0; il < n_layer; ++il) {
  9269. // norm
  9270. cur = build_norm(inpL,
  9271. model.layers[il].attn_norm, NULL,
  9272. LLM_NORM_RMS, il);
  9273. cb(cur, "attn_norm", il);
  9274. if (model.arch == LLM_ARCH_MAMBA2) {
  9275. cur = build_mamba2_layer(rs_inp, cur, model, ubatch, il);
  9276. } else {
  9277. cur = build_mamba_layer(rs_inp, cur, model, ubatch, il);
  9278. }
  9279. if (il == n_layer - 1 && inp_out_ids) {
  9280. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  9281. inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
  9282. }
  9283. // residual
  9284. cur = ggml_add(ctx0, cur, inpL);
  9285. cur = build_cvec(cur, il);
  9286. cb(cur, "l_out", il);
  9287. // input for next layer
  9288. inpL = cur;
  9289. }
  9290. // final rmsnorm
  9291. cur = build_norm(inpL, model.output_norm, NULL, LLM_NORM_RMS, -1);
  9292. cb(cur, "result_norm", -1);
  9293. res->t_embd = cur;
  9294. // lm_head
  9295. cur = build_lora_mm(model.output, cur);
  9296. cb(cur, "result_output", -1);
  9297. res->t_logits = cur;
  9298. ggml_build_forward_expand(gf, cur);
  9299. }
  9300. };
  9301. struct llm_build_jamba : public llm_graph_context_mamba {
  9302. llm_build_jamba(const llama_model & model, const llm_graph_params & params) : llm_graph_context_mamba(params) {
  9303. const int64_t n_embd_head = hparams.n_embd_head_v;
  9304. ggml_tensor * cur;
  9305. ggml_tensor * inpL;
  9306. // {n_embd, n_tokens}
  9307. inpL = build_inp_embd(model.tok_embd);
  9308. auto * inp_hybrid = build_inp_mem_hybrid();
  9309. ggml_tensor * inp_out_ids = build_inp_out_ids();
  9310. for (int il = 0; il < n_layer; ++il) {
  9311. const int64_t n_head_kv = hparams.n_head_kv(il);
  9312. cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
  9313. cb(cur, "attn_norm", il);
  9314. if (n_head_kv == 0) {
  9315. cur = build_mamba_layer(inp_hybrid->get_recr(), cur, model, ubatch, il);
  9316. } else {
  9317. // Attention
  9318. struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  9319. struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  9320. struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  9321. cb(Qcur, "Qcur", il);
  9322. cb(Kcur, "Kcur", il);
  9323. cb(Vcur, "Vcur", il);
  9324. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  9325. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  9326. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  9327. cb(Qcur, "Qcur", il);
  9328. cb(Kcur, "Kcur", il);
  9329. cb(Vcur, "Vcur", il);
  9330. // No RoPE :)
  9331. cur = build_attn(inp_hybrid->get_attn(),
  9332. model.layers[il].wo, NULL,
  9333. Qcur, Kcur, Vcur, NULL, NULL, NULL, 1.0f/sqrtf(float(n_embd_head)), il);
  9334. }
  9335. if (il == n_layer - 1 && inp_out_ids) {
  9336. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  9337. inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
  9338. }
  9339. // residual
  9340. struct ggml_tensor * ffn_inp = ggml_add(ctx0, inpL, cur);
cb(ffn_inp, "ffn_inp", il);
  9342. cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il);
  9343. cb(cur, "ffn_norm", il);
  9344. // feed-forward network
  9345. if (model.layers[il].ffn_gate_inp == nullptr) {
  9346. // FFN
  9347. cur = build_ffn(cur,
  9348. model.layers[il].ffn_up, NULL, NULL,
  9349. model.layers[il].ffn_gate, NULL, NULL,
  9350. model.layers[il].ffn_down, NULL, NULL,
  9351. NULL,
  9352. LLM_FFN_SILU, LLM_FFN_PAR, il);
  9353. cb(cur, "ffn_out", il);
  9354. } else {
  9355. // MoE branch
  9356. cur = build_moe_ffn(cur,
  9357. model.layers[il].ffn_gate_inp,
  9358. model.layers[il].ffn_up_exps,
  9359. model.layers[il].ffn_gate_exps,
  9360. model.layers[il].ffn_down_exps,
  9361. nullptr,
  9362. n_expert, n_expert_used,
  9363. LLM_FFN_SILU, false,
  9364. false, 0.0,
  9365. LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
  9366. il);
  9367. cb(cur, "ffn_moe_out", il);
  9368. }
  9369. // residual
  9370. cur = ggml_add(ctx0, ffn_inp, cur);
  9371. cur = build_cvec(cur, il);
  9372. cb(cur, "l_out", il);
  9373. // input for next layer
  9374. inpL = cur;
  9375. }
  9376. // final rmsnorm
  9377. cur = build_norm(inpL, model.output_norm, NULL, LLM_NORM_RMS, -1);
  9378. cb(cur, "result_norm", -1);
  9379. res->t_embd = cur;
  9380. // lm_head
  9381. cur = build_lora_mm(model.output, cur);
  9382. cb(cur, "result_output", -1);
  9383. res->t_logits = cur;
  9384. ggml_build_forward_expand(gf, cur);
  9385. }
  9386. };
  9387. struct llm_build_command_r : public llm_graph_context {
  9388. llm_build_command_r(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  9389. const int64_t n_embd_head = hparams.n_embd_head_v;
  9390. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  9391. const float f_logit_scale = hparams.f_logit_scale;
  9392. ggml_tensor * cur;
  9393. ggml_tensor * inpL;
  9394. inpL = build_inp_embd(model.tok_embd);
  9395. // inp_pos - contains the positions
  9396. ggml_tensor * inp_pos = build_inp_pos();
  9397. auto * inp_attn = build_attn_inp_kv();
  9398. ggml_tensor * inp_out_ids = build_inp_out_ids();
  9399. for (int il = 0; il < n_layer; ++il) {
  9400. // norm
  9401. cur = build_norm(inpL,
  9402. model.layers[il].attn_norm, NULL,
  9403. LLM_NORM, il);
  9404. cb(cur, "attn_norm", il);
  9405. ggml_tensor * ffn_inp = cur;
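// Command-R uses a parallel block: the single attn_norm output is kept as ffn_inp, the
// attention and the FFN both consume it, and their results are added to the residual
// stream together at the end of the layer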
  9406. // self-attention
  9407. {
  9408. // compute Q and K and RoPE them
  9409. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  9410. cb(Qcur, "Qcur", il);
  9411. if (model.layers[il].bq) {
  9412. Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
  9413. cb(Qcur, "Qcur", il);
  9414. }
  9415. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  9416. cb(Kcur, "Kcur", il);
  9417. if (model.layers[il].bk) {
  9418. Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
  9419. cb(Kcur, "Kcur", il);
  9420. }
  9421. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  9422. cb(Vcur, "Vcur", il);
  9423. if (model.layers[il].bv) {
  9424. Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
  9425. cb(Vcur, "Vcur", il);
  9426. }
  9427. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  9428. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  9429. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  9430. if (model.layers[il].attn_q_norm) {
  9431. Qcur = build_norm(Qcur,
  9432. model.layers[il].attn_q_norm,
  9433. NULL,
  9434. LLM_NORM, il);
  9435. cb(Qcur, "Qcur", il);
  9436. }
  9437. Qcur = ggml_rope_ext(
  9438. ctx0, Qcur, inp_pos, nullptr,
  9439. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  9440. ext_factor, attn_factor, beta_fast, beta_slow
  9441. );
  9442. if (model.layers[il].attn_k_norm) {
  9443. Kcur = build_norm(Kcur,
  9444. model.layers[il].attn_k_norm,
  9445. NULL,
  9446. LLM_NORM, il);
  9447. cb(Kcur, "Kcur", il);
  9448. }
  9449. Kcur = ggml_rope_ext(
  9450. ctx0, Kcur, inp_pos, nullptr,
  9451. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  9452. ext_factor, attn_factor, beta_fast, beta_slow
  9453. );
  9454. cb(Qcur, "Qcur", il);
  9455. cb(Kcur, "Kcur", il);
  9456. cb(Vcur, "Vcur", il);
  9457. cur = build_attn(inp_attn,
  9458. model.layers[il].wo, model.layers[il].bo,
  9459. Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  9460. }
  9461. if (il == n_layer - 1 && inp_out_ids) {
  9462. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  9463. inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
  9464. ffn_inp = ggml_get_rows(ctx0, ffn_inp, inp_out_ids);
  9465. }
  9466. ggml_tensor * attn_out = cur;
  9467. // feed-forward network
  9468. {
  9469. cur = build_ffn(ffn_inp,
  9470. model.layers[il].ffn_up, NULL, NULL,
  9471. model.layers[il].ffn_gate, NULL, NULL,
  9472. model.layers[il].ffn_down, NULL, NULL,
  9473. NULL,
  9474. LLM_FFN_SILU, LLM_FFN_PAR, il);
  9475. cb(cur, "ffn_out", il);
  9476. }
  9477. // add together residual + FFN + self-attention
  9478. cur = ggml_add(ctx0, cur, inpL);
  9479. cur = ggml_add(ctx0, cur, attn_out);
  9480. cur = build_cvec(cur, il);
  9481. cb(cur, "l_out", il);
  9482. // input for next layer
  9483. inpL = cur;
  9484. }
  9485. cur = inpL;
  9486. cur = build_norm(cur,
  9487. model.output_norm, NULL,
  9488. LLM_NORM, -1);
  9489. cb(cur, "result_norm", -1);
  9490. res->t_embd = cur;
  9491. // lm_head
  9492. cur = build_lora_mm(model.output, cur);
  9493. if (f_logit_scale) {
  9494. cur = ggml_scale(ctx0, cur, f_logit_scale);
  9495. }
  9496. cb(cur, "result_output", -1);
  9497. res->t_logits = cur;
  9498. ggml_build_forward_expand(gf, cur);
  9499. }
  9500. };
  9501. struct llm_build_cohere2_iswa : public llm_graph_context {
  9502. llm_build_cohere2_iswa(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  9503. const int64_t n_embd_head = hparams.n_embd_head_v;
  9504. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  9505. const float f_logit_scale = hparams.f_logit_scale;
  9506. ggml_tensor * cur;
  9507. ggml_tensor * inpL;
  9508. inpL = build_inp_embd(model.tok_embd);
  9509. // inp_pos - contains the positions
  9510. ggml_tensor * inp_pos = build_inp_pos();
  9511. auto * inp_attn = build_attn_inp_kv_iswa();
  9512. ggml_tensor * inp_out_ids = build_inp_out_ids();
  9513. for (int il = 0; il < n_layer; ++il) {
  9514. const bool is_swa = hparams.is_swa(il);
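// interleaved SWA: hparams.is_swa(il) marks the sliding-window layers; RoPE is applied
// only on those layers (see the is_swa branch below), while the remaining layers attend
// globally without any positional rotation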
  9515. // norm
  9516. cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM, il);
  9517. cb(cur, "attn_norm", il);
  9518. ggml_tensor * ffn_inp = cur;
  9519. // self-attention
  9520. {
  9521. // rope freq factors for 128k context
  9522. ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
  9523. // compute Q and K and RoPE them
  9524. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  9525. cb(Qcur, "Qcur", il);
  9526. if (model.layers[il].bq) {
  9527. Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
  9528. cb(Qcur, "Qcur", il);
  9529. }
  9530. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  9531. cb(Kcur, "Kcur", il);
  9532. if (model.layers[il].bk) {
  9533. Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
  9534. cb(Kcur, "Kcur", il);
  9535. }
  9536. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  9537. cb(Vcur, "Vcur", il);
  9538. if (model.layers[il].bv) {
  9539. Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
  9540. cb(Vcur, "Vcur", il);
  9541. }
  9542. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  9543. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  9544. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  9545. if (is_swa) {
  9546. Qcur = ggml_rope_ext(
  9547. ctx0, Qcur, inp_pos, rope_factors,
  9548. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  9549. ext_factor, attn_factor, beta_fast, beta_slow
  9550. );
  9551. Kcur = ggml_rope_ext(
  9552. ctx0, Kcur, inp_pos, rope_factors,
  9553. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  9554. ext_factor, attn_factor, beta_fast, beta_slow
  9555. );
  9556. }
  9557. cb(Qcur, "Qcur", il);
  9558. cb(Kcur, "Kcur", il);
  9559. cb(Vcur, "Vcur", il);
  9560. cur = build_attn(inp_attn,
  9561. model.layers[il].wo, model.layers[il].bo,
  9562. Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  9563. }
  9564. if (il == n_layer - 1 && inp_out_ids) {
  9565. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  9566. inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
  9567. ffn_inp = ggml_get_rows(ctx0, ffn_inp, inp_out_ids);
  9568. }
  9569. ggml_tensor * attn_out = cur;
  9570. // feed-forward network
  9571. {
  9572. cur = build_ffn(ffn_inp, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate,
  9573. NULL, NULL, model.layers[il].ffn_down, NULL, NULL, NULL, LLM_FFN_SILU, LLM_FFN_PAR,
  9574. il);
  9575. cb(cur, "ffn_out", il);
  9576. }
  9577. // add together residual + FFN + self-attention
  9578. cur = ggml_add(ctx0, cur, inpL);
  9579. cur = ggml_add(ctx0, cur, attn_out);
  9580. cur = build_cvec(cur, il);
  9581. cb(cur, "l_out", il);
  9582. // input for next layer
  9583. inpL = cur;
  9584. }
  9585. cur = inpL;
  9586. cur = build_norm(cur, model.output_norm, NULL, LLM_NORM, -1);
  9587. cb(cur, "result_norm", -1);
  9588. res->t_embd = cur;
  9589. // lm_head
  9590. cur = build_lora_mm(model.output, cur);
  9591. if (f_logit_scale) {
  9592. cur = ggml_scale(ctx0, cur, f_logit_scale);
  9593. }
  9594. cb(cur, "result_output", -1);
  9595. res->t_logits = cur;
  9596. ggml_build_forward_expand(gf, cur);
  9597. }
  9598. };
  9599. // ref: https://allenai.org/olmo
  9600. // based on the original build_llama() function, changes:
  9601. // * non-parametric layer norm
  9602. // * clamp qkv
  9603. // * removed bias
  9604. // * removed MoE
  9605. struct llm_build_olmo : public llm_graph_context {
  9606. llm_build_olmo(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  9607. const int64_t n_embd_head = hparams.n_embd_head_v;
  9608. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  9609. GGML_ASSERT(n_embd_head == hparams.n_rot);
  9610. ggml_tensor * cur;
  9611. ggml_tensor * inpL;
  9612. inpL = build_inp_embd(model.tok_embd);
  9613. // inp_pos - contains the positions
  9614. ggml_tensor * inp_pos = build_inp_pos();
  9615. auto * inp_attn = build_attn_inp_kv();
  9616. ggml_tensor * inp_out_ids = build_inp_out_ids();
  9617. for (int il = 0; il < n_layer; ++il) {
  9618. ggml_tensor * inpSA = inpL;
  9619. // norm
  9620. cur = build_norm(inpL,
  9621. NULL, NULL,
  9622. LLM_NORM, il);
  9623. cb(cur, "attn_norm", il);
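// non-parametric layer norm: build_norm is called with NULL weight/bias, so only the
// normalization itself is applied; Q/K/V are additionally clamped to
// [-f_clamp_kqv, f_clamp_kqv] when that hyperparameter is set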
  9624. // self-attention
  9625. {
  9626. // compute Q and K and RoPE them
  9627. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  9628. cb(Qcur, "Qcur", il);
  9629. if (hparams.f_clamp_kqv > 0.0f) {
  9630. Qcur = ggml_clamp(ctx0, Qcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
  9631. cb(Qcur, "Qcur", il);
  9632. }
  9633. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  9634. cb(Kcur, "Kcur", il);
  9635. if (hparams.f_clamp_kqv > 0.0f) {
  9636. Kcur = ggml_clamp(ctx0, Kcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
  9637. cb(Kcur, "Kcur", il);
  9638. }
  9639. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  9640. cb(Vcur, "Vcur", il);
  9641. if (hparams.f_clamp_kqv > 0.0f) {
  9642. Vcur = ggml_clamp(ctx0, Vcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
  9643. cb(Vcur, "Vcur", il);
  9644. }
  9645. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  9646. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  9647. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  9648. Qcur = ggml_rope_ext(
  9649. ctx0, Qcur, inp_pos, nullptr,
  9650. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  9651. ext_factor, attn_factor, beta_fast, beta_slow
  9652. );
  9653. Kcur = ggml_rope_ext(
  9654. ctx0, Kcur, inp_pos, nullptr,
  9655. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  9656. ext_factor, attn_factor, beta_fast, beta_slow
  9657. );
  9658. cb(Qcur, "Qcur", il);
  9659. cb(Kcur, "Kcur", il);
  9660. cb(Vcur, "Vcur", il);
  9661. cur = build_attn(inp_attn,
  9662. model.layers[il].wo, nullptr,
  9663. Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  9664. }
  9665. if (il == n_layer - 1 && inp_out_ids) {
  9666. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  9667. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  9668. }
  9669. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  9670. cb(ffn_inp, "ffn_inp", il);
  9671. // feed-forward network
  9672. cur = build_norm(ffn_inp,
  9673. NULL, NULL,
  9674. LLM_NORM, il);
  9675. cb(cur, "ffn_norm", il);
  9676. cur = build_ffn(cur,
  9677. model.layers[il].ffn_up, NULL, NULL,
  9678. model.layers[il].ffn_gate, NULL, NULL,
  9679. model.layers[il].ffn_down, NULL, NULL,
  9680. NULL,
  9681. LLM_FFN_SILU, LLM_FFN_PAR, il);
  9682. cb(cur, "ffn_out", il);
  9683. cur = ggml_add(ctx0, cur, ffn_inp);
  9684. cb(cur, "ffn_out", il);
  9685. cur = build_cvec(cur, il);
  9686. cb(cur, "l_out", il);
  9687. // input for next layer
  9688. inpL = cur;
  9689. }
  9690. cur = inpL;
  9691. cur = build_norm(cur,
  9692. NULL, NULL,
  9693. LLM_NORM, -1);
  9694. cb(cur, "result_norm", -1);
  9695. res->t_embd = cur;
  9696. // lm_head
  9697. cur = build_lora_mm(model.output, cur);
  9698. cb(cur, "result_output", -1);
  9699. res->t_logits = cur;
  9700. ggml_build_forward_expand(gf, cur);
  9701. }
  9702. };
  9703. struct llm_build_olmo2 : public llm_graph_context {
  9704. llm_build_olmo2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  9705. const int64_t n_embd_head = hparams.n_embd_head_v;
  9706. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  9707. GGML_ASSERT(n_embd_head == hparams.n_rot);
  9708. ggml_tensor * cur;
  9709. ggml_tensor * inpL;
  9710. inpL = build_inp_embd(model.tok_embd);
  9711. // inp_pos - contains the positions
  9712. ggml_tensor * inp_pos = build_inp_pos();
  9713. auto * inp_attn = build_attn_inp_kv();
  9714. ggml_tensor * inp_out_ids = build_inp_out_ids();
  9715. for (int il = 0; il < n_layer; ++il) {
  9716. ggml_tensor * inpSA = inpL;
  9717. cur = inpL;
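// OLMo-2 is post-norm: the raw residual stream feeds the QKV projections directly
// (no pre-attention norm), Q and K get their own RMS norms before RoPE, and
// attn_post_norm / ffn_post_norm are applied to the sub-block outputs before the
// residual additions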
  9718. // self_attention
  9719. {
  9720. // compute Q and K and RoPE them
  9721. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  9722. cb(Qcur, "Qcur", il);
  9723. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  9724. cb(Kcur, "Kcur", il);
  9725. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  9726. cb(Vcur, "Vcur", il);
  9727. Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL,
  9728. LLM_NORM_RMS, il);
  9729. cb(Qcur, "Qcur_normed", il);
  9730. Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL,
  9731. LLM_NORM_RMS, il);
  9732. cb(Kcur, "Kcur_normed", il);
  9733. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  9734. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  9735. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  9736. Qcur = ggml_rope_ext(
  9737. ctx0, Qcur, inp_pos, nullptr,
  9738. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  9739. ext_factor, attn_factor, beta_fast, beta_slow
  9740. );
  9741. Kcur = ggml_rope_ext(
  9742. ctx0, Kcur, inp_pos, nullptr,
  9743. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  9744. ext_factor, attn_factor, beta_fast, beta_slow
  9745. );
  9746. cb(Qcur, "Qcur", il);
  9747. cb(Kcur, "Kcur", il);
  9748. cb(Vcur, "Vcur", il);
  9749. cur = build_attn(inp_attn,
  9750. model.layers[il].wo, NULL,
  9751. Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  9752. }
  9753. if (il == n_layer - 1 && inp_out_ids) {
  9754. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  9755. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  9756. }
  9757. cur = build_norm(cur,
  9758. model.layers[il].attn_post_norm, NULL,
  9759. LLM_NORM_RMS, il);
  9760. cb(cur, "attn_post_norm", il);
  9761. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  9762. cb(ffn_inp, "ffn_inp", il);
  9763. // feed-forward network
  9764. cur = build_ffn(ffn_inp,
  9765. model.layers[il].ffn_up, NULL, NULL,
  9766. model.layers[il].ffn_gate, NULL, NULL,
  9767. model.layers[il].ffn_down, NULL, NULL,
  9768. NULL,
  9769. LLM_FFN_SILU, LLM_FFN_PAR, il);
  9770. cb(cur, "ffn_out", il);
  9771. cur = build_norm(cur,
  9772. model.layers[il].ffn_post_norm, NULL,
LLM_NORM_RMS, il);
cb(cur, "ffn_post_norm", il);
  9775. cur = ggml_add(ctx0, cur, ffn_inp);
  9776. cb(cur, "ffn_out", il);
  9777. cur = build_cvec(cur, il);
  9778. cb(cur, "l_out", il);
  9779. // input for next layer
  9780. inpL = cur;
  9781. }
  9782. cur = inpL;
  9783. cur = build_norm(cur,
  9784. model.output_norm, NULL,
  9785. LLM_NORM_RMS, -1);
  9786. cb(cur, "result_norm", -1);
  9787. res->t_embd = cur;
  9788. // lm_head
  9789. cur = build_lora_mm(model.output, cur);
  9790. cb(cur, "result_output", -1);
  9791. res->t_logits = cur;
  9792. ggml_build_forward_expand(gf, cur);
  9793. }
  9794. };
  9795. // based on the build_qwen2moe() function, changes:
  9796. // * removed shared experts
  9797. // * removed bias
  9798. // * added q, k norm
  9799. struct llm_build_olmoe : public llm_graph_context {
  9800. llm_build_olmoe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  9801. const int64_t n_embd_head = hparams.n_embd_head_v;
  9802. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  9803. GGML_ASSERT(n_embd_head == hparams.n_rot);
  9804. ggml_tensor * cur;
  9805. ggml_tensor * inpL;
  9806. inpL = build_inp_embd(model.tok_embd);
  9807. // inp_pos - contains the positions
  9808. ggml_tensor * inp_pos = build_inp_pos();
  9809. auto * inp_attn = build_attn_inp_kv();
  9810. ggml_tensor * inp_out_ids = build_inp_out_ids();
  9811. for (int il = 0; il < n_layer; ++il) {
  9812. ggml_tensor * inpSA = inpL;
  9813. // norm
  9814. cur = build_norm(inpL,
  9815. model.layers[il].attn_norm, NULL,
  9816. LLM_NORM_RMS, il);
  9817. cb(cur, "attn_norm", il);
  9818. // self_attention
  9819. {
  9820. // compute Q and K and RoPE them
  9821. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  9822. cb(Qcur, "Qcur", il);
  9823. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  9824. cb(Kcur, "Kcur", il);
  9825. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  9826. cb(Vcur, "Vcur", il);
  9827. Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL,
  9828. LLM_NORM_RMS, il);
  9829. cb(Qcur, "Qcur_normed", il);
  9830. Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL,
  9831. LLM_NORM_RMS, il);
  9832. cb(Kcur, "Kcur_normed", il);
  9833. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  9834. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  9835. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  9836. Qcur = ggml_rope_ext(
  9837. ctx0, Qcur, inp_pos, nullptr,
  9838. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  9839. ext_factor, attn_factor, beta_fast, beta_slow
  9840. );
  9841. Kcur = ggml_rope_ext(
  9842. ctx0, Kcur, inp_pos, nullptr,
  9843. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  9844. ext_factor, attn_factor, beta_fast, beta_slow
  9845. );
  9846. cb(Qcur, "Qcur", il);
  9847. cb(Kcur, "Kcur", il);
  9848. cb(Vcur, "Vcur", il);
  9849. cur = build_attn(inp_attn,
  9850. model.layers[il].wo, NULL,
  9851. Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  9852. }
  9853. if (il == n_layer - 1 && inp_out_ids) {
  9854. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  9855. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  9856. }
  9857. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  9858. cb(ffn_inp, "ffn_inp", il);
  9859. // MoE branch
  9860. cur = build_norm(ffn_inp,
  9861. model.layers[il].ffn_norm, NULL,
  9862. LLM_NORM_RMS, il);
  9863. cb(cur, "ffn_norm", il);
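// MoE FFN: ffn_gate_inp produces the router logits and softmax gating picks
// n_expert_used of the n_expert experts per token, without re-normalizing or
// rescaling the routing weights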
  9864. cur = build_moe_ffn(cur,
  9865. model.layers[il].ffn_gate_inp,
  9866. model.layers[il].ffn_up_exps,
  9867. model.layers[il].ffn_gate_exps,
  9868. model.layers[il].ffn_down_exps,
  9869. nullptr,
  9870. n_expert, n_expert_used,
  9871. LLM_FFN_SILU, false,
  9872. false, 0.0,
  9873. LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
  9874. il);
  9875. cb(cur, "ffn_moe_out", il);
  9876. cur = ggml_add(ctx0, cur, ffn_inp);
  9877. cur = build_cvec(cur, il);
  9878. cb(cur, "l_out", il);
  9879. // input for next layer
  9880. inpL = cur;
  9881. }
  9882. cur = inpL;
  9883. cur = build_norm(cur,
  9884. model.output_norm, NULL,
  9885. LLM_NORM_RMS, -1);
  9886. cb(cur, "result_norm", -1);
  9887. res->t_embd = cur;
  9888. // lm_head
  9889. cur = build_lora_mm(model.output, cur);
  9890. cb(cur, "result_output", -1);
  9891. res->t_logits = cur;
  9892. ggml_build_forward_expand(gf, cur);
  9893. }
  9894. };
  9895. struct llm_build_openelm : public llm_graph_context {
  9896. llm_build_openelm(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  9897. const int64_t n_embd_head = hparams.n_embd_head_v;
  9898. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  9899. ggml_tensor * cur;
  9900. ggml_tensor * inpL;
  9901. inpL = build_inp_embd(model.tok_embd);
  9902. // inp_pos - contains the positions
  9903. ggml_tensor * inp_pos = build_inp_pos();
  9904. auto * inp_attn = build_attn_inp_kv();
  9905. ggml_tensor * inp_out_ids = build_inp_out_ids();
  9906. for (int il = 0; il < n_layer; ++il) {
  9907. const int64_t n_head = hparams.n_head(il);
  9908. const int64_t n_head_kv = hparams.n_head_kv(il);
  9909. const int64_t n_head_qkv = 2*n_head_kv + n_head;
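// OpenELM uses per-layer head counts with a fused QKV projection: the projection is
// reshaped to n_head_qkv = n_head + 2*n_head_kv heads of size n_embd_head_k and then
// sliced into Q, K and V views along the head dimension (Q heads first, then K, then V)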
  9910. cur = inpL;
  9911. ggml_tensor * residual = cur;
  9912. // norm
  9913. cur = build_norm(inpL,
  9914. model.layers[il].attn_norm, NULL,
  9915. LLM_NORM_RMS, il);
  9916. cb(cur, "attn_norm", il);
  9917. // self-attention
  9918. {
  9919. cur = build_lora_mm(model.layers[il].wqkv, cur);
  9920. cb(cur, "wqkv", il);
  9921. cur = ggml_reshape_3d(ctx0, cur, n_embd_head_k, n_head_qkv, n_tokens);
  9922. ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, cur->nb[1], cur->nb[2], 0);
  9923. cb(Qcur, "Qcur", il);
  9924. ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, cur->nb[1], cur->nb[2], cur->nb[1]*n_head);
  9925. cb(Kcur, "Kcur", il);
  9926. ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, cur->nb[1], cur->nb[2], cur->nb[1]*(n_head+n_head_kv)));
  9927. cb(Vcur, "Vcur", il);
  9928. Qcur = build_norm(Qcur,
  9929. model.layers[il].attn_q_norm, NULL,
  9930. LLM_NORM_RMS, il);
  9931. cb(Qcur, "Qcur", il);
  9932. Kcur = build_norm(Kcur,
  9933. model.layers[il].attn_k_norm, NULL,
  9934. LLM_NORM_RMS, il);
  9935. cb(Kcur, "Kcur", il);
  9936. Qcur = ggml_rope_ext(
  9937. ctx0, Qcur, inp_pos, NULL,
  9938. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  9939. ext_factor, attn_factor, beta_fast, beta_slow
  9940. );
  9941. Kcur = ggml_rope_ext(
  9942. ctx0, Kcur, inp_pos, NULL,
  9943. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  9944. ext_factor, attn_factor, beta_fast, beta_slow
  9945. );
  9946. cb(Qcur, "Qcur", il);
  9947. cb(Kcur, "Kcur", il);
cb(Vcur, "Vcur", il);
  9949. cur = build_attn(inp_attn,
  9950. model.layers[il].wo, NULL,
  9951. Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  9952. }
  9953. if (il == n_layer - 1 && inp_out_ids) {
  9954. residual = ggml_get_rows(ctx0, residual, inp_out_ids);
  9955. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  9956. }
  9957. ggml_tensor * ffn_inp = ggml_add(ctx0, residual, cur);
  9958. cb(ffn_inp, "ffn_inp", il);
  9959. // feed-forward network
  9960. {
  9961. cur = build_norm(ffn_inp,
  9962. model.layers[il].ffn_norm, NULL,
  9963. LLM_NORM_RMS, il);
  9964. cb(cur, "ffn_norm", il);
  9965. cur = build_ffn(cur,
  9966. model.layers[il].ffn_up, NULL, NULL,
  9967. model.layers[il].ffn_gate, NULL, NULL,
  9968. model.layers[il].ffn_down, NULL, NULL,
  9969. NULL,
  9970. LLM_FFN_SILU, LLM_FFN_PAR, il);
  9971. cb(cur, "ffn_out", il);
  9972. }
  9973. cur = ggml_add(ctx0, cur, ffn_inp);
  9974. cur = build_cvec(cur, il);
  9975. cb(cur, "l_out", il);
  9976. inpL = cur;
  9977. }
  9978. cur = inpL;
  9979. // norm
  9980. cur = build_norm(cur,
  9981. model.output_norm, NULL,
  9982. LLM_NORM_RMS, -1);
  9983. cb(cur, "result_norm", -1);
  9984. res->t_embd = cur;
  9985. cur = build_lora_mm(model.output, cur);
  9986. cb(cur, "result_output", -1);
  9987. res->t_logits = cur;
  9988. ggml_build_forward_expand(gf, cur);
  9989. }
  9990. };
  9991. struct llm_build_gptneox : public llm_graph_context {
  9992. llm_build_gptneox(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  9993. const int64_t n_embd_head = hparams.n_embd_head_v;
  9994. const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
  9995. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  9996. ggml_tensor * cur;
  9997. ggml_tensor * inpL;
  9998. inpL = build_inp_embd(model.tok_embd);
  9999. // inp_pos - contains the positions
  10000. ggml_tensor * inp_pos = build_inp_pos();
  10001. auto * inp_attn = build_attn_inp_kv();
  10002. ggml_tensor * inp_out_ids = build_inp_out_ids();
  10003. for (int il = 0; il < n_layer; ++il) {
  10004. cur = build_norm(inpL,
  10005. model.layers[il].attn_norm,
  10006. model.layers[il].attn_norm_b,
  10007. LLM_NORM, il);
  10008. cb(cur, "attn_norm", il);
  10009. // self-attention
  10010. {
  10011. cur = build_lora_mm(model.layers[il].wqkv, cur);
  10012. cb(cur, "wqkv", il);
  10013. cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
  10014. cb(cur, "bqkv", il);
  10015. ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
  10016. ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
  10017. ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
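// the fused QKV buffer is split by offsets: Q starts at element 0, K at n_embd and
// V at n_embd + n_embd_gqa (in float elements), each viewed as {n_embd_head, n_head(_kv), n_tokens}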
  10018. Qcur = ggml_rope_ext(
  10019. ctx0, Qcur, inp_pos, nullptr,
  10020. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  10021. ext_factor, attn_factor, beta_fast, beta_slow
  10022. );
  10023. Kcur = ggml_rope_ext(
  10024. ctx0, Kcur, inp_pos, nullptr,
  10025. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  10026. ext_factor, attn_factor, beta_fast, beta_slow
  10027. );
  10028. cb(Qcur, "Qcur", il);
  10029. cb(Kcur, "Kcur", il);
  10030. cb(Vcur, "Vcur", il);
  10031. cur = build_attn(inp_attn,
  10032. model.layers[il].wo, model.layers[il].bo,
  10033. Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  10034. }
  10035. if (il == n_layer - 1 && inp_out_ids) {
  10036. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  10037. inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
  10038. }
  10039. // ffn
  10040. if (hparams.use_par_res) {
  10041. // attention and ffn are computed in parallel
  10042. // x = x + attn(ln1(x)) + ffn(ln2(x))
  10043. ggml_tensor * attn_out = cur;
  10044. cur = build_norm(inpL,
  10045. model.layers[il].ffn_norm,
  10046. model.layers[il].ffn_norm_b,
  10047. LLM_NORM, il);
  10048. cb(cur, "ffn_norm", il);
  10049. cur = build_ffn(cur,
  10050. model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
  10051. NULL, NULL, NULL,
  10052. model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
  10053. NULL,
  10054. LLM_FFN_GELU, LLM_FFN_SEQ, il);
  10055. cb(cur, "ffn_out", il);
  10056. cur = ggml_add(ctx0, cur, inpL);
  10057. cb(cur, "ffn_out", il);
  10058. cur = ggml_add(ctx0, cur, attn_out);
  10059. cur = build_cvec(cur, il);
  10060. cb(cur, "l_out", il);
  10061. // input for next layer
  10062. inpL = cur;
  10063. } else {
  10064. // attention and ffn are computed sequentially
  10065. // x = x + attn(ln1(x))
  10066. // x = x + ffn(ln2(x))
  10067. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
  10068. cb(ffn_inp, "ffn_inp", il);
  10069. cur = build_norm(ffn_inp,
  10070. model.layers[il].ffn_norm,
  10071. model.layers[il].ffn_norm_b,
  10072. LLM_NORM, il);
  10073. cb(cur, "ffn_norm", il);
  10074. cur = build_ffn(cur,
  10075. model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
  10076. NULL, NULL, NULL,
  10077. model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
  10078. NULL,
  10079. LLM_FFN_GELU, LLM_FFN_SEQ, il);
  10080. cb(cur, "ffn_out", il);
  10081. cur = ggml_add(ctx0, cur, ffn_inp);
  10082. cur = build_cvec(cur, il);
  10083. cb(cur, "l_out", il);
  10084. // input for next layer
  10085. inpL = cur;
  10086. }
  10087. }
  10088. cur = build_norm(inpL,
  10089. model.output_norm,
  10090. model.output_norm_b,
  10091. LLM_NORM, -1);
  10092. cb(cur, "result_norm", -1);
  10093. res->t_embd = cur;
  10094. cur = build_lora_mm(model.output, cur);
  10095. cb(cur, "result_output", -1);
  10096. res->t_logits = cur;
  10097. ggml_build_forward_expand(gf, cur);
  10098. }
  10099. };
  10100. struct llm_build_arctic : public llm_graph_context {
  10101. llm_build_arctic(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  10102. const int64_t n_embd_head = hparams.n_embd_head_v;
  10103. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  10104. GGML_ASSERT(n_embd_head == hparams.n_rot);
  10105. ggml_tensor * cur;
  10106. ggml_tensor * inpL;
  10107. inpL = build_inp_embd(model.tok_embd);
  10108. // inp_pos - contains the positions
  10109. ggml_tensor * inp_pos = build_inp_pos();
  10110. auto * inp_attn = build_attn_inp_kv();
  10111. ggml_tensor * inp_out_ids = build_inp_out_ids();
  10112. for (int il = 0; il < n_layer; ++il) {
  10113. ggml_tensor * inpSA = inpL;
  10114. // norm
  10115. cur = build_norm(inpL,
  10116. model.layers[il].attn_norm, NULL,
  10117. LLM_NORM_RMS, il);
  10118. cb(cur, "attn_norm", il);
  10119. // self-attention
  10120. {
  10121. // compute Q and K and RoPE them
  10122. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  10123. cb(Qcur, "Qcur", il);
  10124. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  10125. cb(Kcur, "Kcur", il);
  10126. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  10127. cb(Vcur, "Vcur", il);
  10128. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  10129. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  10130. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  10131. Qcur = ggml_rope_ext(
  10132. ctx0, Qcur, inp_pos, nullptr,
  10133. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  10134. ext_factor, attn_factor, beta_fast, beta_slow
  10135. );
  10136. Kcur = ggml_rope_ext(
  10137. ctx0, Kcur, inp_pos, nullptr,
  10138. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  10139. ext_factor, attn_factor, beta_fast, beta_slow
  10140. );
  10141. cb(Qcur, "Qcur", il);
  10142. cb(Kcur, "Kcur", il);
  10143. cb(Vcur, "Vcur", il);
  10144. cur = build_attn(inp_attn,
  10145. model.layers[il].wo, NULL,
  10146. Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  10147. }
  10148. if (il == n_layer - 1 && inp_out_ids) {
  10149. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  10150. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  10151. }
  10152. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  10153. cb(ffn_inp, "ffn_inp", il);
  10154. // feed-forward network
  10155. cur = build_norm(ffn_inp,
  10156. model.layers[il].ffn_norm, NULL,
  10157. LLM_NORM_RMS, il);
  10158. cb(cur, "ffn_norm", il);
  10159. cur = build_ffn(cur,
  10160. model.layers[il].ffn_up, NULL, NULL,
  10161. model.layers[il].ffn_gate, NULL, NULL,
  10162. model.layers[il].ffn_down, NULL, NULL,
  10163. NULL,
  10164. LLM_FFN_SILU, LLM_FFN_PAR, il);
  10165. cb(cur, "ffn_out", il);
  10166. ggml_tensor * ffn_out = ggml_add(ctx0, cur, ffn_inp);
  10167. cb(ffn_out, "ffn_out", il);
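// Arctic runs a dense FFN and a MoE FFN in parallel: the dense branch above acts on the
// post-attention residual (ffn_inp), while the MoE branch below is computed from the
// layer input (inpSA) under its own norm (ffn_norm_exps) and added on top of ffn_out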
  10168. // MoE
  10169. cur = build_norm(inpSA,
  10170. model.layers[il].ffn_norm_exps, NULL,
  10171. LLM_NORM_RMS, il);
  10172. cb(cur, "ffn_norm_exps", il);
  10173. cur = build_moe_ffn(cur,
  10174. model.layers[il].ffn_gate_inp,
  10175. model.layers[il].ffn_up_exps,
  10176. model.layers[il].ffn_gate_exps,
  10177. model.layers[il].ffn_down_exps,
  10178. nullptr,
  10179. n_expert, n_expert_used,
  10180. LLM_FFN_SILU, true,
  10181. false, 0.0,
  10182. LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
  10183. il);
  10184. cb(cur, "ffn_moe_out", il);
  10185. cur = ggml_add(ctx0, cur, ffn_out);
  10186. cb(cur, "ffn_out", il);
  10187. cur = build_cvec(cur, il);
  10188. cb(cur, "l_out", il);
  10189. // input for next layer
  10190. inpL = cur;
  10191. }
  10192. cur = inpL;
  10193. cur = build_norm(cur,
  10194. model.output_norm, NULL,
  10195. LLM_NORM_RMS, -1);
  10196. cb(cur, "result_norm", -1);
  10197. res->t_embd = cur;
  10198. // lm_head
  10199. cur = build_lora_mm(model.output, cur);
  10200. cb(cur, "result_output", -1);
  10201. res->t_logits = cur;
  10202. ggml_build_forward_expand(gf, cur);
  10203. }
  10204. };
  10205. struct llm_build_deepseek : public llm_graph_context {
  10206. llm_build_deepseek(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  10207. const int64_t n_embd_head = hparams.n_embd_head_v;
  10208. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  10209. GGML_ASSERT(n_embd_head == hparams.n_rot);
  10210. ggml_tensor * cur;
  10211. ggml_tensor * inpL;
  10212. inpL = build_inp_embd(model.tok_embd);
  10213. // inp_pos - contains the positions
  10214. ggml_tensor * inp_pos = build_inp_pos();
  10215. auto * inp_attn = build_attn_inp_kv();
  10216. const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
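// attention scale: use the model-provided f_attention_scale when non-zero, otherwise the
// standard 1/sqrt(n_embd_head) (e.g. for n_embd_head = 128 that is 1/sqrt(128) ~= 0.088)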
  10217. ggml_tensor * inp_out_ids = build_inp_out_ids();
  10218. for (int il = 0; il < n_layer; ++il) {
  10219. ggml_tensor * inpSA = inpL;
  10220. // norm
  10221. cur = build_norm(inpL,
  10222. model.layers[il].attn_norm, NULL,
  10223. LLM_NORM_RMS, il);
  10224. cb(cur, "attn_norm", il);
  10225. // self-attention
  10226. {
  10227. // rope freq factors for llama3; may return nullptr for llama2 and other models
  10228. ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
  10229. // compute Q and K and RoPE them
  10230. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  10231. cb(Qcur, "Qcur", il);
  10232. if (model.layers[il].bq) {
  10233. Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
  10234. cb(Qcur, "Qcur", il);
  10235. }
  10236. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  10237. cb(Kcur, "Kcur", il);
  10238. if (model.layers[il].bk) {
  10239. Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
  10240. cb(Kcur, "Kcur", il);
  10241. }
  10242. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  10243. cb(Vcur, "Vcur", il);
  10244. if (model.layers[il].bv) {
  10245. Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
  10246. cb(Vcur, "Vcur", il);
  10247. }
  10248. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  10249. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  10250. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  10251. Qcur = ggml_rope_ext(
  10252. ctx0, Qcur, inp_pos, rope_factors,
  10253. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  10254. ext_factor, attn_factor, beta_fast, beta_slow
  10255. );
  10256. Kcur = ggml_rope_ext(
  10257. ctx0, Kcur, inp_pos, rope_factors,
  10258. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  10259. ext_factor, attn_factor, beta_fast, beta_slow
  10260. );
  10261. cb(Qcur, "Qcur", il);
  10262. cb(Kcur, "Kcur", il);
  10263. cb(Vcur, "Vcur", il);
  10264. cur = build_attn(inp_attn,
  10265. model.layers[il].wo, model.layers[il].bo,
  10266. Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
  10267. }
  10268. if (il == n_layer - 1 && inp_out_ids) {
  10269. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  10270. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  10271. }
  10272. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  10273. cb(ffn_inp, "ffn_inp", il);
  10274. cur = build_norm(ffn_inp,
  10275. model.layers[il].ffn_norm, NULL,
  10276. LLM_NORM_RMS, il);
  10277. cb(cur, "ffn_norm", il);
  10278. if ((uint32_t) il < hparams.n_layer_dense_lead) {
  10279. cur = build_ffn(cur,
  10280. model.layers[il].ffn_up, NULL, NULL,
  10281. model.layers[il].ffn_gate, NULL, NULL,
  10282. model.layers[il].ffn_down, NULL, NULL,
  10283. NULL,
  10284. LLM_FFN_SILU, LLM_FFN_PAR, il);
  10285. cb(cur, "ffn_out", il);
  10286. } else {
  10287. // MoE branch
  10288. ggml_tensor * moe_out =
  10289. build_moe_ffn(cur,
  10290. model.layers[il].ffn_gate_inp,
  10291. model.layers[il].ffn_up_exps,
  10292. model.layers[il].ffn_gate_exps,
  10293. model.layers[il].ffn_down_exps,
  10294. nullptr,
  10295. n_expert, n_expert_used,
  10296. LLM_FFN_SILU, false,
  10297. false, hparams.expert_weights_scale,
  10298. LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
  10299. il);
  10300. cb(moe_out, "ffn_moe_out", il);
  10301. // FFN shared expert
  10302. {
  10303. ggml_tensor * ffn_shexp = build_ffn(cur,
  10304. model.layers[il].ffn_up_shexp, NULL, NULL,
  10305. model.layers[il].ffn_gate_shexp, NULL, NULL,
  10306. model.layers[il].ffn_down_shexp, NULL, NULL,
  10307. NULL,
  10308. LLM_FFN_SILU, LLM_FFN_PAR, il);
  10309. cb(ffn_shexp, "ffn_shexp", il);
  10310. cur = ggml_add(ctx0, moe_out, ffn_shexp);
  10311. cb(cur, "ffn_out", il);
  10312. }
  10313. }
  10314. cur = ggml_add(ctx0, cur, ffn_inp);
  10315. cur = build_cvec(cur, il);
  10316. cb(cur, "l_out", il);
  10317. // input for next layer
  10318. inpL = cur;
  10319. }
  10320. cur = inpL;
  10321. cur = build_norm(cur,
  10322. model.output_norm, NULL,
  10323. LLM_NORM_RMS, -1);
  10324. cb(cur, "result_norm", -1);
  10325. res->t_embd = cur;
  10326. // lm_head
  10327. cur = build_lora_mm(model.output, cur);
  10328. cb(cur, "result_output", -1);
  10329. res->t_logits = cur;
  10330. ggml_build_forward_expand(gf, cur);
  10331. }
  10332. };
  10333. struct llm_build_deepseek2 : public llm_graph_context {
  10334. llm_build_deepseek2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  10335. bool is_lite = (hparams.n_layer == 27);
  10336. const bool is_mla = (hparams.n_embd_head_k_mla != 0 && hparams.n_embd_head_v_mla != 0);
  10337. // note: these are the actual head sizes you get when treating as MHA or after "decompression" using wv_b for MLA
  10338. const int64_t n_embd_head_k = is_mla ? hparams.n_embd_head_k_mla : hparams.n_embd_head_k;
  10339. const int64_t n_embd_head_v = is_mla ? hparams.n_embd_head_v_mla : hparams.n_embd_head_v;
  10340. const int64_t n_embd_head_qk_rope = hparams.n_rot;
  10341. const int64_t n_embd_head_qk_nope = n_embd_head_k - n_embd_head_qk_rope;
  10342. const uint32_t kv_lora_rank = hparams.n_lora_kv;
  10343. // We have to pre-scale kq_scale and attn_factor to make the YaRN RoPE work correctly.
  10344. // See https://github.com/ggerganov/llama.cpp/discussions/7416 for detailed explanation.
  10345. const float mscale = attn_factor * (1.0f + hparams.rope_yarn_log_mul * logf(1.0f / freq_scale));
  10346. const float kq_scale = 1.0f*mscale*mscale/sqrtf(float(n_embd_head_k));
  10347. const float attn_factor = 1.0f / (1.0f + 0.1f * logf(1.0f / freq_scale));
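// the local attn_factor above shadows the context-level value: the YaRN magnitude
// correction (mscale) is folded into kq_scale as mscale^2 / sqrt(n_embd_head_k), and the
// shadowed attn_factor is chosen so that the factor applied inside ggml_rope_ext,
// attn_factor * (1 + 0.1*log(1/freq_scale)), comes out to 1 and the rotation itself
// stays unscaled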
  10348. ggml_tensor * cur;
  10349. ggml_tensor * inpL;
  10350. // {n_embd, n_tokens}
  10351. inpL = build_inp_embd(model.tok_embd);
  10352. // inp_pos - contains the positions
  10353. ggml_tensor * inp_pos = build_inp_pos();
  10354. auto * inp_attn = build_attn_inp_kv();
  10355. ggml_tensor * inp_out_ids = build_inp_out_ids();
  10356. for (int il = 0; il < n_layer; ++il) {
  10357. ggml_tensor * inpSA = inpL;
  10358. // norm
  10359. cur = build_norm(inpL,
  10360. model.layers[il].attn_norm, NULL,
  10361. LLM_NORM_RMS, il);
  10362. cb(cur, "attn_norm", il);
  10363. // self_attention
  10364. {
  10365. ggml_tensor * q = NULL;
  10366. if (!is_lite) {
  10367. q = ggml_mul_mat(ctx0, model.layers[il].wq_a, cur);
  10368. cb(q, "q", il);
  10369. q = build_norm(q,
  10370. model.layers[il].attn_q_a_norm, nullptr,
  10371. LLM_NORM_RMS, il);
  10372. cb(q, "q", il);
  10373. q = ggml_mul_mat(ctx0, model.layers[il].wq_b, q);
  10374. cb(q, "q", il);
  10375. } else {
  10376. q = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
  10377. cb(q, "q", il);
  10378. }
  10379. // split into {n_embd_head_qk_nope, n_head, n_tokens}
  10380. ggml_tensor * q_nope = ggml_view_3d(ctx0, q,
  10381. n_embd_head_qk_nope, n_head, n_tokens,
  10382. ggml_row_size(q->type, n_embd_head_k),
  10383. ggml_row_size(q->type, n_embd_head_k) * n_head,
  10384. 0);
  10385. cb(q_nope, "q_nope", il);
  10386. // and {n_embd_head_qk_rope, n_head, n_tokens}
  10387. ggml_tensor * q_pe = ggml_view_3d(ctx0, q,
  10388. n_embd_head_qk_rope, n_head, n_tokens,
  10389. ggml_row_size(q->type, n_embd_head_k),
  10390. ggml_row_size(q->type, n_embd_head_k) * n_head,
  10391. ggml_row_size(q->type, n_embd_head_qk_nope));
  10392. cb(q_pe, "q_pe", il);
  10393. ggml_tensor * kv_cmpr_pe = ggml_mul_mat(ctx0, model.layers[il].wkv_a_mqa, cur);
  10394. cb(kv_cmpr_pe, "kv_cmpr_pe", il);
  10395. // split into {kv_lora_rank, n_tokens}
  10396. ggml_tensor * kv_cmpr = ggml_view_2d(ctx0, kv_cmpr_pe,
  10397. kv_lora_rank, n_tokens,
  10398. ggml_row_size(kv_cmpr_pe->type, kv_lora_rank + n_embd_head_qk_rope),
  10399. 0);
  10400. cb(kv_cmpr, "kv_cmpr", il);
  10401. // and {n_embd_head_qk_rope, 1, n_tokens}
  10402. ggml_tensor * k_pe = ggml_view_3d(ctx0, kv_cmpr_pe,
  10403. n_embd_head_qk_rope, 1, n_tokens,
  10404. ggml_row_size(kv_cmpr_pe->type, kv_lora_rank + n_embd_head_qk_rope),
  10405. ggml_row_size(kv_cmpr_pe->type, kv_lora_rank + n_embd_head_qk_rope),
  10406. ggml_row_size(kv_cmpr_pe->type, kv_lora_rank));
  10407. cb(k_pe, "k_pe", il);
  10408. q_pe = ggml_rope_ext(ctx0, q_pe, inp_pos, nullptr,
  10409. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  10410. ext_factor, attn_factor, beta_fast, beta_slow
  10411. );
  10412. cb(q_pe, "q_pe", il);
  10413. k_pe = ggml_rope_ext(ctx0, k_pe, inp_pos, nullptr,
  10414. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  10415. ext_factor, attn_factor, beta_fast, beta_slow
  10416. );
  10417. cb(k_pe, "k_pe", il);
  10418. kv_cmpr = build_norm(kv_cmpr,
  10419. model.layers[il].attn_kv_a_norm, nullptr,
  10420. LLM_NORM_RMS, il);
  10421. cb(kv_cmpr, "kv_cmpr", il);
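// two equivalent formulations of the attention follow:
//  - is_mla: wk_b is absorbed into the query (q_nope_absorbed), K/V stay in the
//    compressed latent space (kv_lora_rank wide, plus the rope part for K), attention
//    runs as MQA and the "decompression" via wv_b is deferred to build_attn
//  - otherwise: kv_cmpr is decompressed up-front with wkv_b into full per-head K/V and
//    attention runs as standard MHA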
  10422. if (is_mla) {
  10423. // {n_embd_head_qk_nope, n_tokens, n_head}
  10424. q_nope = ggml_permute(ctx0, q_nope, 0, 2, 1, 3);
  10425. cb(q_nope, "q_nope_perm", il);
  10426. // {n_embd_head_qk_nope, kv_lora_rank, n_head} x {n_embd_head_qk_nope, n_tokens, n_head}
  10427. ggml_tensor * q_nope_absorbed = ggml_mul_mat(ctx0, model.layers[il].wk_b, q_nope);
  10428. cb(q_nope_absorbed, "q_nope_absorbed", il);
  10429. // {kv_lora_rank, n_head, n_tokens}
  10430. q_nope_absorbed = ggml_permute(ctx0, q_nope_absorbed, 0, 2, 1, 3);
  10431. cb(q_nope_absorbed, "q_nope_absorbed_perm", il);
  10432. // {n_embd_head_qk_rope + kv_lora_rank, n_head, n_tokens}
  10433. // note: rope must go first for in-place context shifting in build_rope_shift()
  10434. ggml_tensor * Qcur = ggml_concat(ctx0, q_pe, q_nope_absorbed, 0);
  10435. cb(Qcur, "Qcur", il);
  10436. kv_cmpr = ggml_reshape_3d(ctx0, kv_cmpr, kv_lora_rank, 1, n_tokens);
  10437. cb(kv_cmpr, "kv_cmpr_reshape", il);
  10438. // {n_embd_head_qk_rope + kv_lora_rank, 1, n_tokens}
  10439. ggml_tensor * Kcur = ggml_concat(ctx0, k_pe, kv_cmpr, 0);
  10440. cb(Kcur, "Kcur", il);
  10441. // {kv_lora_rank, 1, n_tokens}
  10442. ggml_tensor * Vcur = kv_cmpr;
  10443. cb(Vcur, "Vcur", il);
// note: MLA with the absorption optimization converts into MQA (ie: GQA with 1 group)
  10445. cur = build_attn(inp_attn,
  10446. model.layers[il].wo, NULL,
  10447. Qcur, Kcur, Vcur, nullptr, nullptr, model.layers[il].wv_b, kq_scale, il);
  10448. } else {
  10449. ggml_tensor * kv = ggml_mul_mat(ctx0, model.layers[il].wkv_b, kv_cmpr);
  10450. cb(kv, "kv", il);
  10451. // split into {n_embd_head_qk_nope, n_head, n_tokens}
  10452. ggml_tensor * k_nope = ggml_view_3d(ctx0, kv,
  10453. n_embd_head_qk_nope, n_head, n_tokens,
  10454. ggml_row_size(kv->type, n_embd_head_qk_nope + n_embd_head_v),
  10455. ggml_row_size(kv->type, n_embd_head_qk_nope + n_embd_head_v) * n_head,
  10456. 0);
  10457. cb(k_nope, "k_nope_view", il);
  10458. // and {n_embd_head_v, n_head, n_tokens}
  10459. ggml_tensor * Vcur = ggml_view_3d(ctx0, kv,
  10460. n_embd_head_v, n_head, n_tokens,
  10461. ggml_row_size(kv->type, n_embd_head_qk_nope + n_embd_head_v),
  10462. ggml_row_size(kv->type, n_embd_head_qk_nope + n_embd_head_v) * n_head,
  10463. ggml_row_size(kv->type, n_embd_head_qk_nope));
  10464. cb(Vcur, "Vcur_view", il);
  10465. Vcur = ggml_cont(ctx0, Vcur);
  10466. cb(Vcur, "Vcur_cont", il);
  10467. // note: rope must go first for in-place context shifting in build_rope_shift()
  10468. ggml_tensor * Qcur = ggml_concat(ctx0, q_pe, q_nope, 0);
  10469. cb(Qcur, "Qcur", il);
  10470. ggml_tensor * Kcur = ggml_concat(ctx0, ggml_repeat(ctx0, k_pe, q_pe), k_nope, 0);
  10471. cb(Kcur, "Kcur", il);
  10472. // note: MLA without the absorption optimization converts into MHA (ie: GQA with full n_head groups)
  10473. cur = build_attn(inp_attn,
  10474. model.layers[il].wo, NULL,
  10475. Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
  10476. }
  10477. }
  10478. if (il == n_layer - 1 && inp_out_ids) {
  10479. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  10480. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  10481. }
  10482. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  10483. cb(ffn_inp, "ffn_inp", il);
  10484. cur = build_norm(ffn_inp,
  10485. model.layers[il].ffn_norm, NULL,
  10486. LLM_NORM_RMS, il);
  10487. cb(cur, "ffn_norm", il);
  10488. if ((uint32_t) il < hparams.n_layer_dense_lead) {
  10489. cur = build_ffn(cur,
  10490. model.layers[il].ffn_up, NULL, NULL,
  10491. model.layers[il].ffn_gate, NULL, NULL,
  10492. model.layers[il].ffn_down, NULL, NULL,
  10493. NULL,
  10494. LLM_FFN_SILU, LLM_FFN_PAR, il);
  10495. cb(cur, "ffn_out", il);
  10496. } else {
  10497. // MoE branch
  10498. ggml_tensor * moe_out =
  10499. build_moe_ffn(cur,
  10500. model.layers[il].ffn_gate_inp,
  10501. model.layers[il].ffn_up_exps,
  10502. model.layers[il].ffn_gate_exps,
  10503. model.layers[il].ffn_down_exps,
  10504. model.layers[il].ffn_exp_probs_b,
  10505. n_expert, n_expert_used,
  10506. LLM_FFN_SILU, hparams.expert_weights_norm,
  10507. true, hparams.expert_weights_scale,
  10508. (llama_expert_gating_func_type) hparams.expert_gating_func,
  10509. il);
  10510. cb(moe_out, "ffn_moe_out", il);
  10511. // FFN shared expert
  10512. {
  10513. ggml_tensor * ffn_shexp = build_ffn(cur,
  10514. model.layers[il].ffn_up_shexp, NULL, NULL,
  10515. model.layers[il].ffn_gate_shexp, NULL, NULL,
  10516. model.layers[il].ffn_down_shexp, NULL, NULL,
  10517. NULL,
  10518. LLM_FFN_SILU, LLM_FFN_PAR, il);
  10519. cb(ffn_shexp, "ffn_shexp", il);
  10520. cur = ggml_add(ctx0, moe_out, ffn_shexp);
  10521. cb(cur, "ffn_out", il);
  10522. }
  10523. }
  10524. cur = ggml_add(ctx0, cur, ffn_inp);
  10525. cur = build_cvec(cur, il);
  10526. cb(cur, "l_out", il);
  10527. // input for next layer
  10528. inpL = cur;
  10529. }
  10530. cur = inpL;
  10531. cur = build_norm(cur,
  10532. model.output_norm, NULL,
  10533. LLM_NORM_RMS, -1);
  10534. cb(cur, "result_norm", -1);
  10535. res->t_embd = cur;
  10536. // lm_head
  10537. cur = ggml_mul_mat(ctx0, model.output, cur);
  10538. cb(cur, "result_output", -1);
  10539. res->t_logits = cur;
  10540. ggml_build_forward_expand(gf, cur);
  10541. }
  10542. };
  10543. struct llm_build_bitnet : public llm_graph_context {
  10544. llm_build_bitnet(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  10545. const int64_t n_embd_head = hparams.n_embd_head_v;
  10546. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  10547. ggml_tensor * cur;
  10548. ggml_tensor * inpL;
  10549. inpL = build_inp_embd(model.tok_embd);
  10550. // inp_pos - contains the positions
  10551. ggml_tensor * inp_pos = build_inp_pos();
  10552. auto * inp_attn = build_attn_inp_kv();
  10553. ggml_tensor * inp_out_ids = build_inp_out_ids();
  10554. for (int il = 0; il < n_layer; ++il) {
  10555. ggml_tensor * inpSA = inpL;
  10556. cur = build_norm(inpL,
  10557. model.layers[il].attn_norm, NULL,
  10558. LLM_NORM_RMS, il);
  10559. cb(cur, "attn_norm", il);
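// BitNet stores per-tensor scale factors (wq_scale, wk_scale, ...) next to its low-bit
// weight matrices, so every projection is a matmul followed by a multiply with its scale;
// the output projections (wo, ffn_down) are applied manually after a sub-norm
// (attn_sub_norm / ffn_sub_norm) rather than inside build_attn / build_ffn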
  10560. // self-attention
  10561. {
  10562. // compute Q and K and RoPE them
  10563. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  10564. if (model.layers[il].wq_scale) {
  10565. Qcur = ggml_mul(ctx0, Qcur, model.layers[il].wq_scale);
  10566. }
  10567. cb(Qcur, "Qcur", il);
  10568. if (model.layers[il].bq) {
  10569. Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
  10570. cb(Qcur, "Qcur", il);
  10571. }
  10572. // B1.K
  10573. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  10574. if (model.layers[il].wk_scale) {
  10575. Kcur = ggml_mul(ctx0, Kcur, model.layers[il].wk_scale);
  10576. }
  10577. cb(Kcur, "Kcur", il);
  10578. if (model.layers[il].bk) {
  10579. Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
  10580. cb(Kcur, "Kcur", il);
  10581. }
  10582. // B1.V
  10583. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  10584. if (model.layers[il].wv_scale) {
  10585. Vcur = ggml_mul(ctx0, Vcur, model.layers[il].wv_scale);
  10586. }
  10587. cb(Vcur, "Vcur", il);
  10588. if (model.layers[il].bv) {
  10589. Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
  10590. cb(Vcur, "Vcur", il);
  10591. }
  10592. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  10593. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  10594. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  10595. Qcur = ggml_rope_ext(
  10596. ctx0, Qcur, inp_pos, nullptr,
  10597. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  10598. ext_factor, attn_factor, beta_fast, beta_slow
  10599. );
  10600. Kcur = ggml_rope_ext(
  10601. ctx0, Kcur, inp_pos, nullptr,
  10602. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  10603. ext_factor, attn_factor, beta_fast, beta_slow
  10604. );
  10605. cb(Qcur, "Qcur", il);
  10606. cb(Kcur, "Kcur", il);
  10607. cb(Vcur, "Vcur", il);
  10608. cur = build_attn(inp_attn,
  10609. NULL, NULL,
  10610. Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  10611. cur = build_norm(cur,
  10612. model.layers[il].attn_sub_norm, NULL,
  10613. LLM_NORM_RMS, il);
  10614. cb(cur, "attn_sub_norm", il);
  10615. cur = build_lora_mm(model.layers[il].wo, cur);
  10616. if (model.layers[il].wo_scale) {
  10617. cur = ggml_mul(ctx0, cur, model.layers[il].wo_scale);
  10618. }
  10619. if (model.layers[il].bo) {
  10620. cur = ggml_add(ctx0, cur, model.layers[il].bo);
  10621. }
  10622. cb(cur, "attn_o_out", il);
  10623. }
  10624. if (il == n_layer - 1 && inp_out_ids) {
  10625. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  10626. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  10627. }
  10628. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  10629. cb(ffn_inp, "ffn_inp", il);
// feed-forward network
  10631. cur = build_norm(ffn_inp,
  10632. model.layers[il].ffn_norm, NULL,
  10633. LLM_NORM_RMS, il);
  10634. cb(cur, "ffn_norm", il);
  10635. cur = build_ffn(cur,
  10636. model.layers[il].ffn_up, NULL, model.layers[il].ffn_up_scale,
  10637. model.layers[il].ffn_gate, NULL, model.layers[il].ffn_gate_scale,
  10638. NULL, NULL, NULL,
  10639. NULL,
  10640. LLM_FFN_SILU, LLM_FFN_PAR, il);
  10641. cb(cur, "ffn_sub_out", il);
  10642. cur = build_norm(cur,
  10643. model.layers[il].ffn_sub_norm, NULL,
  10644. LLM_NORM_RMS, il);
  10645. cb(cur, "ffn_sub_norm", il);
  10646. cur = build_lora_mm(model.layers[il].ffn_down, cur);
  10647. if (model.layers[il].ffn_down_scale) {
  10648. cur = ggml_mul(ctx0, cur, model.layers[il].ffn_down_scale);
  10649. }
  10650. cb(cur, "ffn_down", il);
  10651. cur = ggml_add(ctx0, cur, ffn_inp);
  10652. cb(cur, "l_out", il);
  10653. // input for next layer
  10654. inpL = cur;
  10655. }
  10656. cur = inpL;
  10657. cur = build_norm(cur,
  10658. model.output_norm, NULL,
  10659. LLM_NORM_RMS, -1);
  10660. cb(cur, "result_norm", -1);
  10661. res->t_embd = cur;
  10662. // lm_head
  10663. // FIXME: do not use model.tok_embd directly, duplicate as model.output
  10664. cur = build_lora_mm(model.tok_embd, cur);
  10665. cb(cur, "result_output", -1);
  10666. res->t_logits = cur;
  10667. ggml_build_forward_expand(gf, cur);
  10668. }
  10669. };
  10670. struct llm_build_t5_enc : public llm_graph_context {
  10671. llm_build_t5_enc(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  10672. const int64_t n_embd_head = hparams.n_embd_head_v;
  10673. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  10674. ggml_tensor * cur;
  10675. ggml_tensor * inpL;
  10676. inpL = build_inp_embd(model.tok_embd);
  10677. ggml_tensor * pos_bucket_enc = build_inp_pos_bucket_enc();
  10678. auto * inp_attn = build_attn_inp_no_cache();
  10679. ggml_tensor * inp_out_ids = build_inp_out_ids();
  10680. for (int il = 0; il < n_layer; ++il) {
  10681. ggml_tensor * inpSA = inpL;
  10682. // norm
  10683. cur = build_norm(inpL,
  10684. model.layers[il].attn_norm_enc, NULL,
  10685. LLM_NORM_RMS, il);
  10686. cb(cur, "attn_norm", il);
  10687. // self-attention
  10688. {
  10689. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq_enc, cur);
  10690. cb(Qcur, "Qcur", il);
  10691. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk_enc, cur);
  10692. cb(Kcur, "Kcur", il);
  10693. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv_enc, cur);
  10694. cb(Vcur, "Vcur", il);
  10695. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  10696. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  10697. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  10698. ggml_tensor * attn_rel_b = model.layers[il].attn_rel_b_enc ? model.layers[il].attn_rel_b_enc : model.layers[0].attn_rel_b_enc;
  10699. ggml_tensor * kq_b = build_pos_bias(pos_bucket_enc, attn_rel_b);
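// T5-style attention: no RoPE and no 1/sqrt(d) scaling (kq_scale is 1.0f below); instead a
// learned relative-position bias is gathered per bucket and added to the attention logits
// via kq_b (attn_rel_b is typically stored only on the first layer, hence the fallback to
// model.layers[0])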
  10700. cur = build_attn(inp_attn,
  10701. model.layers[il].wo_enc, nullptr,
  10702. Qcur, Kcur, Vcur, kq_b, nullptr, nullptr, 1.0f, il);
  10703. cb(cur, "kqv_out", il);
  10704. }
  10705. if (il == n_layer - 1 && inp_out_ids) {
  10706. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  10707. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  10708. }
  10709. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  10710. cb(ffn_inp, "ffn_inp", il);
  10711. // feed-forward network
  10712. {
  10713. cur = build_norm(ffn_inp,
  10714. model.layers[il].ffn_norm_enc, NULL,
  10715. LLM_NORM_RMS, il);
  10716. cb(cur, "ffn_norm", il);
  10717. // T5 uses relu, flan-T5 uses gelu-gated
  10718. cur = build_ffn(cur,
  10719. model.layers[il].ffn_up_enc, NULL, NULL,
  10720. model.layers[il].ffn_gate_enc, NULL, NULL,
  10721. model.layers[il].ffn_down_enc, NULL, NULL,
  10722. NULL,
  10723. model.layers[il].ffn_gate_enc ? LLM_FFN_GELU : LLM_FFN_RELU,
  10724. model.layers[il].ffn_gate_enc ? LLM_FFN_PAR : LLM_FFN_SEQ,
  10725. il);
  10726. cb(cur, "ffn_out", il);
  10727. }
  10728. cur = ggml_add(ctx0, cur, ffn_inp);
  10729. cb(cur, "ffn_out", il);
  10730. cur = build_cvec(cur, il);
  10731. cb(cur, "l_out", il);
  10732. // input for next layer
  10733. inpL = cur;
  10734. }
  10735. cur = inpL;
  10736. cb(cur, "result_embd", -1);
  10737. cur = build_norm(cur,
  10738. model.output_norm_enc, NULL,
  10739. LLM_NORM_RMS, -1);
  10740. cb(cur, "result_norm", -1);
  10741. res->t_embd = cur;
  10742. ggml_build_forward_expand(gf, cur);
  10743. }
  10744. };
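// T5 decoder: causal self-attention with a KV cache plus cross-attention over the encoder output
// embeddings; only the self-attention uses the relative position bias.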
  10745. struct llm_build_t5_dec : public llm_graph_context {
  10746. llm_build_t5_dec(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  10747. const int64_t n_embd_head = hparams.n_embd_head_v;
  10748. //const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
  10749. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  10750. ggml_tensor * cur;
  10751. ggml_tensor * inpL;
  10752. inpL = build_inp_embd(model.tok_embd);
  10753. ggml_tensor * embd_enc = build_inp_cross_embd();
  10754. ggml_tensor * pos_bucket_dec = build_inp_pos_bucket_dec();
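// number of encoder output positions; the cross-attention K/V span this length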
  10755. const int64_t n_outputs_enc = embd_enc->ne[1];
  10756. auto * inp_attn_self = build_attn_inp_kv();
  10757. auto * inp_attn_cross = build_attn_inp_cross();
  10758. ggml_tensor * inp_out_ids = build_inp_out_ids();
  10759. for (int il = 0; il < n_layer; ++il) {
  10760. ggml_tensor * inpSA = inpL;
  10761. // norm
  10762. cur = build_norm(inpL,
  10763. model.layers[il].attn_norm, NULL,
  10764. LLM_NORM_RMS, il);
  10765. cb(cur, "attn_norm", il);
  10766. // self-attention
  10767. {
  10768. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  10769. cb(Qcur, "Qcur", il);
  10770. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  10771. cb(Kcur, "Kcur", il);
  10772. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  10773. cb(Vcur, "Vcur", il);
  10774. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  10775. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  10776. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  10777. ggml_tensor * attn_rel_b = model.layers[il].attn_rel_b ? model.layers[il].attn_rel_b : model.layers[0].attn_rel_b;
  10778. ggml_tensor * kq_b = build_pos_bias(pos_bucket_dec, attn_rel_b);
  10779. cur = build_attn(inp_attn_self,
  10780. model.layers[il].wo, model.layers[il].bo,
  10781. Qcur, Kcur, Vcur, kq_b, nullptr, nullptr, 1.0f, il);
  10782. cb(cur, "kqv_out", il);
  10783. }
  10784. cur = ggml_add(ctx0, cur, inpSA);
  10785. cb(cur, "cross_inp", il);
  10786. ggml_tensor * inpCA = cur;
  10787. // norm
  10788. cur = build_norm(cur,
  10789. model.layers[il].attn_norm_cross, NULL,
  10790. LLM_NORM_RMS, il);
  10791. cb(cur, "attn_norm_cross", il);
  10792. // cross-attention
  10793. {
  10794. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq_cross, cur);
  10795. cb(Qcur, "Qcur", il);
  10796. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk_cross, embd_enc);
  10797. cb(Kcur, "Kcur", il);
  10798. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv_cross, embd_enc);
  10799. cb(Vcur, "Vcur", il);
  10800. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  10801. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_outputs_enc);
  10802. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_outputs_enc);
  10803. cur = build_attn(inp_attn_cross,
  10804. model.layers[il].wo_cross, nullptr,
  10805. Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f, il);
  10806. cb(cur, "kqv_out", il);
  10807. //ggml_tensor * q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
  10808. //ggml_tensor * k = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 0, 2, 1, 3));
  10809. //ggml_tensor * kq = ggml_mul_mat(ctx0, k, q);
  10810. //cb(kq, "kq", il);
  10811. //kq = ggml_soft_max_ext(ctx0, kq, KQ_mask_cross, 1.0f, hparams.f_max_alibi_bias);
  10812. //cb(kq, "kq_soft_max_ext", il);
  10813. //ggml_tensor * v = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd_gqa, n_outputs_enc)));
  10814. //cb(v, "v", il);
  10815. //ggml_tensor * kqv = ggml_mul_mat(ctx0, ggml_reshape_3d(ctx0, v, n_outputs_enc, n_embd_head, n_head_kv), kq);
  10816. //cb(kqv, "kqv", il);
  10817. //ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3);
  10818. //cb(kqv_merged, "kqv_merged", il);
  10819. //cur = ggml_cont_2d(ctx0, kqv_merged, n_embd_gqa, n_tokens);
  10820. //cb(cur, "kqv_merged_cont", il);
  10821. //ggml_build_forward_expand(gf, cur);
  10822. //cur = build_lora_mm(model.layers[il].wo_cross, cur);
  10823. //cb(cur, "kqv_out", il);
  10824. }
  10825. if (il == n_layer - 1 && inp_out_ids) {
  10826. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  10827. inpCA = ggml_get_rows(ctx0, inpCA, inp_out_ids);
  10828. }
  10829. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpCA);
  10830. cb(ffn_inp, "ffn_inp", il);
  10831. // feed-forward network
  10832. {
  10833. cur = build_norm(ffn_inp,
  10834. model.layers[il].ffn_norm, NULL,
  10835. LLM_NORM_RMS, il);
  10836. cb(cur, "ffn_norm", il);
  10837. // T5 uses relu, flan-T5 uses gelu-gated
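// note: the activation/gating choice is keyed off the encoder gate tensor; for flan-T5 the decoder
// carries a matching gate, so the selection stays consistent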
  10838. cur = build_ffn(cur,
  10839. model.layers[il].ffn_up, NULL, NULL,
  10840. model.layers[il].ffn_gate, NULL, NULL,
  10841. model.layers[il].ffn_down, NULL, NULL,
  10842. NULL,
  10843. model.layers[il].ffn_gate_enc ? LLM_FFN_GELU : LLM_FFN_RELU,
  10844. model.layers[il].ffn_gate_enc ? LLM_FFN_PAR : LLM_FFN_SEQ,
  10845. il);
  10846. cb(cur, "ffn_out", il);
  10847. }
  10848. cur = ggml_add(ctx0, cur, ffn_inp);
  10849. cb(cur, "ffn_out", il);
  10850. cur = build_cvec(cur, il);
  10851. cb(cur, "l_out", il);
  10852. // input for next layer
  10853. inpL = cur;
  10854. }
  10855. cur = inpL;
  10856. cb(cur, "result_embd", -1);
  10857. cur = build_norm(cur,
  10858. model.output_norm, NULL,
  10859. LLM_NORM_RMS, -1);
  10860. cb(cur, "result_norm", -1);
  10861. res->t_embd = cur;
  10862. // lm_head
  10863. cur = build_lora_mm(model.output, cur);
  10864. cb(cur, "result_output", -1);
  10865. res->t_logits = cur;
  10866. ggml_build_forward_expand(gf, cur);
  10867. }
  10868. };
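// JAIS: fused QKV projection with bias, no RoPE in this builder (JAIS relies on ALiBi, applied via the
// attention mask), and 1.0f/n_embd_head attention scaling instead of 1/sqrt(n_embd_head).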
  10869. struct llm_build_jais : public llm_graph_context {
  10870. llm_build_jais(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  10871. const int64_t n_embd_head = hparams.n_embd_head_v;
  10872. const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
  10873. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  10874. ggml_tensor * cur;
  10875. ggml_tensor * inpL;
  10876. inpL = build_inp_embd(model.tok_embd);
  10877. auto * inp_attn = build_attn_inp_kv();
  10878. ggml_tensor * inp_out_ids = build_inp_out_ids();
  10879. for (int il = 0; il < n_layer; ++il) {
  10880. cur = build_norm(inpL,
  10881. model.layers[il].attn_norm,
  10882. model.layers[il].attn_norm_b,
  10883. LLM_NORM, il);
  10884. cb(cur, "attn_norm", il);
  10885. // self-attention
  10886. {
  10887. cur = build_lora_mm(model.layers[il].wqkv, cur);
  10888. cb(cur, "wqkv", il);
  10889. cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
  10890. cb(cur, "bqkv", il);
  10891. ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*cur->nb[0]*(n_embd));
  10892. ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*cur->nb[0]*(n_embd));
  10893. ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*cur->nb[0]*(n_embd + n_embd_gqa));
  10894. cb(Qcur, "Qcur", il);
  10895. cb(Kcur, "Kcur", il);
  10896. cb(Vcur, "Vcur", il);
  10897. cur = build_attn(inp_attn,
  10898. model.layers[il].wo, model.layers[il].bo,
  10899. Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/float(n_embd_head), il);
  10900. }
  10901. if (il == n_layer - 1 && inp_out_ids) {
  10902. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  10903. inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
  10904. }
  10905. // add the input
  10906. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
  10907. cb(ffn_inp, "ffn_inp", il);
  10908. // FF
  10909. {
  10910. cur = build_norm(ffn_inp,
  10911. model.layers[il].ffn_norm,
  10912. model.layers[il].ffn_norm_b,
  10913. LLM_NORM, il);
  10914. cb(cur, "ffn_norm", il);
  10915. cur = build_ffn(cur,
  10916. model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
  10917. model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
  10918. model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
  10919. NULL,
  10920. LLM_FFN_SILU, LLM_FFN_PAR, il);
  10921. cb(cur, "ffn_out", il);
  10922. }
  10923. inpL = ggml_add(ctx0, cur, ffn_inp);
  10924. cb(inpL, "l_out", il);
  10925. }
  10926. cur = build_norm(inpL,
  10927. model.output_norm,
  10928. model.output_norm_b,
  10929. LLM_NORM, -1);
  10930. cb(cur, "result_norm", -1);
  10931. res->t_embd = cur;
  10932. cur = build_lora_mm(model.output, cur);
  10933. cb(cur, "result_output", -1);
  10934. res->t_logits = cur;
  10935. ggml_build_forward_expand(gf, cur);
  10936. }
  10937. };
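// ChatGLM: supports either separate Q/K/V projections or a fused wqkv tensor; RoPE is applied to Q and K.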
  10938. struct llm_build_chatglm : public llm_graph_context {
  10939. llm_build_chatglm(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  10940. const int64_t n_embd_head = hparams.n_embd_head_v;
  10941. const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
  10942. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  10943. ggml_tensor * cur;
  10944. ggml_tensor * inpL;
  10945. inpL = build_inp_embd(model.tok_embd);
  10946. // inp_pos - contains the positions
  10947. ggml_tensor * inp_pos = build_inp_pos();
  10948. auto * inp_attn = build_attn_inp_kv();
  10949. ggml_tensor * inp_out_ids = build_inp_out_ids();
  10950. for (int il = 0; il < n_layer; ++il) {
  10951. ggml_tensor * inpSA = inpL;
  10952. cur = build_norm(inpL,
  10953. model.layers[il].attn_norm,
  10954. NULL,
  10955. LLM_NORM_RMS, il);
  10956. cb(cur, "attn_norm", il);
  10957. // self-attention
  10958. {
  10959. ggml_tensor * Qcur = nullptr;
  10960. ggml_tensor * Kcur = nullptr;
  10961. ggml_tensor * Vcur = nullptr;
  10962. if (model.layers[il].wqkv == nullptr) {
  10963. Qcur = build_lora_mm(model.layers[il].wq, cur);
  10964. if (model.layers[il].bq) {
  10965. Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
  10966. }
  10967. Kcur = build_lora_mm(model.layers[il].wk, cur);
  10968. if (model.layers[il].bk) {
  10969. Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
  10970. }
  10971. Vcur = build_lora_mm(model.layers[il].wv, cur);
  10972. if (model.layers[il].bv) {
  10973. Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
  10974. }
  10975. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  10976. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  10977. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  10978. } else {
  10979. cur = build_lora_mm(model.layers[il].wqkv, cur);
  10980. cb(cur, "wqkv", il);
  10981. if (model.layers[il].bqkv) {
  10982. cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
  10983. cb(cur, "bqkv", il);
  10984. }
  10985. Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
  10986. Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
  10987. Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
  10988. }
  10989. //printf("freq_base: %f freq_scale: %f ext_factor: %f attn_factor: %f\n", freq_base, freq_scale, ext_factor, attn_factor);
  10990. Qcur = ggml_rope_ext(
  10991. ctx0, Qcur, inp_pos, nullptr,
  10992. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  10993. ext_factor, attn_factor, beta_fast, beta_slow
  10994. );
  10995. Kcur = ggml_rope_ext(
  10996. ctx0, Kcur, inp_pos, nullptr,
  10997. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  10998. ext_factor, attn_factor, beta_fast, beta_slow
  10999. );
  11000. cb(Qcur, "Qcur", il);
  11001. cb(Kcur, "Kcur", il);
  11002. cb(Vcur, "Vcur", il);
  11003. cur = build_attn(inp_attn,
  11004. model.layers[il].wo, NULL,
  11005. Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  11006. }
  11007. if (il == n_layer - 1 && inp_out_ids) {
  11008. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  11009. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  11010. }
  11011. // Add the input
  11012. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  11013. cb(ffn_inp, "ffn_inp", il);
  11014. // FF
  11015. {
  11016. cur = build_norm(ffn_inp,
  11017. model.layers[il].ffn_norm,
  11018. NULL,
  11019. LLM_NORM_RMS, il);
  11020. cb(cur, "ffn_norm", il);
  11021. cur = build_ffn(cur,
  11022. model.layers[il].ffn_up, NULL, NULL,
  11023. NULL, NULL, NULL,
  11024. model.layers[il].ffn_down, NULL, NULL,
  11025. NULL,
  11026. LLM_FFN_SWIGLU, LLM_FFN_SEQ, il);
  11027. cb(cur, "ffn_out", il);
  11028. }
  11029. inpL = ggml_add(ctx0, cur, ffn_inp);
  11030. cb(inpL, "l_out", il);
  11031. }
  11032. cur = build_norm(inpL,
  11033. model.output_norm,
  11034. NULL,
  11035. LLM_NORM_RMS, -1);
  11036. cb(cur, "result_norm", -1);
  11037. res->t_embd = cur;
  11038. cur = build_lora_mm(model.output, cur);
  11039. cb(cur, "result_output", -1);
  11040. res->t_logits = cur;
  11041. ggml_build_forward_expand(gf, cur);
  11042. }
  11043. };
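// GLM-4: sandwich normalization - RMSNorm before and after both the attention block and the MLP,
// with the residuals added after the post-norms.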
  11044. struct llm_build_glm4 : public llm_graph_context {
  11045. llm_build_glm4(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  11046. const int64_t n_embd_head = hparams.n_embd_head_v;
  11047. const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
  11048. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  11049. ggml_tensor * cur;
  11050. ggml_tensor * inpL;
  11051. inpL = build_inp_embd(model.tok_embd);
  11052. // inp_pos - contains the positions
  11053. ggml_tensor * inp_pos = build_inp_pos();
  11054. auto * inp_attn = build_attn_inp_kv();
  11055. ggml_tensor * inp_out_ids = build_inp_out_ids();
  11056. for (int il = 0; il < n_layer; ++il) {
  11057. ggml_tensor * inpSA = inpL;
  11058. // Pre-attention norm
  11059. cur = build_norm(inpL,
  11060. model.layers[il].attn_norm,
  11061. NULL,
  11062. LLM_NORM_RMS, il);
  11063. cb(cur, "attn_norm", il);
  11064. // self-attention
  11065. {
  11066. ggml_tensor * Qcur = nullptr;
  11067. ggml_tensor * Kcur = nullptr;
  11068. ggml_tensor * Vcur = nullptr;
  11069. if (model.layers[il].wqkv == nullptr) {
  11070. Qcur = build_lora_mm(model.layers[il].wq, cur);
  11071. if (model.layers[il].bq) {
  11072. Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
  11073. }
  11074. Kcur = build_lora_mm(model.layers[il].wk, cur);
  11075. if (model.layers[il].bk) {
  11076. Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
  11077. }
  11078. Vcur = build_lora_mm(model.layers[il].wv, cur);
  11079. if (model.layers[il].bv) {
  11080. Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
  11081. }
  11082. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  11083. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  11084. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  11085. } else {
  11086. cur = build_lora_mm(model.layers[il].wqkv, cur);
  11087. cb(cur, "wqkv", il);
  11088. if (model.layers[il].bqkv) {
  11089. cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
  11090. cb(cur, "bqkv", il);
  11091. }
  11092. Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
  11093. Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
  11094. Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
  11095. }
  11096. Qcur = ggml_rope_ext(
  11097. ctx0, Qcur, inp_pos, nullptr,
  11098. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  11099. ext_factor, attn_factor, beta_fast, beta_slow
  11100. );
  11101. Kcur = ggml_rope_ext(
  11102. ctx0, Kcur, inp_pos, nullptr,
  11103. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  11104. ext_factor, attn_factor, beta_fast, beta_slow
  11105. );
  11106. cb(Qcur, "Qcur", il);
  11107. cb(Kcur, "Kcur", il);
  11108. cb(Vcur, "Vcur", il);
  11109. cur = build_attn(inp_attn,
  11110. model.layers[il].wo, NULL,
  11111. Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  11112. }
  11113. if (il == n_layer - 1 && inp_out_ids) {
  11114. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  11115. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  11116. }
11117. // Post-attention norm: applied to the attention output before the residual add
  11118. cur = build_norm(cur,
  11119. model.layers[il].attn_post_norm,
  11120. NULL,
  11121. LLM_NORM_RMS, il);
  11122. cb(cur, "post_attn_norm", il);
  11123. // Add the input (residual connection after post-attention norm)
  11124. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  11125. cb(ffn_inp, "ffn_inp", il);
  11126. // FF
  11127. {
  11128. // Pre-MLP norm
  11129. cur = build_norm(ffn_inp,
  11130. model.layers[il].ffn_norm,
  11131. NULL,
  11132. LLM_NORM_RMS, il);
  11133. cb(cur, "ffn_norm", il);
  11134. // MLP
  11135. cur = build_ffn(cur,
  11136. model.layers[il].ffn_up, NULL, NULL,
  11137. NULL, NULL, NULL,
  11138. model.layers[il].ffn_down, NULL, NULL,
  11139. NULL,
  11140. LLM_FFN_SWIGLU, LLM_FFN_SEQ, il);
  11141. cb(cur, "ffn_out", il);
  11142. // Post-MLP norm
  11143. cur = build_norm(cur,
  11144. model.layers[il].ffn_post_norm,
  11145. NULL,
  11146. LLM_NORM_RMS, il);
  11147. cb(cur, "post_mlp_norm", il);
  11148. }
  11149. // Add residual connection after post-MLP norm
  11150. inpL = ggml_add(ctx0, cur, ffn_inp);
  11151. cb(inpL, "l_out", il);
  11152. }
  11153. // Final norm
  11154. cur = build_norm(inpL,
  11155. model.output_norm,
  11156. NULL,
  11157. LLM_NORM_RMS, -1);
  11158. cb(cur, "result_norm", -1);
  11159. res->t_embd = cur;
  11160. // Output projection
  11161. cur = build_lora_mm(model.output, cur);
  11162. cb(cur, "result_output", -1);
  11163. res->t_logits = cur;
  11164. ggml_build_forward_expand(gf, cur);
  11165. }
  11166. };
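// GLM-4 MoE: the leading n_layer_dense_lead layers use a dense FFN; later layers combine routed
// experts with a shared expert; the trailing NextN prediction layers are excluded from this graph.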
  11167. struct llm_build_glm4_moe : public llm_graph_context {
  11168. llm_build_glm4_moe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  11169. const int64_t n_embd_head = hparams.n_embd_head_v;
  11170. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  11171. ggml_tensor * cur;
  11172. ggml_tensor * inpL;
  11173. inpL = build_inp_embd(model.tok_embd);
  11174. // inp_pos - contains the positions
  11175. ggml_tensor * inp_pos = build_inp_pos();
  11176. auto * inp_attn = build_attn_inp_kv();
  11177. ggml_tensor * inp_out_ids = build_inp_out_ids();
11178. // Skip the trailing NextN/MTP prediction layer(s) in the forward pass
11179. // (their tensors are loaded, but they are not part of this graph)
  11180. const int n_transformer_layers = n_layer - hparams.nextn_predict_layers;
  11181. for (int il = 0; il < n_transformer_layers; ++il) {
  11182. ggml_tensor * inpSA = inpL;
  11183. // Pre-attention norm
  11184. cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
  11185. cb(cur, "attn_norm", il);
  11186. // self-attention
  11187. {
  11188. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  11189. if (model.layers[il].bq) {
  11190. Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
  11191. }
  11192. cb(Qcur, "Qcur", il);
  11193. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  11194. if (model.layers[il].bk) {
  11195. Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
  11196. }
  11197. cb(Kcur, "Kcur", il);
  11198. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  11199. if (model.layers[il].bv) {
  11200. Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
  11201. }
  11202. cb(Vcur, "Vcur", il);
  11203. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  11204. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  11205. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  11206. // Apply Q/K norm if available (GLM-4.5 355B variant)
  11207. if (model.layers[il].attn_q_norm) {
  11208. Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
  11209. cb(Qcur, "Qcur_normed", il);
  11210. }
  11211. if (model.layers[il].attn_k_norm) {
  11212. Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
  11213. cb(Kcur, "Kcur_normed", il);
  11214. }
  11215. Qcur = ggml_rope_ext(
  11216. ctx0, Qcur, inp_pos, nullptr,
  11217. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  11218. ext_factor, attn_factor, beta_fast, beta_slow
  11219. );
  11220. Kcur = ggml_rope_ext(
  11221. ctx0, Kcur, inp_pos, nullptr,
  11222. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  11223. ext_factor, attn_factor, beta_fast, beta_slow
  11224. );
  11225. cb(Qcur, "Qcur", il);
  11226. cb(Kcur, "Kcur", il);
  11227. cb(Vcur, "Vcur", il);
  11228. cur = build_attn(inp_attn,
  11229. model.layers[il].wo, NULL,
  11230. Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  11231. }
  11232. if (il == n_transformer_layers - 1 && inp_out_ids) {
  11233. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  11234. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  11235. }
  11236. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  11237. cb(ffn_inp, "ffn_inp", il);
  11238. // Post-attention norm
  11239. cur = build_norm(ffn_inp, model.layers[il].attn_post_norm, NULL, LLM_NORM_RMS, il);
  11240. cb(cur, "post_attn_norm", il);
11241. // Leading dense layers: indices below n_layer_dense_lead use a dense FFN instead of expert routing (typically 1, i.e. only layer 0)
  11242. if (static_cast<uint32_t>(il) < hparams.n_layer_dense_lead) {
  11243. // Dense FFN layer
  11244. cur = build_ffn(cur,
  11245. model.layers[il].ffn_up, NULL, NULL,
  11246. model.layers[il].ffn_gate, NULL, NULL,
  11247. model.layers[il].ffn_down, NULL, NULL,
  11248. NULL,
  11249. LLM_FFN_SILU, LLM_FFN_PAR, il);
  11250. cb(cur, "ffn_out", il);
  11251. } else {
  11252. // Process routed experts using existing MoE infrastructure
  11253. ggml_tensor * routed_out = build_moe_ffn(cur,
  11254. model.layers[il].ffn_gate_inp,
  11255. model.layers[il].ffn_up_exps,
  11256. model.layers[il].ffn_gate_exps,
  11257. model.layers[il].ffn_down_exps,
  11258. model.layers[il].ffn_exp_probs_b,
  11259. n_expert, n_expert_used,
  11260. LLM_FFN_SILU, hparams.expert_weights_norm,
  11261. true, hparams.expert_weights_scale,
  11262. (llama_expert_gating_func_type) hparams.expert_gating_func,
  11263. il);
  11264. cb(routed_out, "ffn_moe_out", il);
  11265. // Process shared expert on original input
  11266. ggml_tensor * shared_out = build_ffn(cur,
  11267. model.layers[il].ffn_up_shexp, NULL, NULL,
  11268. model.layers[il].ffn_gate_shexp, NULL, NULL,
  11269. model.layers[il].ffn_down_shexp, NULL, NULL,
  11270. NULL,
  11271. LLM_FFN_SILU, LLM_FFN_PAR, il);
  11272. cb(shared_out, "ffn_shexp_out", il);
  11273. // Final output: routed_output + shared_output
  11274. cur = ggml_add(ctx0, routed_out, shared_out);
  11275. cb(cur, "ffn_out", il);
  11276. }
  11277. cur = ggml_add(ctx0, cur, ffn_inp);
  11278. cur = build_cvec(cur, il);
  11279. cb(cur, "l_out", il);
  11280. // input for next layer
  11281. inpL = cur;
  11282. }
  11283. cur = inpL;
  11284. cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
  11285. cb(cur, "result_norm", -1);
  11286. res->t_embd = cur;
  11287. // lm_head
  11288. cur = build_lora_mm(model.output, cur);
  11289. cb(cur, "result_output", -1);
  11290. res->t_logits = cur;
  11291. ggml_build_forward_expand(gf, cur);
  11292. }
  11293. };
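// Nemotron: LayerNorm with bias and a squared-ReLU FFN without a gate.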
  11294. struct llm_build_nemotron : public llm_graph_context {
  11295. llm_build_nemotron(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  11296. const int64_t n_embd_head = hparams.n_embd_head_v;
  11297. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  11298. //GGML_ASSERT(n_embd_head == hparams.n_rot);
  11299. ggml_tensor * cur;
  11300. ggml_tensor * inpL;
  11301. inpL = build_inp_embd(model.tok_embd);
  11302. // inp_pos - contains the positions
  11303. ggml_tensor * inp_pos = build_inp_pos();
  11304. auto * inp_attn = build_attn_inp_kv();
  11305. ggml_tensor * inp_out_ids = build_inp_out_ids();
  11306. for (int il = 0; il < n_layer; ++il) {
  11307. ggml_tensor * inpSA = inpL;
  11308. // norm
  11309. cur = build_norm(inpL,
  11310. model.layers[il].attn_norm,
  11311. model.layers[il].attn_norm_b,
  11312. LLM_NORM, il);
  11313. cb(cur, "attn_norm", il);
  11314. // self-attention
  11315. {
  11316. // compute Q and K and RoPE them
  11317. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  11318. cb(Qcur, "Qcur", il);
  11319. if (model.layers[il].bq) {
  11320. Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
  11321. cb(Qcur, "Qcur", il);
  11322. }
  11323. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  11324. cb(Kcur, "Kcur", il);
  11325. if (model.layers[il].bk) {
  11326. Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
  11327. cb(Kcur, "Kcur", il);
  11328. }
  11329. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  11330. cb(Vcur, "Vcur", il);
  11331. if (model.layers[il].bv) {
  11332. Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
  11333. cb(Vcur, "Vcur", il);
  11334. }
  11335. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  11336. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  11337. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  11338. Qcur = ggml_rope_ext(
  11339. ctx0, Qcur, inp_pos, nullptr,
  11340. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  11341. ext_factor, attn_factor, beta_fast, beta_slow
  11342. );
  11343. Kcur = ggml_rope_ext(
  11344. ctx0, Kcur, inp_pos, nullptr,
  11345. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  11346. ext_factor, attn_factor, beta_fast, beta_slow
  11347. );
  11348. cb(Qcur, "Qcur", il);
  11349. cb(Kcur, "Kcur", il);
  11350. cb(Vcur, "Vcur", il);
  11351. cur = build_attn(inp_attn,
  11352. model.layers[il].wo, model.layers[il].bo,
  11353. Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  11354. }
  11355. if (il == n_layer - 1 && inp_out_ids) {
  11356. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  11357. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  11358. }
  11359. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  11360. cb(ffn_inp, "ffn_inp", il);
  11361. // feed-forward network
  11362. cur = build_norm(ffn_inp,
  11363. model.layers[il].ffn_norm,
  11364. model.layers[il].ffn_norm_b,
  11365. LLM_NORM, il);
  11366. cb(cur, "ffn_norm", il);
  11367. cur = build_ffn(cur,
  11368. model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
  11369. NULL, NULL, NULL,
  11370. model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
  11371. NULL,
  11372. LLM_FFN_RELU_SQR, LLM_FFN_SEQ, il);
  11373. cur = ggml_add(ctx0, cur, ffn_inp);
  11374. cb(cur, "ffn_out", il);
  11375. cur = build_cvec(cur, il);
  11376. cb(cur, "l_out", il);
  11377. // input for next layer
  11378. inpL = cur;
  11379. }
  11380. cur = inpL;
  11381. cur = build_norm(cur,
  11382. model.output_norm, model.output_norm_b,
  11383. LLM_NORM, -1);
  11384. cb(cur, "result_norm", -1);
  11385. res->t_embd = cur;
  11386. // lm_head
  11387. cur = build_lora_mm(model.output, cur);
  11388. cb(cur, "result_output", -1);
  11389. res->t_logits = cur;
  11390. ggml_build_forward_expand(gf, cur);
  11391. }
  11392. };
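// Nemotron-H: hybrid architecture - each layer is a Mamba-2 SSM block, an attention block
// (layers with n_ff == 0), or an FFN block, all sharing a single residual stream.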
  11393. struct llm_build_nemotron_h : public llm_graph_context_mamba {
  11394. llm_build_nemotron_h(
  11395. const llama_model & model,
  11396. const llm_graph_params & params) :
  11397. llm_graph_context_mamba(params) {
  11398. const int64_t n_embd_head = hparams.n_embd_head_v;
  11399. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  11400. ggml_tensor * cur;
  11401. ggml_tensor * inpL;
  11402. inpL = build_inp_embd(model.tok_embd);
  11403. auto * inp = build_inp_mem_hybrid();
  11404. ggml_tensor * inp_out_ids = build_inp_out_ids();
  11405. for (int il = 0; il < n_layer; ++il) {
  11406. struct ggml_tensor * inpSA = inpL;
  11407. // norm
  11408. cur = build_norm(inpL,
  11409. model.layers[il].attn_norm, NULL,
  11410. LLM_NORM_RMS, il);
  11411. cb(cur, "attn_norm", il);
  11412. if (hparams.is_recurrent(il)) {
11413. // SSM layer
  11414. cur = build_mamba2_layer(inp->get_recr(), cur, model, ubatch, il);
  11415. } else if (hparams.n_ff(il) == 0) {
11416. // attention layer
  11417. cur = build_attention_layer(cur, inp->get_attn(), model, n_embd_head, il);
  11418. } else {
  11419. cur = build_ffn_layer(cur, model, il);
  11420. }
  11421. if (il == n_layer - 1 && inp_out_ids) {
  11422. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  11423. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  11424. }
  11425. // add residual
  11426. cur = ggml_add(ctx0, cur, inpSA);
  11427. cb(cur, "block_out", il);
  11428. // input for next layer
  11429. inpL = cur;
  11430. }
  11431. cur = inpL;
  11432. cur = build_norm(cur,
  11433. model.output_norm, NULL,
  11434. LLM_NORM_RMS, -1);
  11435. cb(cur, "result_norm", -1);
  11436. res->t_embd = cur;
  11437. // lm_head
  11438. cur = build_lora_mm(model.output, cur);
  11439. cb(cur, "result_output", -1);
  11440. res->t_logits = cur;
  11441. ggml_build_forward_expand(gf, cur);
  11442. }
  11443. ggml_tensor * build_attention_layer(
  11444. ggml_tensor * cur,
  11445. llm_graph_input_attn_kv * inp_attn,
  11446. const llama_model & model,
  11447. const int64_t n_embd_head,
  11448. const int il) {
  11449. // compute Q and K and (optionally) RoPE them
  11450. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  11451. cb(Qcur, "Qcur", il);
  11452. if (model.layers[il].bq) {
  11453. Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
  11454. cb(Qcur, "Qcur", il);
  11455. }
  11456. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  11457. cb(Kcur, "Kcur", il);
  11458. if (model.layers[il].bk) {
  11459. Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
  11460. cb(Kcur, "Kcur", il);
  11461. }
  11462. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  11463. cb(Vcur, "Vcur", il);
  11464. if (model.layers[il].bv) {
  11465. Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
  11466. cb(Vcur, "Vcur", il);
  11467. }
  11468. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, hparams.n_head(il), n_tokens);
  11469. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, hparams.n_head_kv(il), n_tokens);
  11470. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, hparams.n_head_kv(il), n_tokens);
  11471. cb(Qcur, "Qcur", il);
  11472. cb(Kcur, "Kcur", il);
  11473. cb(Vcur, "Vcur", il);
  11474. const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
  11475. cur = build_attn(inp_attn,
  11476. model.layers[il].wo, model.layers[il].bo,
  11477. Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
  11478. cb(cur, "attn_out", il);
  11479. return cur;
  11480. }
  11481. ggml_tensor * build_ffn_layer(
  11482. ggml_tensor * cur,
  11483. const llama_model & model,
  11484. const int il) {
  11485. cur = build_ffn(cur,
  11486. model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
  11487. NULL, NULL, NULL,
  11488. model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
  11489. NULL,
  11490. LLM_FFN_RELU_SQR, LLM_FFN_PAR, il);
  11491. cb(cur, "ffn_out", il);
  11492. cur = build_cvec(cur, il);
  11493. cb(cur, "l_out", il);
  11494. return cur;
  11495. }
  11496. };
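// EXAONE: llama-style block - RMSNorm, RoPE with optional frequency factors, SwiGLU FFN.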
  11497. struct llm_build_exaone : public llm_graph_context {
  11498. llm_build_exaone(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  11499. const int64_t n_embd_head = hparams.n_embd_head_v;
  11500. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  11501. GGML_ASSERT(n_embd_head == hparams.n_rot);
  11502. ggml_tensor * cur;
  11503. ggml_tensor * inpL;
  11504. inpL = build_inp_embd(model.tok_embd);
  11505. // inp_pos - contains the positions
  11506. ggml_tensor * inp_pos = build_inp_pos();
  11507. auto * inp_attn = build_attn_inp_kv();
  11508. ggml_tensor * inp_out_ids = build_inp_out_ids();
  11509. for (int il = 0; il < n_layer; ++il) {
  11510. ggml_tensor * inpSA = inpL;
  11511. // norm
  11512. cur = build_norm(inpL,
  11513. model.layers[il].attn_norm, NULL,
  11514. LLM_NORM_RMS, il);
  11515. cb(cur, "attn_norm", il);
  11516. // self-attention
  11517. {
11518. // rope freq factors; may be nullptr for models that do not use them
  11519. ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
  11520. // compute Q and K and RoPE them
  11521. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  11522. cb(Qcur, "Qcur", il);
  11523. if (model.layers[il].bq) {
  11524. Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
  11525. cb(Qcur, "Qcur", il);
  11526. }
  11527. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  11528. cb(Kcur, "Kcur", il);
  11529. if (model.layers[il].bk) {
  11530. Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
  11531. cb(Kcur, "Kcur", il);
  11532. }
  11533. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  11534. cb(Vcur, "Vcur", il);
  11535. if (model.layers[il].bv) {
  11536. Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
  11537. cb(Vcur, "Vcur", il);
  11538. }
  11539. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  11540. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  11541. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  11542. Qcur = ggml_rope_ext(
  11543. ctx0, Qcur, inp_pos, rope_factors,
  11544. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  11545. ext_factor, attn_factor, beta_fast, beta_slow
  11546. );
  11547. Kcur = ggml_rope_ext(
  11548. ctx0, Kcur, inp_pos, rope_factors,
  11549. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  11550. ext_factor, attn_factor, beta_fast, beta_slow
  11551. );
  11552. cb(Qcur, "Qcur", il);
  11553. cb(Kcur, "Kcur", il);
  11554. cb(Vcur, "Vcur", il);
  11555. cur = build_attn(inp_attn,
  11556. model.layers[il].wo, model.layers[il].bo,
  11557. Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  11558. }
  11559. if (il == n_layer - 1 && inp_out_ids) {
  11560. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  11561. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  11562. }
  11563. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  11564. cb(ffn_inp, "ffn_inp", il);
  11565. // feed-forward network
  11566. cur = build_norm(ffn_inp,
  11567. model.layers[il].ffn_norm, NULL,
  11568. LLM_NORM_RMS, il);
  11569. cb(cur, "ffn_norm", il);
  11570. cur = build_ffn(cur,
  11571. model.layers[il].ffn_up, NULL, NULL,
  11572. model.layers[il].ffn_gate, NULL, NULL,
  11573. model.layers[il].ffn_down, NULL, NULL,
  11574. NULL,
  11575. LLM_FFN_SILU, LLM_FFN_PAR, il);
  11576. cb(cur, "ffn_out", il);
  11577. cur = ggml_add(ctx0, cur, ffn_inp);
  11578. cb(cur, "ffn_out", il);
  11579. cur = build_cvec(cur, il);
  11580. cb(cur, "l_out", il);
  11581. // input for next layer
  11582. inpL = cur;
  11583. }
  11584. cur = inpL;
  11585. cur = build_norm(cur,
  11586. model.output_norm, NULL,
  11587. LLM_NORM_RMS, -1);
  11588. cb(cur, "result_norm", -1);
  11589. res->t_embd = cur;
  11590. // lm_head
  11591. cur = build_lora_mm(model.output, cur);
  11592. cb(cur, "result_output", -1);
  11593. res->t_logits = cur;
  11594. ggml_build_forward_expand(gf, cur);
  11595. }
  11596. };
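// EXAONE-4: templated on iswa to select sliding-window or full KV attention; Q/K RMSNorm,
// RoPE only on SWA layers (or on all layers when SWA is disabled), and post-norms applied to the
// attention and FFN outputs before the residual adds.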
  11597. template <bool iswa>
  11598. struct llm_build_exaone4 : public llm_graph_context {
  11599. llm_build_exaone4(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  11600. const int64_t n_embd_head = hparams.n_embd_head_k;
  11601. GGML_ASSERT(n_embd_head == hparams.n_embd_head_v);
  11602. GGML_ASSERT(n_embd_head == hparams.n_rot);
  11603. ggml_tensor * cur;
  11604. ggml_tensor * inpL;
  11605. inpL = build_inp_embd(model.tok_embd);
  11606. // inp_pos - contains the positions
  11607. ggml_tensor * inp_pos = build_inp_pos();
  11608. using inp_attn_type = std::conditional_t<iswa, llm_graph_input_attn_kv_iswa, llm_graph_input_attn_kv>;
  11609. inp_attn_type * inp_attn = nullptr;
  11610. if constexpr (iswa) {
  11611. inp_attn = build_attn_inp_kv_iswa();
  11612. } else {
  11613. inp_attn = build_attn_inp_kv();
  11614. }
  11615. ggml_tensor * inp_out_ids = build_inp_out_ids();
  11616. for (int il = 0; il < n_layer; ++il) {
  11617. ggml_tensor * inpSA = inpL;
  11618. // use RoPE for SWA layers or non-SWA models
  11619. const bool use_rope = hparams.is_swa(il) || hparams.swa_type == LLAMA_SWA_TYPE_NONE;
  11620. cur = inpL;
  11621. // self-attention
  11622. {
  11623. ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
  11624. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  11625. cb(Qcur, "Qcur", il);
  11626. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  11627. cb(Kcur, "Kcur", il);
  11628. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  11629. cb(Vcur, "Vcur", il);
  11630. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  11631. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  11632. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  11633. Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
  11634. Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
  11635. cb(Qcur, "Qcur_normed", il);
  11636. cb(Kcur, "Kcur_normed", il);
  11637. if (use_rope) {
  11638. Qcur = ggml_rope_ext(
  11639. ctx0, Qcur, inp_pos, rope_factors,
  11640. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  11641. ext_factor, attn_factor, beta_fast, beta_slow
  11642. );
  11643. Kcur = ggml_rope_ext(
  11644. ctx0, Kcur, inp_pos, rope_factors,
  11645. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  11646. ext_factor, attn_factor, beta_fast, beta_slow
  11647. );
  11648. }
  11649. cb(Qcur, "Qcur", il);
  11650. cb(Kcur, "Kcur", il);
  11651. cb(Vcur, "Vcur", il);
  11652. cur = build_attn(inp_attn,
  11653. model.layers[il].wo, NULL,
  11654. Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  11655. cb(cur, "attn_out", il);
  11656. }
  11657. if (il == n_layer - 1 && inp_out_ids) {
  11658. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  11659. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  11660. }
  11661. cur = build_norm(cur,
  11662. model.layers[il].attn_post_norm, NULL,
  11663. LLM_NORM_RMS, il);
  11664. cb(cur, "attn_post_norm", il);
  11665. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  11666. cb(ffn_inp, "ffn_inp", il);
  11667. // feed-forward network
  11668. cur = build_ffn(ffn_inp,
  11669. model.layers[il].ffn_up, NULL, NULL,
  11670. model.layers[il].ffn_gate, NULL, NULL,
  11671. model.layers[il].ffn_down, NULL, NULL,
  11672. NULL,
  11673. LLM_FFN_SILU, LLM_FFN_PAR, il);
  11674. cb(cur, "ffn_out", il);
  11675. cur = build_norm(cur,
  11676. model.layers[il].ffn_post_norm, NULL,
11677. LLM_NORM_RMS, il);
11678. cb(cur, "ffn_post_norm", il);
  11679. cur = ggml_add(ctx0, cur, ffn_inp);
  11680. cur = build_cvec(cur, il);
  11681. cb(cur, "l_out", il);
  11682. // input for next layer
  11683. inpL = cur;
  11684. }
  11685. cur = inpL;
  11686. cur = build_norm(cur,
  11687. model.output_norm, NULL,
  11688. LLM_NORM_RMS, -1);
  11689. cb(cur, "result_norm", -1);
  11690. res->t_embd = cur;
  11691. // lm_head
  11692. cur = build_lora_mm(model.output, cur);
  11693. cb(cur, "result_output", -1);
  11694. res->t_logits = cur;
  11695. ggml_build_forward_expand(gf, cur);
  11696. }
  11697. };
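// RWKV-6 base: shared channel-mix (token-shift FFN) and time-mix (wkv6 linear attention with
// data-dependent decay) helpers used by the RWKV6 and RWKV6-Qwen2 builders.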
  11698. struct llm_build_rwkv6_base : public llm_graph_context {
  11699. const llama_model & model;
  11700. llm_build_rwkv6_base(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params), model(model) {
  11701. }
  11702. ggml_tensor * build_rwkv6_channel_mix(
  11703. const llama_layer * layer,
  11704. ggml_tensor * cur,
  11705. ggml_tensor * x_prev,
  11706. llm_arch arch) const {
  11707. ggml_tensor * sx = ggml_sub(ctx0, x_prev, cur);
  11708. switch (arch) {
  11709. case LLM_ARCH_RWKV6:
  11710. {
  11711. ggml_tensor * xk = ggml_add(ctx0, ggml_mul(ctx0, sx, layer->channel_mix_lerp_k), cur);
  11712. ggml_tensor * xr = ggml_add(ctx0, ggml_mul(ctx0, sx, layer->channel_mix_lerp_r), cur);
  11713. ggml_tensor * r = ggml_sigmoid(ctx0, build_lora_mm(layer->channel_mix_receptance, xr));
  11714. ggml_tensor * k = ggml_sqr(
  11715. ctx0,
  11716. ggml_relu(
  11717. ctx0,
  11718. build_lora_mm(layer->channel_mix_key, xk)
  11719. )
  11720. );
  11721. cur = ggml_mul(ctx0, r, build_lora_mm(layer->channel_mix_value, k));
  11722. } break;
  11723. default:
  11724. GGML_ABORT("fatal error");
  11725. }
  11726. return cur;
  11727. }
  11728. ggml_tensor * build_rwkv6_time_mix(
  11729. llm_graph_input_rs * inp,
  11730. ggml_tensor * cur,
  11731. ggml_tensor * x_prev,
  11732. const llama_ubatch & ubatch,
  11733. int il) const {
  11734. const auto * mctx_cur = static_cast<const llama_memory_recurrent_context *>(mctx);
  11735. const auto n_tokens = ubatch.n_tokens;
  11736. const auto n_seqs = ubatch.n_seqs;
  11737. const auto n_seq_tokens = ubatch.n_seq_tokens;
  11738. const auto n_embd = hparams.n_embd;
  11739. const auto head_size = hparams.wkv_head_size;
  11740. const auto n_head = n_embd / head_size;
  11741. const auto n_head_kv = hparams.n_head_kv(il);
  11742. const auto kv_head = mctx_cur->get_head();
  11743. const auto & layer = model.layers[il];
  11744. bool is_qrwkv = layer.time_mix_first == nullptr;
  11745. ggml_tensor * sx = ggml_sub(ctx0, x_prev, cur);
  11746. sx = ggml_reshape_2d(ctx0, sx, n_embd, n_tokens);
  11747. cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens);
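// low-rank token-shift: project the shifted-input delta through time_mix_w1/w2 into five
// per-channel interpolation vectors (w, k, v, r, g)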
  11748. ggml_tensor * xxx = ggml_add(ctx0, ggml_mul(ctx0, sx, layer.time_mix_lerp_x), cur);
  11749. xxx = ggml_reshape_4d(
  11750. ctx0,
  11751. ggml_tanh(
  11752. ctx0,
  11753. ggml_mul_mat(ctx0, layer.time_mix_w1, xxx)
  11754. ),
  11755. layer.time_mix_w1->ne[1] / 5, 1, 5, n_tokens
  11756. );
  11757. xxx = ggml_cont(ctx0, ggml_permute(ctx0, xxx, 0, 1, 3, 2));
  11758. xxx = ggml_mul_mat(
  11759. ctx0,
  11760. ggml_reshape_4d(
  11761. ctx0,
  11762. layer.time_mix_w2,
  11763. layer.time_mix_w2->ne[0], layer.time_mix_w2->ne[1], 1, 5
  11764. ),
  11765. xxx
  11766. );
  11767. ggml_tensor *xw, *xk, *xv, *xr, *xg;
  11768. if (layer.time_mix_lerp_fused) {
11769. // fusing these weights gives a small performance improvement
  11770. sx = ggml_reshape_3d(ctx0, sx, n_embd, 1, n_tokens);
  11771. cur = ggml_reshape_3d(ctx0, cur, n_embd, 1, n_tokens);
  11772. xxx = ggml_add(ctx0, ggml_mul(ctx0, ggml_add(ctx0, xxx, layer.time_mix_lerp_fused), sx), cur);
  11773. xw = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], 0);
  11774. xk = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * sizeof(float));
  11775. xv = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 2 * sizeof(float));
  11776. xr = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 3 * sizeof(float));
  11777. xg = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 4 * sizeof(float));
  11778. } else {
  11779. // for backward compatibility
  11780. xw = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], 0);
  11781. xk = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * sizeof(float));
  11782. xv = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 2 * sizeof(float));
  11783. xr = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 3 * sizeof(float));
  11784. xg = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 4 * sizeof(float));
  11785. xw = ggml_add(ctx0, ggml_mul(ctx0, ggml_add(ctx0, xw, layer.time_mix_lerp_w), sx), cur);
  11786. xk = ggml_add(ctx0, ggml_mul(ctx0, ggml_add(ctx0, xk, layer.time_mix_lerp_k), sx), cur);
  11787. xv = ggml_add(ctx0, ggml_mul(ctx0, ggml_add(ctx0, xv, layer.time_mix_lerp_v), sx), cur);
  11788. xr = ggml_add(ctx0, ggml_mul(ctx0, ggml_add(ctx0, xr, layer.time_mix_lerp_r), sx), cur);
  11789. xg = ggml_add(ctx0, ggml_mul(ctx0, ggml_add(ctx0, xg, layer.time_mix_lerp_g), sx), cur);
  11790. }
  11791. ggml_tensor * r = build_lora_mm(layer.time_mix_receptance, xr);
  11792. ggml_tensor * k = build_lora_mm(layer.time_mix_key, xk);
  11793. ggml_tensor * v = build_lora_mm(layer.time_mix_value, xv);
  11794. if (layer.time_mix_receptance_b) {
  11795. r = ggml_add(ctx0, r, layer.time_mix_receptance_b);
  11796. }
  11797. if (layer.time_mix_key_b) {
  11798. k = ggml_add(ctx0, k, layer.time_mix_key_b);
  11799. }
  11800. if (layer.time_mix_value_b) {
  11801. v = ggml_add(ctx0, v, layer.time_mix_value_b);
  11802. }
  11803. ggml_tensor * g = build_lora_mm(layer.time_mix_gate, xg);
  11804. if (is_qrwkv) {
  11805. g = ggml_sigmoid(ctx0, g);
  11806. } else {
  11807. g = ggml_silu(ctx0, g);
  11808. }
  11809. if (n_head_kv != 0 && n_head_kv != n_head) {
  11810. GGML_ASSERT(n_head % n_head_kv == 0);
  11811. k = ggml_reshape_4d(ctx0, k, head_size, 1, n_head_kv, n_tokens);
  11812. v = ggml_reshape_4d(ctx0, v, head_size, 1, n_head_kv, n_tokens);
  11813. ggml_tensor * tmp = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, head_size, n_head / n_head_kv, n_head_kv, n_tokens);
  11814. k = ggml_repeat(ctx0, k, tmp);
  11815. v = ggml_repeat(ctx0, v, tmp);
  11816. }
  11817. k = ggml_reshape_3d(ctx0, k, head_size, n_head, n_tokens);
  11818. v = ggml_reshape_3d(ctx0, v, head_size, n_head, n_tokens);
  11819. r = ggml_reshape_3d(ctx0, r, head_size, n_head, n_tokens);
  11820. ggml_tensor * w = ggml_mul_mat(
  11821. ctx0,
  11822. layer.time_mix_decay_w2,
  11823. ggml_tanh(
  11824. ctx0,
  11825. ggml_mul_mat(ctx0, layer.time_mix_decay_w1, xw)
  11826. )
  11827. );
  11828. w = ggml_add(ctx0, w, layer.time_mix_decay);
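// decay: w = exp(-exp(w)), which keeps the per-channel decay in (0, 1)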
  11829. w = ggml_exp(ctx0, ggml_neg(ctx0, ggml_exp(ctx0, w)));
  11830. w = ggml_reshape_3d(ctx0, w, head_size, n_head, n_tokens);
  11831. if (is_qrwkv) {
  11832. // k = k * (1 - w)
  11833. k = ggml_sub(ctx0, k, ggml_mul(ctx0, k, w));
  11834. }
  11835. ggml_tensor * wkv_state = build_rs(
  11836. inp, mctx_cur->get_s_l(il),
  11837. hparams.n_embd_s(), n_seqs);
  11838. ggml_tensor * wkv_output;
  11839. if (is_qrwkv) {
  11840. wkv_output = ggml_gated_linear_attn(ctx0, k, v, r, w, wkv_state, pow(head_size, -0.5f));
  11841. } else {
  11842. wkv_output = ggml_rwkv_wkv6(ctx0, k, v, r, layer.time_mix_first, w, wkv_state);
  11843. }
  11844. cur = ggml_view_1d(ctx0, wkv_output, n_embd * n_tokens, 0);
  11845. wkv_state = ggml_view_1d(ctx0, wkv_output, n_embd * head_size * n_seqs, n_embd * n_tokens * sizeof(float));
  11846. ggml_build_forward_expand(
  11847. gf,
  11848. ggml_cpy(
  11849. ctx0,
  11850. wkv_state,
  11851. ggml_view_1d(
  11852. ctx0,
  11853. mctx_cur->get_s_l(il),
  11854. hparams.n_embd_s() * n_seqs,
  11855. hparams.n_embd_s() * kv_head * ggml_element_size(mctx_cur->get_s_l(il))
  11856. )
  11857. )
  11858. );
  11859. if (!is_qrwkv) {
  11860. // group norm with head_count groups
  11861. cur = ggml_reshape_3d(ctx0, cur, n_embd / n_head, n_head, n_tokens);
  11862. cur = ggml_norm(ctx0, cur, 64e-5f);
  11863. // Convert back to regular vectors.
  11864. cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens);
  11865. cur = ggml_add(ctx0, ggml_mul(ctx0, cur, layer.time_mix_ln), layer.time_mix_ln_b);
  11866. } else {
  11867. cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens);
  11868. }
  11869. cur = ggml_mul(ctx0, cur, g);
  11870. cur = build_lora_mm(layer.time_mix_output, cur);
  11871. return ggml_reshape_3d(ctx0, cur, n_embd, n_seq_tokens, n_seqs);
  11872. }
  11873. };
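// RWKV-6: the per-layer token-shift state holds two slots (attention shift and FFN shift),
// hence the token_shift_count == 2 assertion below.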
  11874. struct llm_build_rwkv6 : public llm_build_rwkv6_base {
  11875. llm_build_rwkv6(const llama_model & model, const llm_graph_params & params) : llm_build_rwkv6_base(model, params) {
  11876. GGML_ASSERT(hparams.token_shift_count == 2);
  11877. ggml_tensor * cur;
  11878. ggml_tensor * inpL;
  11879. inpL = build_inp_embd(model.tok_embd);
  11880. inpL = build_norm(inpL, model.tok_norm, model.tok_norm_b, LLM_NORM, -1);
  11881. auto * rs_inp = build_rs_inp();
  11882. const auto n_embd = hparams.n_embd;
  11883. const auto n_seq_tokens = ubatch.n_seq_tokens;
  11884. const auto n_seqs = ubatch.n_seqs;
  11885. ggml_tensor * inp_out_ids = build_inp_out_ids();
  11886. for (int il = 0; il < n_layer; ++il) {
  11887. const llama_layer * layer = &model.layers[il];
  11888. inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs);
  11889. ggml_tensor * token_shift = build_rwkv_token_shift_load(rs_inp, ubatch, il);
  11890. ggml_tensor * att_shift = ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], 0);
  11891. ggml_tensor * ffn_shift = ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], n_embd * ggml_element_size(token_shift));
  11892. ggml_tensor * att_norm = build_norm(inpL, layer->attn_norm, layer->attn_norm_b, LLM_NORM, il);
  11893. cb(att_norm, "attn_norm", il);
  11894. ggml_tensor * x_prev = ggml_concat(
  11895. ctx0,
  11896. att_shift,
  11897. ggml_view_3d(ctx0, att_norm, n_embd, n_seq_tokens - 1, n_seqs, att_norm->nb[1], att_norm->nb[2], 0),
  11898. 1
  11899. );
  11900. cur = build_rwkv6_time_mix(rs_inp, att_norm, x_prev, ubatch, il);
  11901. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
  11902. cb(ffn_inp, "ffn_inp", il);
  11903. ggml_tensor * ffn_norm = build_norm(ffn_inp, layer->attn_norm_2, layer->attn_norm_2_b, LLM_NORM, il);
  11904. cb(ffn_norm, "ffn_norm", il);
  11905. x_prev = ggml_concat(
  11906. ctx0,
  11907. ffn_shift,
  11908. ggml_view_3d(ctx0, ffn_norm, n_embd, n_seq_tokens - 1, n_seqs, ffn_norm->nb[1], ffn_norm->nb[2], 0),
  11909. 1
  11910. );
  11911. token_shift = ggml_concat(ctx0,
  11912. ggml_view_3d(ctx0, att_norm, n_embd, 1, n_seqs, att_norm->nb[1], att_norm->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(att_norm)),
  11913. ggml_view_3d(ctx0, ffn_norm, n_embd, 1, n_seqs, ffn_norm->nb[1], ffn_norm->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(ffn_norm)),
  11914. 1
  11915. );
  11916. ggml_build_forward_expand(gf, build_rwkv_token_shift_store(token_shift, ubatch, il));
  11917. ffn_inp = ggml_reshape_2d(ctx0, ffn_inp, n_embd, n_tokens);
  11918. ffn_norm = ggml_reshape_2d(ctx0, ffn_norm, n_embd, n_tokens);
  11919. x_prev = ggml_reshape_2d(ctx0, x_prev, n_embd, n_tokens);
  11920. cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens);
  11921. if (il == n_layer - 1 && inp_out_ids) {
  11922. ffn_inp = ggml_get_rows(ctx0, ffn_inp, inp_out_ids);
  11923. ffn_norm = ggml_get_rows(ctx0, ffn_norm, inp_out_ids);
  11924. x_prev = ggml_get_rows(ctx0, x_prev, inp_out_ids);
  11925. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  11926. }
  11927. cur = build_rwkv6_channel_mix(layer, ffn_norm, x_prev, LLM_ARCH_RWKV6);
  11928. cur = ggml_add(ctx0, cur, ffn_inp);
  11929. if (hparams.rescale_every_n_layers != 0 && (il + 1) % hparams.rescale_every_n_layers == 0) {
  11930. cur = ggml_scale(ctx0, cur, 0.5F);
  11931. }
  11932. cur = build_cvec(cur, il);
  11933. cb(cur, "l_out", il);
  11934. // input for next layer
  11935. inpL = cur;
  11936. }
  11937. cur = inpL;
  11938. cur = build_norm(cur, model.output_norm, model.output_norm_b, LLM_NORM, -1);
  11939. cb(cur, "result_norm", -1);
  11940. res->t_embd = cur;
  11941. cur = build_lora_mm(model.output, cur);
  11942. cb(cur, "result_output", -1);
  11943. res->t_logits = cur;
  11944. ggml_build_forward_expand(gf, cur);
  11945. }
  11946. };
  11947. // ref: https://huggingface.co/recursal/QRWKV6-32B-Instruct-Preview-v0.1/blob/main/modeling_rwkv6qwen2.py
  11948. struct llm_build_rwkv6qwen2 : public llm_build_rwkv6_base {
  11949. llm_build_rwkv6qwen2(const llama_model & model, const llm_graph_params & params) : llm_build_rwkv6_base(model, params) {
  11950. GGML_ASSERT(n_embd == hparams.n_embd_r());
  11951. ggml_tensor * cur;
  11952. ggml_tensor * inpL;
  11953. inpL = build_inp_embd(model.tok_embd);
  11954. auto * rs_inp = build_rs_inp();
  11955. const auto n_embd = hparams.n_embd;
  11956. const auto n_seq_tokens = ubatch.n_seq_tokens;
  11957. const auto n_seqs = ubatch.n_seqs;
  11958. ggml_tensor * inp_out_ids = build_inp_out_ids();
  11959. for (int il = 0; il < n_layer; ++il) {
  11960. const llama_layer * layer = &model.layers[il];
  11961. inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs);
  11962. ggml_tensor * token_shift = build_rwkv_token_shift_load(rs_inp, ubatch, il);
  11963. ggml_tensor * att_norm = build_norm(inpL, layer->attn_norm, layer->attn_norm_b, LLM_NORM_RMS, il);
  11964. cb(att_norm, "attn_norm", il);
  11965. ggml_tensor * x_prev = ggml_concat(
  11966. ctx0,
  11967. token_shift,
  11968. ggml_view_3d(ctx0, att_norm, n_embd, n_seq_tokens - 1, n_seqs, att_norm->nb[1], att_norm->nb[2], 0),
  11969. 1
  11970. );
  11971. cur = build_rwkv6_time_mix(rs_inp, att_norm, x_prev, ubatch, il);
  11972. token_shift = ggml_view_3d(ctx0, att_norm, n_embd, 1, n_seqs, att_norm->nb[1], att_norm->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(att_norm));
  11973. ggml_build_forward_expand(gf, build_rwkv_token_shift_store(token_shift, ubatch, il));
  11974. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
  11975. cb(ffn_inp, "ffn_inp", il);
  11976. cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens);
  11977. ffn_inp = ggml_reshape_2d(ctx0, ffn_inp, n_embd, n_tokens);
  11978. if (il == n_layer - 1 && inp_out_ids) {
  11979. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  11980. ffn_inp = ggml_get_rows(ctx0, ffn_inp, inp_out_ids);
  11981. }
  11982. // feed-forward network
  11983. cur = build_norm(ffn_inp,
  11984. model.layers[il].ffn_norm, NULL,
  11985. LLM_NORM_RMS, il);
  11986. cb(cur, "ffn_norm", il);
  11987. cur = build_ffn(cur,
  11988. model.layers[il].ffn_up, NULL, NULL,
  11989. model.layers[il].ffn_gate, NULL, NULL,
  11990. model.layers[il].ffn_down, NULL, NULL,
  11991. NULL,
  11992. LLM_FFN_SILU, LLM_FFN_PAR, il);
  11993. cb(cur, "ffn_out", il);
  11994. cur = ggml_add(ctx0, cur, ffn_inp);
  11995. cur = build_cvec(cur, il);
  11996. cb(cur, "l_out", il);
  11997. // input for next layer
  11998. inpL = cur;
  11999. }
  12000. cur = inpL;
  12001. cur = build_norm(cur, model.output_norm, model.output_norm_b, LLM_NORM_RMS, -1);
  12002. cb(cur, "result_norm", -1);
  12003. res->t_embd = cur;
  12004. cur = build_lora_mm(model.output, cur);
  12005. cb(cur, "result_output", -1);
  12006. res->t_logits = cur;
  12007. ggml_build_forward_expand(gf, cur);
  12008. }
  12009. };
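// RWKV-7 base: the channel mix drops RWKV-6's receptance gate; the time mix uses a fused token-shift
// interpolation, a value residual taken from the first layer, and an extra gate `a` on the key.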
  12010. struct llm_build_rwkv7_base : public llm_graph_context {
  12011. const llama_model & model;
  12012. llm_build_rwkv7_base(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params), model(model) {
  12013. }
  12014. ggml_tensor * build_rwkv7_channel_mix(
  12015. const llama_layer * layer,
  12016. ggml_tensor * cur,
  12017. ggml_tensor * x_prev,
  12018. llm_arch arch) const {
  12019. ggml_tensor * sx = ggml_sub(ctx0, x_prev, cur);
  12020. switch (arch) {
  12021. case LLM_ARCH_RWKV7:
  12022. {
  12023. ggml_tensor * xk = ggml_add(ctx0, ggml_mul(ctx0, sx, layer->channel_mix_lerp_k), cur);
  12024. ggml_tensor * k = ggml_sqr(
  12025. ctx0,
  12026. ggml_relu(
  12027. ctx0,
  12028. build_lora_mm(layer->channel_mix_key, xk)
  12029. )
  12030. );
  12031. cur = build_lora_mm(layer->channel_mix_value, k);
  12032. } break;
  12033. default:
  12034. GGML_ABORT("fatal error");
  12035. }
  12036. return cur;
  12037. }
  12038. ggml_tensor * build_rwkv7_time_mix(
  12039. llm_graph_input_rs * inp,
  12040. ggml_tensor * cur,
  12041. ggml_tensor * x_prev,
  12042. ggml_tensor *& first_layer_value,
  12043. const llama_ubatch & ubatch,
  12044. int il) const {
  12045. const auto * mctx_cur = static_cast<const llama_memory_recurrent_context *>(mctx);
  12046. const auto n_tokens = ubatch.n_tokens;
  12047. const auto n_seqs = ubatch.n_seqs;
  12048. const auto n_embd = hparams.n_embd;
  12049. const auto head_size = hparams.wkv_head_size;
  12050. const auto head_count = n_embd / head_size;
  12051. const auto n_seq_tokens = ubatch.n_seq_tokens;
  12052. const auto kv_head = mctx_cur->get_head();
  12053. const auto & layer = model.layers[il];
  12054. bool has_gating = layer.time_mix_g1 && layer.time_mix_g2;
  12055. ggml_tensor * sx = ggml_sub(ctx0, x_prev, cur);
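// broadcast the token-shift delta across the fused lerp coefficients: one slice each for r, w, k, v, a and (optionally) g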
  12056. ggml_tensor * dummy = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_embd, n_seq_tokens, n_seqs, has_gating ? 6 : 5);
  12057. sx = ggml_repeat(ctx0, sx, dummy);
  12058. ggml_tensor * xxx = ggml_add(ctx0, ggml_mul(ctx0, sx, layer.time_mix_lerp_fused), cur);
  12059. ggml_tensor * xr = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], 0);
  12060. ggml_tensor * xw = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * sizeof(float));
  12061. ggml_tensor * xk = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 2 * sizeof(float));
  12062. ggml_tensor * xv = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 3 * sizeof(float));
  12063. ggml_tensor * xa = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 4 * sizeof(float));
  12064. ggml_tensor * xg = has_gating ? ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 5 * sizeof(float)) : nullptr;
  12065. ggml_tensor * r = build_lora_mm(layer.time_mix_receptance, xr);
  12066. ggml_tensor * w = ggml_add(
  12067. ctx0,
  12068. ggml_mul_mat(ctx0, layer.time_mix_w2, ggml_tanh(ctx0, ggml_mul_mat(ctx0, layer.time_mix_w1, xw))),
  12069. layer.time_mix_w0
  12070. );
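// per-channel decay w = exp(-exp(-0.5) * sigmoid(w)); the constant -0.606531 is -exp(-0.5)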
  12071. w = ggml_exp(ctx0, ggml_scale(ctx0, ggml_sigmoid(ctx0, w), -0.606531));
  12072. ggml_tensor * k = build_lora_mm(layer.time_mix_key, xk);
  12073. ggml_tensor * v = build_lora_mm(layer.time_mix_value, xv);
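// value residual: layers after the first mix their value with the first layer's value (v_first),
// gated by sigmoid(v0 + v2·v1·xv)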
  12074. if (first_layer_value == nullptr) {
  12075. first_layer_value = v;
  12076. } else {
  12077. // Add the first layer value as a residual connection.
  12078. v = ggml_add(ctx0, v,
  12079. ggml_mul(ctx0,
  12080. ggml_sub(ctx0, first_layer_value, v),
  12081. ggml_sigmoid(ctx0, ggml_add(ctx0,
  12082. ggml_mul_mat(ctx0, layer.time_mix_v2, ggml_mul_mat(ctx0, layer.time_mix_v1, xv)),
  12083. layer.time_mix_v0
  12084. )
  12085. )
  12086. )
  12087. );
  12088. }
  12089. ggml_tensor * g = nullptr;
  12090. if (layer.time_mix_g1 && layer.time_mix_g2) {
  12091. g = ggml_mul_mat(ctx0, layer.time_mix_g2, ggml_sigmoid(ctx0, ggml_mul_mat(ctx0, layer.time_mix_g1, xg)));
  12092. }
  12093. ggml_tensor * a = ggml_sigmoid(ctx0,
  12094. ggml_add(
  12095. ctx0,
  12096. ggml_mul_mat(ctx0, layer.time_mix_a2, ggml_mul_mat(ctx0, layer.time_mix_a1, xa)),
  12097. layer.time_mix_a0
  12098. )
  12099. );
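// kk: per-head L2-normalized key; k is interpolated towards a*k by the learned k_a mix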
  12100. ggml_tensor * kk = ggml_reshape_3d(ctx0, ggml_mul(ctx0, k, layer.time_mix_k_k), head_size, head_count, n_tokens);
  12101. kk = ggml_l2_norm(ctx0, kk, 1e-12);
  12102. ggml_tensor * ka = ggml_mul(ctx0, k, layer.time_mix_k_a);
  12103. k = ggml_add(ctx0, k, ggml_sub(ctx0, ggml_mul(ctx0, a, ka), ka));
  12104. r = ggml_reshape_3d(ctx0, r, head_size, head_count, n_tokens);
  12105. w = ggml_reshape_3d(ctx0, w, head_size, head_count, n_tokens);
  12106. k = ggml_reshape_3d(ctx0, k, head_size, head_count, n_tokens);
  12107. v = ggml_reshape_3d(ctx0, v, head_size, head_count, n_tokens);
  12108. a = ggml_reshape_3d(ctx0, a, head_size, head_count, n_tokens);
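// load the per-sequence recurrent wkv state, run the wkv7 kernel (-kk acts as the state-removal direction,
// kk*a as the replacement direction), then copy the updated state back into the recurrent memory at kv_head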
  12109. ggml_tensor * wkv_state = build_rs(
  12110. inp, mctx_cur->get_s_l(il),
  12111. hparams.n_embd_s(), n_seqs);
  12112. ggml_tensor * wkv_output = ggml_rwkv_wkv7(ctx0, r, w, k, v, ggml_neg(ctx0, kk), ggml_mul(ctx0, kk, a), wkv_state);
  12113. cur = ggml_view_1d(ctx0, wkv_output, n_embd * n_tokens, 0);
  12114. wkv_state = ggml_view_1d(ctx0, wkv_output, n_embd * head_size * n_seqs, n_embd * n_tokens * sizeof(float));
  12115. ggml_build_forward_expand(
  12116. gf,
  12117. ggml_cpy(
  12118. ctx0,
  12119. wkv_state,
  12120. ggml_view_1d(
  12121. ctx0,
  12122. mctx_cur->get_s_l(il),
  12123. hparams.n_embd_s() * n_seqs,
  12124. hparams.n_embd_s() * kv_head * ggml_element_size(mctx_cur->get_s_l(il))
  12125. )
  12126. )
  12127. );
  12128. if (layer.time_mix_ln && layer.time_mix_ln_b) {
  12129. // group norm with head_count groups
  12130. cur = ggml_reshape_3d(ctx0, cur, n_embd / head_count, head_count, n_tokens);
  12131. cur = ggml_norm(ctx0, cur, 64e-5f);
  12132. // Convert back to regular vectors.
  12133. cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens);
  12134. cur = ggml_add(ctx0, ggml_mul(ctx0, cur, layer.time_mix_ln), layer.time_mix_ln_b);
  12135. } else {
  12136. cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens);
  12137. }
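// bonus term: per-head (r·k) weighted by r_k, multiplied by v and added to the output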
  12138. ggml_tensor * rk = ggml_sum_rows(ctx0,
  12139. ggml_mul(ctx0, ggml_mul(ctx0, k, r), ggml_reshape_2d(ctx0, layer.time_mix_r_k, head_size, head_count)));
  12140. cur = ggml_add(ctx0, cur, ggml_reshape_2d(ctx0, ggml_mul(ctx0, v, rk), n_embd, n_tokens));
  12141. if (has_gating) {
  12142. cur = ggml_mul(ctx0, cur, g);
  12143. }
  12144. cur = build_lora_mm(layer.time_mix_output, cur);
  12145. return ggml_reshape_3d(ctx0, cur, n_embd, n_seq_tokens, n_seqs);
  12146. }
  12147. };
  12148. struct llm_build_rwkv7 : public llm_build_rwkv7_base {
  12149. llm_build_rwkv7(const llama_model & model, const llm_graph_params & params) : llm_build_rwkv7_base(model, params) {
  12150. GGML_ASSERT(hparams.token_shift_count == 2);
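// two token-shift states per layer: slot 0 feeds the time-mix, slot 1 the channel-mix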
  12151. ggml_tensor * cur;
  12152. ggml_tensor * inpL;
  12153. ggml_tensor * v_first = nullptr;
  12154. inpL = build_inp_embd(model.tok_embd);
  12155. inpL = build_norm(inpL, model.tok_norm, model.tok_norm_b, LLM_NORM, -1);
  12156. auto * rs_inp = build_rs_inp();
  12157. const auto n_embd = hparams.n_embd;
  12158. const auto n_seq_tokens = ubatch.n_seq_tokens;
  12159. const auto n_seqs = ubatch.n_seqs;
  12160. ggml_tensor * inp_out_ids = build_inp_out_ids();
  12161. for (int il = 0; il < n_layer; ++il) {
  12162. const llama_layer * layer = &model.layers[il];
  12163. inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs);
  12164. ggml_tensor * token_shift = build_rwkv_token_shift_load(rs_inp, ubatch, il);
  12165. ggml_tensor * att_shift = ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], 0);
  12166. ggml_tensor * ffn_shift = ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], n_embd * ggml_element_size(token_shift));
  12167. ggml_tensor * att_norm = build_norm(inpL, layer->attn_norm, layer->attn_norm_b, LLM_NORM, il);
  12168. cb(att_norm, "attn_norm", il);
  12169. ggml_tensor * x_prev = ggml_concat(
  12170. ctx0,
  12171. att_shift,
  12172. ggml_view_3d(ctx0, att_norm, n_embd, n_seq_tokens - 1, n_seqs, att_norm->nb[1], att_norm->nb[2], 0),
  12173. 1
  12174. );
  12175. cur = build_rwkv7_time_mix(rs_inp, att_norm, x_prev, v_first, ubatch, il);
  12176. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
  12177. cb(ffn_inp, "ffn_inp", il);
  12178. ggml_tensor * ffn_norm = build_norm(ffn_inp, layer->attn_norm_2, layer->attn_norm_2_b, LLM_NORM, il);
  12179. cb(ffn_norm, "ffn_norm", il);
  12180. x_prev = ggml_concat(
  12181. ctx0,
  12182. ffn_shift,
  12183. ggml_view_3d(ctx0, ffn_norm, n_embd, n_seq_tokens - 1, n_seqs, ffn_norm->nb[1], ffn_norm->nb[2], 0),
  12184. 1
  12185. );
  12186. token_shift = ggml_concat(ctx0,
  12187. ggml_view_3d(ctx0, att_norm, n_embd, 1, n_seqs, att_norm->nb[1], att_norm->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(att_norm)),
  12188. ggml_view_3d(ctx0, ffn_norm, n_embd, 1, n_seqs, ffn_norm->nb[1], ffn_norm->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(ffn_norm)),
  12189. 1
  12190. );
  12191. ggml_build_forward_expand(gf, build_rwkv_token_shift_store(token_shift, ubatch, il));
  12192. ffn_inp = ggml_reshape_2d(ctx0, ffn_inp, n_embd, n_tokens);
  12193. ffn_norm = ggml_reshape_2d(ctx0, ffn_norm, n_embd, n_tokens);
  12194. x_prev = ggml_reshape_2d(ctx0, x_prev, n_embd, n_tokens);
  12195. if (il == n_layer - 1 && inp_out_ids) {
  12196. ffn_inp = ggml_get_rows(ctx0, ffn_inp, inp_out_ids);
  12197. ffn_norm = ggml_get_rows(ctx0, ffn_norm, inp_out_ids);
  12198. x_prev = ggml_get_rows(ctx0, x_prev, inp_out_ids);
  12199. }
  12200. cur = build_rwkv7_channel_mix(layer, ffn_norm, x_prev, LLM_ARCH_RWKV7);
  12201. cur = ggml_add(ctx0, cur, ffn_inp);
  12202. cur = build_cvec(cur, il);
  12203. cb(cur, "l_out", il);
  12204. // input for next layer
  12205. inpL = cur;
  12206. }
  12207. cur = inpL;
  12208. cur = build_norm(cur, model.output_norm, model.output_norm_b, LLM_NORM, -1);
  12209. cb(cur, "result_norm", -1);
  12210. res->t_embd = cur;
  12211. cur = build_lora_mm(model.output, cur);
  12212. cb(cur, "result_output", -1);
  12213. res->t_logits = cur;
  12214. ggml_build_forward_expand(gf, cur);
  12215. }
  12216. };
  12217. struct llm_build_arwkv7 : public llm_build_rwkv7_base {
  12218. llm_build_arwkv7(const llama_model & model, const llm_graph_params & params) : llm_build_rwkv7_base(model, params) {
  12219. GGML_ASSERT(n_embd == hparams.n_embd_r());
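// ARWKV7: RWKV7 time-mix paired with RMS norms and a SwiGLU feed-forward instead of the RWKV channel-mix;
// a single token-shift state per layer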
  12220. ggml_tensor * cur;
  12221. ggml_tensor * inpL;
  12222. ggml_tensor * v_first = nullptr;
  12223. inpL = build_inp_embd(model.tok_embd);
  12224. auto * rs_inp = build_rs_inp();
  12225. const auto n_embd = hparams.n_embd;
  12226. const auto n_seq_tokens = ubatch.n_seq_tokens;
  12227. const auto n_seqs = ubatch.n_seqs;
  12228. ggml_tensor * inp_out_ids = build_inp_out_ids();
  12229. for (int il = 0; il < n_layer; ++il) {
  12230. const llama_layer * layer = &model.layers[il];
  12231. inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs);
  12232. ggml_tensor * token_shift = build_rwkv_token_shift_load(rs_inp, ubatch, il);
  12233. ggml_tensor * att_norm = build_norm(inpL, layer->attn_norm, layer->attn_norm_b, LLM_NORM_RMS, il);
  12234. cb(att_norm, "attn_norm", il);
  12235. ggml_tensor * x_prev = ggml_concat(
  12236. ctx0,
  12237. token_shift,
  12238. ggml_view_3d(ctx0, att_norm, n_embd, n_seq_tokens - 1, n_seqs, att_norm->nb[1], att_norm->nb[2], 0),
  12239. 1
  12240. );
  12241. cur = build_rwkv7_time_mix(rs_inp, att_norm, x_prev, v_first, ubatch, il);
  12242. token_shift = ggml_view_3d(ctx0, att_norm, n_embd, 1, n_seqs, att_norm->nb[1], att_norm->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(att_norm));
  12243. ggml_build_forward_expand(gf, build_rwkv_token_shift_store(token_shift, ubatch, il));
  12244. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
  12245. cb(ffn_inp, "ffn_inp", il);
  12246. cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens);
  12247. ffn_inp = ggml_reshape_2d(ctx0, ffn_inp, n_embd, n_tokens);
  12248. if (il == n_layer - 1 && inp_out_ids) {
  12249. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  12250. ffn_inp = ggml_get_rows(ctx0, ffn_inp, inp_out_ids);
  12251. }
  12252. // feed-forward network
  12253. cur = build_norm(ffn_inp,
  12254. model.layers[il].ffn_norm, NULL,
  12255. LLM_NORM_RMS, il);
  12256. cb(cur, "ffn_norm", il);
  12257. cur = build_ffn(cur,
  12258. model.layers[il].ffn_up, NULL, NULL,
  12259. model.layers[il].ffn_gate, NULL, NULL,
  12260. model.layers[il].ffn_down, NULL, NULL,
  12261. NULL,
  12262. LLM_FFN_SILU, LLM_FFN_PAR, il);
  12263. cb(cur, "ffn_out", il);
  12264. cur = ggml_add(ctx0, cur, ffn_inp);
  12265. cur = build_cvec(cur, il);
  12266. cb(cur, "l_out", il);
  12267. // input for next layer
  12268. inpL = cur;
  12269. }
  12270. cur = inpL;
  12271. cur = build_norm(cur, model.output_norm, model.output_norm_b, LLM_NORM_RMS, -1);
  12272. cb(cur, "result_norm", -1);
  12273. res->t_embd = cur;
  12274. cur = build_lora_mm(model.output, cur);
  12275. cb(cur, "result_output", -1);
  12276. res->t_logits = cur;
  12277. ggml_build_forward_expand(gf, cur);
  12278. }
  12279. };
  12280. struct llm_build_granite : public llm_graph_context {
  12281. llm_build_granite(
  12282. const llama_model & model,
  12283. const llm_graph_params & params)
  12284. : llm_graph_context(params) {
  12285. const int64_t n_embd_head = hparams.n_embd_head_v;
  12286. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  12287. GGML_ASSERT(n_embd_head == hparams.n_rot);
  12288. ggml_tensor * cur;
  12289. ggml_tensor * inpL;
  12290. inpL = build_inp_embd(model.tok_embd);
  12291. // inp_pos - built only if rope enabled
  12292. ggml_tensor * inp_pos = nullptr;
  12293. if (hparams.rope_finetuned) {
  12294. inp_pos = build_inp_pos();
  12295. }
  12296. auto * inp_attn = build_attn_inp_kv();
  12297. ggml_tensor * inp_out_ids = build_inp_out_ids();
  12298. for (int il = 0; il < n_layer; ++il) {
  12299. ggml_tensor * inpSA = inpL;
  12300. // norm
  12301. cur = build_norm(inpL,
  12302. model.layers[il].attn_norm, NULL,
  12303. LLM_NORM_RMS, il);
  12304. cb(cur, "attn_norm", il);
  12305. // self-attention
  12306. cur = build_attention_layer(
  12307. cur, inp_pos, inp_attn,
  12308. model, n_embd_head, il);
  12309. if (il == n_layer - 1 && inp_out_ids) {
  12310. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  12311. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  12312. }
  12313. // ffn
  12314. cur = build_layer_ffn(cur, inpSA, model, il);
  12315. // input for next layer
  12316. inpL = cur;
  12317. }
  12318. cur = inpL;
  12319. cur = build_norm(cur,
  12320. model.output_norm, NULL,
  12321. LLM_NORM_RMS, -1);
  12322. cb(cur, "result_norm", -1);
  12323. res->t_embd = cur;
  12324. // lm_head
  12325. cur = build_lora_mm(model.output, cur);
  12326. // For Granite architectures - scale logits
  12327. cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_logit_scale);
  12328. cb(cur, "result_output", -1);
  12329. res->t_logits = cur;
  12330. ggml_build_forward_expand(gf, cur);
  12331. }
  12332. ggml_tensor * build_attention_layer(
  12333. ggml_tensor * cur,
  12334. ggml_tensor * inp_pos,
  12335. llm_graph_input_attn_kv * inp_attn,
  12336. const llama_model & model,
  12337. const int64_t n_embd_head,
  12338. const int il) {
  12339. // compute Q and K and (optionally) RoPE them
  12340. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  12341. cb(Qcur, "Qcur", il);
  12342. if (model.layers[il].bq) {
  12343. Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
  12344. cb(Qcur, "Qcur", il);
  12345. }
  12346. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  12347. cb(Kcur, "Kcur", il);
  12348. if (model.layers[il].bk) {
  12349. Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
  12350. cb(Kcur, "Kcur", il);
  12351. }
  12352. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  12353. cb(Vcur, "Vcur", il);
  12354. if (model.layers[il].bv) {
  12355. Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
  12356. cb(Vcur, "Vcur", il);
  12357. }
  12358. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, hparams.n_head(il), n_tokens);
  12359. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, hparams.n_head_kv(il), n_tokens);
  12360. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, hparams.n_head_kv(il), n_tokens);
  12361. const bool use_rope = hparams.rope_finetuned;
  12362. if (use_rope) {
  12363. ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
  12364. Qcur = ggml_rope_ext(
  12365. ctx0, Qcur, inp_pos, rope_factors,
  12366. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  12367. ext_factor, attn_factor, beta_fast, beta_slow
  12368. );
  12369. Kcur = ggml_rope_ext(
  12370. ctx0, Kcur, inp_pos, rope_factors,
  12371. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  12372. ext_factor, attn_factor, beta_fast, beta_slow
  12373. );
  12374. }
  12375. cb(Qcur, "Qcur", il);
  12376. cb(Kcur, "Kcur", il);
  12377. cb(Vcur, "Vcur", il);
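// Granite supplies a fixed attention multiplier (f_attention_scale); fall back to 1/sqrt(n_embd_head) when unset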
  12378. const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
  12379. cur = build_attn(inp_attn,
  12380. model.layers[il].wo, model.layers[il].bo,
  12381. Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
  12382. cb(cur, "attn_out", il);
  12383. return cur;
  12384. }
  12385. ggml_tensor * build_layer_ffn(
  12386. ggml_tensor * cur,
  12387. ggml_tensor * inpSA,
  12388. const llama_model & model,
  12389. const int il) {
  12390. // For Granite architectures - scale residual
  12391. if (hparams.f_residual_scale) {
  12392. cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
  12393. }
  12394. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  12395. cb(ffn_inp, "ffn_inp", il);
  12396. // feed-forward network (non-MoE)
  12397. if (model.layers[il].ffn_gate_inp == nullptr) {
  12398. cur = build_norm(ffn_inp,
  12399. model.layers[il].ffn_norm, NULL,
  12400. LLM_NORM_RMS, il);
  12401. cb(cur, "ffn_norm", il);
  12402. cur = build_ffn(cur,
  12403. model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
  12404. model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
  12405. model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
  12406. NULL,
  12407. LLM_FFN_SILU, LLM_FFN_PAR, il);
  12408. cb(cur, "ffn_out", il);
  12409. } else {
  12410. // MoE branch
  12411. cur = build_norm(ffn_inp,
  12412. model.layers[il].ffn_norm, NULL,
  12413. LLM_NORM_RMS, il);
  12414. cb(cur, "ffn_norm", il);
  12415. ggml_tensor * moe_out = build_moe_ffn(cur,
  12416. model.layers[il].ffn_gate_inp,
  12417. model.layers[il].ffn_up_exps,
  12418. model.layers[il].ffn_gate_exps,
  12419. model.layers[il].ffn_down_exps,
  12420. nullptr,
  12421. n_expert, n_expert_used,
  12422. LLM_FFN_SILU, true,
  12423. false, 0.0,
  12424. LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
  12425. il);
  12426. cb(moe_out, "ffn_moe_out", il);
  12427. // For Granite MoE Shared
  12428. if (hparams.n_ff_shexp > 0) {
  12429. ggml_tensor * ffn_shexp = build_ffn(cur,
  12430. model.layers[il].ffn_up_shexp, NULL, NULL,
  12431. model.layers[il].ffn_gate_shexp, NULL, NULL,
  12432. model.layers[il].ffn_down_shexp, NULL, NULL,
  12433. NULL,
  12434. LLM_FFN_SILU, LLM_FFN_PAR, il);
  12435. cb(ffn_shexp, "ffn_shexp", il);
  12436. cur = ggml_add(ctx0, moe_out, ffn_shexp);
  12437. cb(cur, "ffn_out", il);
  12438. } else {
  12439. cur = moe_out;
  12440. }
  12441. }
  12442. // For Granite architectures - scale residual
  12443. if (hparams.f_residual_scale) {
  12444. cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
  12445. }
  12446. cur = ggml_add(ctx0, cur, ffn_inp);
  12447. cb(cur, "ffn_out", il);
  12448. cur = build_cvec(cur, il);
  12449. cb(cur, "l_out", il);
  12450. return cur;
  12451. }
  12452. };
  12453. struct llm_build_granite_hybrid : public llm_graph_context_mamba {
  12454. llm_build_granite_hybrid(
  12455. const llama_model & model,
  12456. const llm_graph_params & params) :
  12457. llm_graph_context_mamba(params) {
  12458. const int64_t n_embd_head = hparams.n_embd_head_v;
  12459. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  12460. ggml_tensor * cur;
  12461. ggml_tensor * inpL;
  12462. inpL = build_inp_embd(model.tok_embd);
  12463. auto * inp = build_inp_mem_hybrid();
  12464. ggml_tensor * inp_out_ids = build_inp_out_ids();
  12465. // Positional embeddings populated if rope enabled
  12466. ggml_tensor * inp_pos = nullptr;
  12467. if (hparams.rope_finetuned) {
  12468. inp_pos = build_inp_pos();
  12469. }
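// each layer is either a Mamba2 (SSM) block or an attention block, selected per layer via hparams.is_recurrent(il)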
  12470. for (int il = 0; il < n_layer; ++il) {
  12471. struct ggml_tensor * inpSA = inpL;
  12472. // norm
  12473. cur = build_norm(inpL,
  12474. model.layers[il].attn_norm, NULL,
  12475. LLM_NORM_RMS, il);
  12476. cb(cur, "attn_norm", il);
  12477. if (hparams.is_recurrent(il)) {
  12478. // ssm layer //
  12479. cur = build_mamba2_layer(inp->get_recr(), cur, model, ubatch, il);
  12480. } else {
  12481. // attention layer //
  12482. cur = build_attention_layer(
  12483. cur, inp_pos, inp->get_attn(), model,
  12484. n_embd_head, il);
  12485. }
  12486. if (il == n_layer - 1 && inp_out_ids) {
  12487. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  12488. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  12489. }
  12490. // ffn
  12491. cur = build_layer_ffn(cur, inpSA, model, il);
  12492. // input for next layer
  12493. inpL = cur;
  12494. }
  12495. cur = inpL;
  12496. cur = build_norm(cur,
  12497. model.output_norm, NULL,
  12498. LLM_NORM_RMS, -1);
  12499. cb(cur, "result_norm", -1);
  12500. res->t_embd = cur;
  12501. // lm_head
  12502. cur = build_lora_mm(model.output, cur);
  12503. // For Granite architectures - scale logits
  12504. if (hparams.f_logit_scale) {
  12505. cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_logit_scale);
  12506. }
  12507. cb(cur, "result_output", -1);
  12508. res->t_logits = cur;
  12509. ggml_build_forward_expand(gf, cur);
  12510. }
  12511. ggml_tensor * build_attention_layer(
  12512. ggml_tensor * cur,
  12513. ggml_tensor * inp_pos,
  12514. llm_graph_input_attn_kv * inp_attn,
  12515. const llama_model & model,
  12516. const int64_t n_embd_head,
  12517. const int il) {
  12518. // compute Q and K and (optionally) RoPE them
  12519. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  12520. cb(Qcur, "Qcur", il);
  12521. if (model.layers[il].bq) {
  12522. Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
  12523. cb(Qcur, "Qcur", il);
  12524. }
  12525. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  12526. cb(Kcur, "Kcur", il);
  12527. if (model.layers[il].bk) {
  12528. Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
  12529. cb(Kcur, "Kcur", il);
  12530. }
  12531. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  12532. cb(Vcur, "Vcur", il);
  12533. if (model.layers[il].bv) {
  12534. Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
  12535. cb(Vcur, "Vcur", il);
  12536. }
  12537. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, hparams.n_head(il), n_tokens);
  12538. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, hparams.n_head_kv(il), n_tokens);
  12539. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, hparams.n_head_kv(il), n_tokens);
  12540. const bool use_rope = hparams.rope_finetuned;
  12541. if (use_rope) {
  12542. ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
  12543. Qcur = ggml_rope_ext(
  12544. ctx0, Qcur, inp_pos, rope_factors,
  12545. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  12546. ext_factor, attn_factor, beta_fast, beta_slow
  12547. );
  12548. Kcur = ggml_rope_ext(
  12549. ctx0, Kcur, inp_pos, rope_factors,
  12550. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  12551. ext_factor, attn_factor, beta_fast, beta_slow
  12552. );
  12553. }
  12554. cb(Qcur, "Qcur", il);
  12555. cb(Kcur, "Kcur", il);
  12556. cb(Vcur, "Vcur", il);
  12557. const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
  12558. cur = build_attn(inp_attn,
  12559. model.layers[il].wo, model.layers[il].bo,
  12560. Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
  12561. cb(cur, "attn_out", il);
  12562. return cur;
  12563. }
  12564. ggml_tensor * build_layer_ffn(
  12565. ggml_tensor * cur,
  12566. ggml_tensor * inpSA,
  12567. const llama_model & model,
  12568. const int il) {
  12569. // For Granite architectures - scale residual
  12570. if (hparams.f_residual_scale) {
  12571. cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
  12572. }
  12573. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  12574. cb(ffn_inp, "ffn_inp", il);
  12575. // feed-forward network (non-MoE)
  12576. if (model.layers[il].ffn_gate_inp == nullptr) {
  12577. cur = build_norm(ffn_inp,
  12578. model.layers[il].ffn_norm, NULL,
  12579. LLM_NORM_RMS, il);
  12580. cb(cur, "ffn_norm", il);
  12581. cur = build_ffn(cur,
  12582. model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
  12583. model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
  12584. model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
  12585. NULL,
  12586. LLM_FFN_SILU, LLM_FFN_PAR, il);
  12587. cb(cur, "ffn_out", il);
  12588. } else {
  12589. // MoE branch
  12590. cur = build_norm(ffn_inp,
  12591. model.layers[il].ffn_norm, NULL,
  12592. LLM_NORM_RMS, il);
  12593. cb(cur, "ffn_norm", il);
  12594. ggml_tensor * moe_out = build_moe_ffn(cur,
  12595. model.layers[il].ffn_gate_inp,
  12596. model.layers[il].ffn_up_exps,
  12597. model.layers[il].ffn_gate_exps,
  12598. model.layers[il].ffn_down_exps,
  12599. nullptr,
  12600. n_expert, n_expert_used,
  12601. LLM_FFN_SILU, true,
  12602. false, 0.0,
  12603. LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
  12604. il);
  12605. cb(moe_out, "ffn_moe_out", il);
  12606. // For Granite MoE Shared
  12607. if (hparams.n_ff_shexp > 0) {
  12608. ggml_tensor * ffn_shexp = build_ffn(cur,
  12609. model.layers[il].ffn_up_shexp, NULL, NULL,
  12610. model.layers[il].ffn_gate_shexp, NULL, NULL,
  12611. model.layers[il].ffn_down_shexp, NULL, NULL,
  12612. NULL,
  12613. LLM_FFN_SILU, LLM_FFN_PAR, il);
  12614. cb(ffn_shexp, "ffn_shexp", il);
  12615. cur = ggml_add(ctx0, moe_out, ffn_shexp);
  12616. cb(cur, "ffn_out", il);
  12617. } else {
  12618. cur = moe_out;
  12619. }
  12620. }
  12621. // For Granite architectures - scale residual
  12622. if (hparams.f_residual_scale) {
  12623. cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
  12624. }
  12625. cur = ggml_add(ctx0, cur, ffn_inp);
  12626. cb(cur, "ffn_out", il);
  12627. cur = build_cvec(cur, il);
  12628. cb(cur, "l_out", il);
  12629. return cur;
  12630. }
  12631. };
  12632. // ref: https://github.com/facebookresearch/chameleon
  12633. // based on the original build_llama() function, changes:
  12634. // * qk-norm
  12635. // * swin-norm
  12636. // * removed bias
  12637. // * removed MoE
  12638. struct llm_build_chameleon : public llm_graph_context {
  12639. llm_build_chameleon(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  12640. const int64_t n_embd_head = hparams.n_embd_head_v;
  12641. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  12642. GGML_ASSERT(n_embd_head == hparams.n_rot);
  12643. ggml_tensor * cur;
  12644. ggml_tensor * inpL;
  12645. inpL = build_inp_embd(model.tok_embd);
  12646. // inp_pos - contains the positions
  12647. ggml_tensor * inp_pos = build_inp_pos();
  12648. auto * inp_attn = build_attn_inp_kv();
  12649. ggml_tensor * inp_out_ids = build_inp_out_ids();
  12650. for (int il = 0; il < n_layer; ++il) {
  12651. ggml_tensor * inpSA = inpL;
  12652. // norm
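// with swin-norm the norms are applied after attention / FFN instead of before (post-norm)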
  12653. if (hparams.swin_norm) {
  12654. cur = inpL;
  12655. } else {
  12656. cur = build_norm(inpL,
  12657. model.layers[il].attn_norm, NULL,
  12658. LLM_NORM_RMS, il);
  12659. cb(cur, "attn_norm", il);
  12660. }
  12661. // self-attention
  12662. {
  12663. // compute Q and K and RoPE them
  12664. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  12665. cb(Qcur, "Qcur", il);
  12666. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  12667. cb(Kcur, "Kcur", il);
  12668. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  12669. cb(Vcur, "Vcur", il);
  12670. if (model.layers[il].attn_q_norm) {
  12671. Qcur = ggml_view_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens,
  12672. ggml_element_size(Qcur) * n_embd_head,
  12673. ggml_element_size(Qcur) * n_embd_head * n_head,
  12674. 0);
  12675. cb(Qcur, "Qcur", il);
  12676. Qcur = build_norm(Qcur,
  12677. model.layers[il].attn_q_norm,
  12678. model.layers[il].attn_q_norm_b,
  12679. LLM_NORM, il);
  12680. cb(Qcur, "Qcur", il);
  12681. }
  12682. if (model.layers[il].attn_k_norm) {
  12683. Kcur = ggml_view_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens,
  12684. ggml_element_size(Kcur) * n_embd_head,
  12685. ggml_element_size(Kcur) * n_embd_head * n_head_kv,
  12686. 0);
  12687. cb(Kcur, "Kcur", il);
  12688. Kcur = build_norm(Kcur,
  12689. model.layers[il].attn_k_norm,
  12690. model.layers[il].attn_k_norm_b,
  12691. LLM_NORM, il);
  12692. cb(Kcur, "Kcur", il);
  12693. }
  12694. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  12695. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  12696. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  12697. Qcur = ggml_rope_ext(
  12698. ctx0, Qcur, inp_pos, nullptr,
  12699. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  12700. ext_factor, attn_factor, beta_fast, beta_slow
  12701. );
  12702. Kcur = ggml_rope_ext(
  12703. ctx0, Kcur, inp_pos, nullptr,
  12704. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  12705. ext_factor, attn_factor, beta_fast, beta_slow
  12706. );
  12707. cb(Qcur, "Qcur", il);
  12708. cb(Kcur, "Kcur", il);
  12709. cb(Vcur, "Vcur", il);
  12710. cur = build_attn(inp_attn,
  12711. model.layers[il].wo, nullptr,
  12712. Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  12713. }
  12714. if (il == n_layer - 1 && inp_out_ids) {
  12715. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  12716. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  12717. }
  12718. if (hparams.swin_norm) {
  12719. cur = build_norm(cur,
  12720. model.layers[il].attn_norm, NULL,
  12721. LLM_NORM_RMS, il);
  12722. }
  12723. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  12724. cb(ffn_inp, "ffn_inp", il);
  12725. // feed-forward network
  12726. if (!hparams.swin_norm) {
  12727. cur = build_norm(ffn_inp,
  12728. model.layers[il].ffn_norm, NULL,
  12729. LLM_NORM_RMS, il);
  12730. cb(cur, "ffn_norm", il);
  12731. }
  12732. cur = build_ffn(cur,
  12733. model.layers[il].ffn_up, NULL, NULL,
  12734. model.layers[il].ffn_gate, NULL, NULL,
  12735. model.layers[il].ffn_down, NULL, NULL,
  12736. NULL,
  12737. LLM_FFN_SILU, LLM_FFN_PAR, il);
  12738. cb(cur, "ffn_out", il);
  12739. if (hparams.swin_norm) {
  12740. cur = build_norm(cur,
  12741. model.layers[il].ffn_norm, NULL,
  12742. LLM_NORM_RMS, il);
  12743. cb(cur, "ffn_norm", il);
  12744. }
  12745. cur = ggml_add(ctx0, cur, ffn_inp);
  12746. cb(cur, "ffn_out", il);
  12747. cur = build_cvec(cur, il);
  12748. cb(cur, "l_out", il);
  12749. // input for next layer
  12750. inpL = cur;
  12751. }
  12752. cur = inpL;
  12753. cur = build_norm(cur,
  12754. model.output_norm, NULL,
  12755. LLM_NORM_RMS, -1);
  12756. cb(cur, "result_norm", -1);
  12757. res->t_embd = cur;
  12758. // lm_head
  12759. cur = build_lora_mm(model.output, cur);
  12760. cb(cur, "result_output_with_img_logits", -1);
  12761. // TODO: this suppresses the output of image tokens, which is required to enable text-only outputs.
  12762. // Needs to be removed once image outputs are supported.
  12763. int img_token_end_idx = 8196;
  12764. int img_token_start_idx = 4;
  12765. int num_img_tokens = img_token_end_idx - img_token_start_idx;
12766. // create a 1d tensor of size num_img_tokens filled with -FLT_MAX,
12767. // which ensures that text token logits are always larger than image token logits
  12768. ggml_tensor * img_logits = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, num_img_tokens);
  12769. img_logits = ggml_clamp(ctx0, img_logits, -FLT_MAX, -FLT_MAX);
  12770. cb(img_logits, "img_logits", -1);
  12771. cur = ggml_set_1d(ctx0, cur, img_logits, ggml_element_size(cur) * img_token_start_idx);
  12772. cb(cur, "result_output", -1);
  12773. res->t_logits = cur;
  12774. ggml_build_forward_expand(gf, cur);
  12775. }
  12776. };
  12777. struct llm_build_wavtokenizer_dec : public llm_graph_context {
  12778. llm_build_wavtokenizer_dec(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  12779. ggml_tensor * cur;
  12780. ggml_tensor * inpL;
  12781. inpL = build_inp_embd(model.tok_embd);
  12782. cur = ggml_cont(ctx0, ggml_transpose(ctx0, inpL));
  12783. cur = ggml_conv_1d_ph(ctx0, model.conv1d, cur, 1, 1);
  12784. cur = ggml_add(ctx0, cur, model.conv1d_b);
  12785. // posnet
  12786. for (uint32_t il = 0; il < hparams.posnet.n_layer; ++il) {
  12787. const auto & layer = model.layers[il].posnet;
  12788. inpL = cur;
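// posnet layers: 0, 1, 3, 4 = residual conv blocks; 2 = self-attention block; 5 = final group norm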
  12789. switch (il) {
  12790. case 0:
  12791. case 1:
  12792. case 3:
  12793. case 4:
  12794. {
  12795. cur = build_norm(cur,
  12796. layer.norm1,
  12797. layer.norm1_b,
  12798. LLM_NORM_GROUP, 0);
  12799. cur = ggml_mul(ctx0, ggml_sigmoid(ctx0, cur), cur);
  12800. cur = ggml_conv_1d_ph(ctx0, layer.conv1, cur, 1, 1);
  12801. cur = ggml_add(ctx0, cur, layer.conv1_b);
  12802. cur = build_norm(cur,
  12803. layer.norm2,
  12804. layer.norm2_b,
  12805. LLM_NORM_GROUP, 0);
  12806. cur = ggml_mul(ctx0, ggml_sigmoid(ctx0, cur), cur);
  12807. cur = ggml_conv_1d_ph(ctx0, layer.conv2, cur, 1, 1);
  12808. cur = ggml_add(ctx0, cur, layer.conv2_b);
  12809. cur = ggml_add(ctx0, cur, inpL);
  12810. } break;
  12811. case 2:
  12812. {
  12813. cur = build_norm(cur,
  12814. layer.attn_norm,
  12815. layer.attn_norm_b,
  12816. LLM_NORM_GROUP, 0);
  12817. ggml_tensor * q;
  12818. ggml_tensor * k;
  12819. ggml_tensor * v;
  12820. q = ggml_conv_1d_ph(ctx0, layer.attn_q, cur, 1, 1);
  12821. k = ggml_conv_1d_ph(ctx0, layer.attn_k, cur, 1, 1);
  12822. v = ggml_conv_1d_ph(ctx0, layer.attn_v, cur, 1, 1);
  12823. q = ggml_add(ctx0, q, layer.attn_q_b);
  12824. k = ggml_add(ctx0, k, layer.attn_k_b);
  12825. v = ggml_add(ctx0, v, layer.attn_v_b);
  12826. q = ggml_cont(ctx0, ggml_transpose(ctx0, q));
  12827. k = ggml_cont(ctx0, ggml_transpose(ctx0, k));
  12828. ggml_tensor * kq = ggml_mul_mat(ctx0, k, q);
  12829. kq = ggml_soft_max_ext(ctx0, kq, nullptr, 1.0f/sqrtf(float(hparams.posnet.n_embd)), 0.0f);
  12830. cur = ggml_mul_mat(ctx0, kq, v);
  12831. cur = ggml_conv_1d_ph(ctx0, layer.attn_o, cur, 1, 1);
  12832. cur = ggml_add(ctx0, cur, layer.attn_o_b);
  12833. cur = ggml_add(ctx0, cur, inpL);
  12834. } break;
  12835. case 5:
  12836. {
  12837. cur = build_norm(cur,
  12838. layer.norm,
  12839. layer.norm_b,
  12840. LLM_NORM_GROUP, 0);
  12841. } break;
  12842. default: GGML_ABORT("unknown posnet layer");
  12843. };
  12844. }
  12845. cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur));
  12846. cur = build_norm(cur,
  12847. model.tok_norm,
  12848. model.tok_norm_b,
  12849. LLM_NORM, -1);
  12850. cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur));
  12851. inpL = cur;
  12852. // convnext
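// ConvNeXt block: depthwise conv -> LayerNorm -> pointwise FFN (GELU) -> learned gamma scale, with residual connection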
  12853. for (uint32_t il = 0; il < hparams.convnext.n_layer; ++il) {
  12854. const auto & layer = model.layers[il].convnext;
  12855. cur = inpL;
  12856. cur = ggml_conv_1d_dw_ph(ctx0, layer.dw, cur, 1, 1);
  12857. cur = ggml_add(ctx0, cur, layer.dw_b);
  12858. cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur));
  12859. cur = build_norm(cur,
  12860. layer.norm,
  12861. layer.norm_b,
  12862. LLM_NORM, -1);
  12863. cur = build_ffn(cur,
  12864. layer.pw1, layer.pw1_b, NULL,
  12865. NULL, NULL, NULL,
  12866. layer.pw2, layer.pw2_b, NULL,
  12867. NULL,
  12868. LLM_FFN_GELU, LLM_FFN_SEQ, il);
  12869. cur = ggml_mul(ctx0, cur, layer.gamma);
  12870. cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur));
  12871. inpL = ggml_add(ctx0, cur, inpL);
  12872. }
  12873. cur = inpL;
  12874. cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur));
  12875. cur = build_norm(cur,
  12876. model.output_norm,
  12877. model.output_norm_b,
  12878. LLM_NORM, -1);
  12879. // lm_head
  12880. cur = build_lora_mm(model.output, cur);
  12881. cur = ggml_add(ctx0, cur, model.output_b);
  12882. cb(cur, "result_embd", -1);
  12883. res->t_embd = cur;
  12884. ggml_build_forward_expand(gf, cur);
  12885. }
  12886. };
  12887. struct llm_build_plm : public llm_graph_context {
  12888. llm_build_plm(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  12889. const float kq_scale = 1.0f/sqrtf(float(hparams.n_embd_head_k));
  12890. const uint32_t n_embd_head_qk_rope = hparams.n_rot;
  12891. const uint32_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
  12892. const uint32_t kv_lora_rank = hparams.n_lora_kv;
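// PLM attention: q is split into a positionless (nope) part and a RoPE part; k/v are reconstructed from a
// low-rank compressed kv latent (kv_lora_rank), with a single shared RoPE key broadcast across heads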
  12893. ggml_tensor * cur;
  12894. ggml_tensor * inpL;
  12895. // {n_embd, n_tokens}
  12896. inpL = build_inp_embd(model.tok_embd);
  12897. // inp_pos - contains the positions
  12898. ggml_tensor * inp_pos = build_inp_pos();
  12899. auto * inp_attn = build_attn_inp_kv();
  12900. ggml_tensor * inp_out_ids = build_inp_out_ids();
  12901. for (int il = 0; il < n_layer; ++il) {
  12902. ggml_tensor * inpSA = inpL;
  12903. // norm
  12904. cur = build_norm(inpL,
  12905. model.layers[il].attn_norm, NULL,
  12906. LLM_NORM_RMS, il);
  12907. cb(cur, "attn_norm", il);
  12908. // self_attention
  12909. {
  12910. ggml_tensor * q = NULL;
  12911. q = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
  12912. cb(q, "q", il);
  12913. // split into {n_head * n_embd_head_qk_nope, n_tokens}
  12914. ggml_tensor * q_nope = ggml_view_3d(ctx0, q, n_embd_head_qk_nope, n_head, n_tokens,
  12915. ggml_row_size(q->type, hparams.n_embd_head_k),
  12916. ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
  12917. 0);
  12918. cb(q_nope, "q_nope", il);
  12919. // and {n_head * n_embd_head_qk_rope, n_tokens}
  12920. ggml_tensor * q_pe = ggml_view_3d(ctx0, q, n_embd_head_qk_rope, n_head, n_tokens,
  12921. ggml_row_size(q->type, hparams.n_embd_head_k),
  12922. ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
  12923. ggml_row_size(q->type, n_embd_head_qk_nope));
  12924. cb(q_pe, "q_pe", il);
  12925. // {n_embd, kv_lora_rank + n_embd_head_qk_rope} * {n_embd, n_tokens} -> {kv_lora_rank + n_embd_head_qk_rope, n_tokens}
12926. ggml_tensor * kv_pe_compressed = ggml_mul_mat(ctx0, model.layers[il].wkv_a_mqa, cur);
12927. cb(kv_pe_compressed, "kv_pe_compressed", il);
12928. // split into {kv_lora_rank, n_tokens}
12929. ggml_tensor * kv_compressed = ggml_view_2d(ctx0, kv_pe_compressed, kv_lora_rank, n_tokens,
12930. kv_pe_compressed->nb[1],
12931. 0);
12932. cb(kv_compressed, "kv_compressed", il);
12933. // and {n_embd_head_qk_rope, n_tokens}
12934. ggml_tensor * k_pe = ggml_view_3d(ctx0, kv_pe_compressed, n_embd_head_qk_rope, 1, n_tokens,
12935. kv_pe_compressed->nb[1],
12936. kv_pe_compressed->nb[1],
12937. ggml_row_size(kv_pe_compressed->type, kv_lora_rank));
  12938. cb(k_pe, "k_pe", il);
  12939. kv_compressed = build_norm(kv_compressed,
  12940. model.layers[il].attn_kv_a_norm, NULL,
  12941. LLM_NORM_RMS, il);
  12942. cb(kv_compressed, "kv_compressed", il);
  12943. // {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)} * {kv_lora_rank, n_tokens} -> {n_head * (n_embd_head_qk_nope + n_embd_head_v), n_tokens}
  12944. ggml_tensor * kv = ggml_mul_mat(ctx0, model.layers[il].wkv_b, kv_compressed);
  12945. cb(kv, "kv", il);
  12946. // split into {n_head * n_embd_head_qk_nope, n_tokens}
  12947. ggml_tensor * k_nope = ggml_view_3d(ctx0, kv, n_embd_head_qk_nope, n_head, n_tokens,
  12948. ggml_row_size(kv->type, n_embd_head_qk_nope + hparams.n_embd_head_v),
  12949. ggml_row_size(kv->type, n_head * (n_embd_head_qk_nope + hparams.n_embd_head_v)),
  12950. 0);
  12951. cb(k_nope, "k_nope", il);
  12952. // and {n_head * n_embd_head_v, n_tokens}
  12953. ggml_tensor * v_states = ggml_view_3d(ctx0, kv, hparams.n_embd_head_v, n_head, n_tokens,
  12954. ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)),
  12955. ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)*n_head),
  12956. ggml_row_size(kv->type, (n_embd_head_qk_nope)));
  12957. cb(v_states, "v_states", il);
  12958. v_states = ggml_cont(ctx0, v_states);
  12959. cb(v_states, "v_states", il);
  12960. v_states = ggml_view_2d(ctx0, v_states, hparams.n_embd_head_v * n_head, n_tokens,
  12961. ggml_row_size(kv->type, hparams.n_embd_head_v * n_head),
  12962. 0);
  12963. cb(v_states, "v_states", il);
  12964. q_pe = ggml_rope_ext(
  12965. ctx0, q_pe, inp_pos, nullptr,
  12966. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  12967. ext_factor, attn_factor, beta_fast, beta_slow
  12968. );
  12969. cb(q_pe, "q_pe", il);
  12970. // shared RoPE key
  12971. k_pe = ggml_rope_ext(
  12972. ctx0, k_pe, inp_pos, nullptr,
  12973. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  12974. ext_factor, attn_factor, beta_fast, beta_slow
  12975. );
  12976. cb(k_pe, "k_pe", il);
  12977. ggml_tensor * q_states = ggml_concat(ctx0, q_nope, q_pe, 0);
  12978. cb(q_states, "q_states", il);
  12979. ggml_tensor * k_states = ggml_concat(ctx0, k_nope, ggml_repeat(ctx0, k_pe, q_pe), 0);
  12980. cb(k_states, "k_states", il);
  12981. cur = build_attn(inp_attn,
  12982. model.layers[il].wo, NULL,
  12983. q_states, k_states, v_states, nullptr, nullptr, nullptr, kq_scale, il);
  12984. }
  12985. if (il == n_layer - 1 && inp_out_ids) {
  12986. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  12987. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  12988. }
  12989. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  12990. cb(ffn_inp, "ffn_inp", il);
  12991. cur = build_norm(ffn_inp,
  12992. model.layers[il].ffn_norm, NULL,
  12993. LLM_NORM_RMS, il);
  12994. cb(cur, "ffn_norm", il);
  12995. cur = build_ffn(cur,
  12996. model.layers[il].ffn_up, NULL, NULL,
  12997. NULL, NULL, NULL,
  12998. model.layers[il].ffn_down, NULL, NULL,
  12999. NULL,
  13000. LLM_FFN_RELU_SQR, LLM_FFN_SEQ, il);
  13001. cb(cur, "ffn_out", il);
  13002. cur = ggml_add(ctx0, cur, ffn_inp);
  13003. cur = build_cvec(cur, il);
  13004. cb(cur, "l_out", il);
  13005. // input for next layer
  13006. inpL = cur;
  13007. }
  13008. cur = inpL;
  13009. cur = build_norm(cur,
  13010. model.output_norm, NULL,
  13011. LLM_NORM_RMS, -1);
  13012. cb(cur, "result_norm", -1);
  13013. res->t_embd = cur;
  13014. cur = build_lora_mm(model.output, cur);
  13015. cb(cur, "result_output", -1);
  13016. res->t_logits = cur;
  13017. ggml_build_forward_expand(gf, cur);
  13018. }
  13019. };
  13020. struct llm_build_bailingmoe : public llm_graph_context {
  13021. llm_build_bailingmoe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  13022. ggml_tensor * cur;
  13023. ggml_tensor * inpL;
  13024. inpL = build_inp_embd(model.tok_embd);
  13025. // inp_pos - contains the positions
  13026. ggml_tensor * inp_pos = build_inp_pos();
  13027. auto * inp_attn = build_attn_inp_kv();
  13028. ggml_tensor * inp_out_ids = build_inp_out_ids();
  13029. for (int il = 0; il < n_layer; ++il) {
  13030. ggml_tensor * inpSA = inpL;
  13031. // norm
  13032. cur = build_norm(inpL,
  13033. model.layers[il].attn_norm, NULL,
  13034. LLM_NORM_RMS, il);
  13035. cb(cur, "attn_norm", il);
  13036. // self-attention
  13037. {
13038. // rope freq factors; may be nullptr for models that do not use them
  13039. ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
  13040. // compute Q and K and RoPE them
  13041. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  13042. cb(Qcur, "Qcur", il);
  13043. if (model.layers[il].bq) {
  13044. Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
  13045. cb(Qcur, "Qcur", il);
  13046. }
  13047. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  13048. cb(Kcur, "Kcur", il);
  13049. if (model.layers[il].bk) {
  13050. Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
  13051. cb(Kcur, "Kcur", il);
  13052. }
  13053. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  13054. cb(Vcur, "Vcur", il);
  13055. if (model.layers[il].bv) {
  13056. Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
  13057. cb(Vcur, "Vcur", il);
  13058. }
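// note: head reshape and attention scaling use n_rot as the per-head size here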
  13059. Qcur = ggml_reshape_3d(ctx0, Qcur, n_rot, n_head, n_tokens);
  13060. Kcur = ggml_reshape_3d(ctx0, Kcur, n_rot, n_head_kv, n_tokens);
  13061. Vcur = ggml_reshape_3d(ctx0, Vcur, n_rot, n_head_kv, n_tokens);
  13062. Qcur = ggml_rope_ext(
  13063. ctx0, Qcur, inp_pos, rope_factors,
  13064. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  13065. ext_factor, attn_factor, beta_fast, beta_slow
  13066. );
  13067. Kcur = ggml_rope_ext(
  13068. ctx0, Kcur, inp_pos, rope_factors,
  13069. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  13070. ext_factor, attn_factor, beta_fast, beta_slow
  13071. );
  13072. cb(Qcur, "Qcur", il);
  13073. cb(Kcur, "Kcur", il);
  13074. cb(Vcur, "Vcur", il);
  13075. cur = build_attn(inp_attn,
  13076. model.layers[il].wo, model.layers[il].bo,
  13077. Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_rot)), il);
  13078. }
  13079. if (il == n_layer - 1 && inp_out_ids) {
  13080. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  13081. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  13082. }
  13083. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  13084. cb(ffn_inp, "ffn_inp", il);
  13085. cur = build_norm(ffn_inp,
  13086. model.layers[il].ffn_norm, NULL,
  13087. LLM_NORM_RMS, il);
  13088. cb(cur, "ffn_norm", il);
  13089. ggml_tensor * moe_out =
  13090. build_moe_ffn(cur,
  13091. model.layers[il].ffn_gate_inp,
  13092. model.layers[il].ffn_up_exps,
  13093. model.layers[il].ffn_gate_exps,
  13094. model.layers[il].ffn_down_exps,
  13095. nullptr,
  13096. n_expert, n_expert_used,
  13097. LLM_FFN_SILU, hparams.expert_weights_norm,
  13098. false, hparams.expert_weights_scale,
  13099. LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
  13100. il);
  13101. cb(moe_out, "ffn_moe_out", il);
  13102. // FFN shared expert
  13103. {
  13104. ggml_tensor * ffn_shexp = build_ffn(cur,
  13105. model.layers[il].ffn_up_shexp, NULL, NULL,
  13106. model.layers[il].ffn_gate_shexp, NULL, NULL,
  13107. model.layers[il].ffn_down_shexp, NULL, NULL,
  13108. NULL,
  13109. LLM_FFN_SILU, LLM_FFN_PAR, il);
  13110. cb(ffn_shexp, "ffn_shexp", il);
  13111. cur = ggml_add(ctx0, moe_out, ffn_shexp);
  13112. cb(cur, "ffn_out", il);
  13113. }
  13114. cur = ggml_add(ctx0, cur, ffn_inp);
  13115. cur = build_cvec(cur, il);
  13116. cb(cur, "l_out", il);
  13117. // input for next layer
  13118. inpL = cur;
  13119. }
  13120. cur = inpL;
  13121. cur = build_norm(cur,
  13122. model.output_norm, NULL,
  13123. LLM_NORM_RMS, -1);
  13124. cb(cur, "result_norm", -1);
  13125. res->t_embd = cur;
  13126. // lm_head
  13127. cur = build_lora_mm(model.output, cur);
  13128. cb(cur, "result_output", -1);
  13129. res->t_logits = cur;
  13130. ggml_build_forward_expand(gf, cur);
  13131. }
  13132. };
  13133. struct llm_build_dots1 : public llm_graph_context {
  13134. llm_build_dots1(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  13135. const int64_t n_embd_head = hparams.n_embd_head_v;
  13136. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  13137. GGML_ASSERT(n_embd_head == hparams.n_rot);
  13138. ggml_tensor * cur;
  13139. ggml_tensor * inpL;
  13140. inpL = build_inp_embd(model.tok_embd);
  13141. // inp_pos - contains the positions
  13142. ggml_tensor * inp_pos = build_inp_pos();
  13143. auto * inp_attn = build_attn_inp_kv();
  13144. ggml_tensor * inp_out_ids = build_inp_out_ids();
  13145. for (int il = 0; il < n_layer; ++il) {
  13146. ggml_tensor * inpSA = inpL;
  13147. // norm
  13148. cur = build_norm(inpL,
  13149. model.layers[il].attn_norm, NULL,
  13150. LLM_NORM_RMS, il);
  13151. cb(cur, "attn_norm", il);
  13152. // self_attention
  13153. {
  13154. // compute Q and K and RoPE them
  13155. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  13156. cb(Qcur, "Qcur", il);
  13157. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  13158. cb(Kcur, "Kcur", il);
  13159. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  13160. cb(Vcur, "Vcur", il);
  13161. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  13162. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  13163. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  13164. Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
  13165. cb(Qcur, "Qcur_normed", il);
  13166. Qcur = ggml_rope_ext(
  13167. ctx0, Qcur, inp_pos, nullptr,
  13168. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  13169. ext_factor, attn_factor, beta_fast, beta_slow
  13170. );
  13171. Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
  13172. cb(Kcur, "Kcur_normed", il);
  13173. Kcur = ggml_rope_ext(
  13174. ctx0, Kcur, inp_pos, nullptr,
  13175. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  13176. ext_factor, attn_factor, beta_fast, beta_slow
  13177. );
  13178. cb(Qcur, "Qcur", il);
  13179. cb(Kcur, "Kcur", il);
  13180. cb(Vcur, "Vcur", il);
  13181. cur = build_attn(inp_attn,
  13182. model.layers[il].wo, model.layers[il].bo,
  13183. Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  13184. }
  13185. if (il == n_layer - 1 && inp_out_ids) {
  13186. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  13187. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  13188. }
  13189. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  13190. cb(ffn_inp, "ffn_inp", il);
  13191. // MoE branch
  13192. cur = build_norm(ffn_inp,
  13193. model.layers[il].ffn_norm, NULL,
  13194. LLM_NORM_RMS, il);
  13195. cb(cur, "ffn_norm", il);
  13196. if ((uint32_t) il < hparams.n_layer_dense_lead) {
  13197. cur = build_ffn(cur,
  13198. model.layers[il].ffn_up, NULL, NULL,
  13199. model.layers[il].ffn_gate, NULL, NULL,
  13200. model.layers[il].ffn_down, NULL, NULL,
  13201. NULL,
  13202. LLM_FFN_SILU, LLM_FFN_PAR, il);
  13203. cb(cur, "ffn_out", il);
  13204. } else {
  13205. ggml_tensor * moe_out =
  13206. build_moe_ffn(cur,
  13207. model.layers[il].ffn_gate_inp,
  13208. model.layers[il].ffn_up_exps,
  13209. model.layers[il].ffn_gate_exps,
  13210. model.layers[il].ffn_down_exps,
  13211. model.layers[il].ffn_exp_probs_b,
  13212. n_expert, n_expert_used,
  13213. LLM_FFN_SILU, hparams.expert_weights_norm,
  13214. true, hparams.expert_weights_scale,
  13215. (llama_expert_gating_func_type) hparams.expert_gating_func,
  13216. il);
  13217. cb(moe_out, "ffn_moe_out", il);
  13218. {
  13219. ggml_tensor * ffn_shexp = build_ffn(cur,
  13220. model.layers[il].ffn_up_shexp, NULL, NULL,
  13221. model.layers[il].ffn_gate_shexp, NULL, NULL,
  13222. model.layers[il].ffn_down_shexp, NULL, NULL,
  13223. NULL,
  13224. LLM_FFN_SILU, LLM_FFN_PAR, il);
  13225. cb(ffn_shexp, "ffn_shexp", il);
  13226. cur = ggml_add(ctx0, moe_out, ffn_shexp);
  13227. cb(cur, "ffn_out", il);
  13228. }
  13229. }
  13230. cur = ggml_add(ctx0, cur, ffn_inp);
  13231. cur = build_cvec(cur, il);
  13232. cb(cur, "l_out", il);
  13233. // input for next layer
  13234. inpL = cur;
  13235. }
  13236. cur = inpL;
  13237. cur = build_norm(cur,
  13238. model.output_norm, NULL,
  13239. LLM_NORM_RMS, -1);
  13240. cb(cur, "result_norm", -1);
  13241. res->t_embd = cur;
  13242. // lm_head
  13243. cur = build_lora_mm(model.output, cur);
  13244. cb(cur, "result_output", -1);
  13245. res->t_logits = cur;
  13246. ggml_build_forward_expand(gf, cur);
  13247. }
  13248. };
  13249. struct llm_build_ernie4_5 : public llm_graph_context {
  13250. llm_build_ernie4_5(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  13251. const int64_t n_embd_head = hparams.n_embd_head_v;
  13252. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  13253. GGML_ASSERT(n_embd_head == hparams.n_rot);
  13254. ggml_tensor * cur;
  13255. ggml_tensor * inpL;
  13256. inpL = build_inp_embd(model.tok_embd);
  13257. // inp_pos - contains the positions
  13258. ggml_tensor * inp_pos = build_inp_pos();
  13259. auto * inp_attn = build_attn_inp_kv();
  13260. for (int il = 0; il < n_layer; ++il) {
  13261. ggml_tensor * inpSA = inpL;
  13262. // norm
  13263. {
  13264. cur = build_norm(inpL,
  13265. model.layers[il].attn_norm, NULL,
  13266. LLM_NORM_RMS, il);
  13267. cb(cur, "attn_norm", il);
  13268. }
  13269. // self-attention
  13270. {
  13271. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  13272. cb(Qcur, "Qcur", il);
  13273. if (model.layers[il].bq) {
  13274. Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
  13275. cb(Qcur, "Qcur", il);
  13276. }
  13277. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  13278. cb(Kcur, "Kcur", il);
  13279. if (model.layers[il].bk) {
  13280. Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
  13281. cb(Kcur, "Kcur", il);
  13282. }
  13283. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  13284. cb(Vcur, "Vcur", il);
  13285. if (model.layers[il].bv) {
  13286. Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
  13287. cb(Vcur, "Vcur", il);
  13288. }
  13289. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  13290. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  13291. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  13292. Qcur = ggml_rope_ext(
  13293. ctx0, Qcur, inp_pos, nullptr,
  13294. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  13295. ext_factor, attn_factor, beta_fast, beta_slow
  13296. );
  13297. Kcur = ggml_rope_ext(
  13298. ctx0, Kcur, inp_pos, nullptr,
  13299. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  13300. ext_factor, attn_factor, beta_fast, beta_slow
  13301. );
  13302. cb(Qcur, "Qcur", il);
  13303. cb(Kcur, "Kcur", il);
  13304. cb(Vcur, "Vcur", il);
  13305. cur = build_attn(inp_attn,
  13306. model.layers[il].wo, NULL,
  13307. Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  13308. }
  13309. if (il == n_layer - 1) {
  13310. // skip computing output for unused tokens
  13311. ggml_tensor * inp_out_ids = build_inp_out_ids();
  13312. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  13313. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  13314. }
  13315. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  13316. cb(ffn_inp, "ffn_inp", il);
  13317. // feed-forward network
  13318. {
  13319. cur = build_norm(ffn_inp,
  13320. model.layers[il].ffn_norm, NULL,
  13321. LLM_NORM_RMS, il);
  13322. cb(cur, "ffn_norm", il);
  13323. cur = build_ffn(cur,
  13324. model.layers[il].ffn_up, NULL, NULL,
  13325. model.layers[il].ffn_gate, NULL, NULL,
  13326. model.layers[il].ffn_down, NULL, NULL,
  13327. NULL,
  13328. LLM_FFN_SILU, LLM_FFN_PAR, il);
  13329. cb(cur, "ffn_out", il);
  13330. }
  13331. cur = ggml_add(ctx0, cur, ffn_inp);
  13332. cur = build_cvec(cur, il);
  13333. cb(cur, "l_out", il);
  13334. // input for next layer
  13335. inpL = cur;
  13336. }
  13337. cur = inpL;
  13338. cur = build_norm(cur,
  13339. model.output_norm, NULL,
  13340. LLM_NORM_RMS, -1);
  13341. cb(cur, "result_norm", -1);
  13342. res->t_embd = cur;
  13343. // lm_head
  13344. cur = build_lora_mm(model.output, cur);
  13345. cb(cur, "result_output", -1);
  13346. res->t_logits = cur;
  13347. ggml_build_forward_expand(gf, cur);
  13348. }
  13349. };
  13350. struct llm_build_ernie4_5_moe : public llm_graph_context {
  13351. llm_build_ernie4_5_moe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  13352. const int64_t n_embd_head = hparams.n_embd_head_v;
  13353. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  13354. GGML_ASSERT(n_embd_head == hparams.n_rot);
  13355. ggml_tensor * cur;
  13356. ggml_tensor * inpL;
  13357. inpL = build_inp_embd(model.tok_embd);
  13358. // inp_pos - contains the positions
  13359. ggml_tensor * inp_pos = build_inp_pos();
  13360. auto * inp_attn = build_attn_inp_kv();
  13361. ggml_tensor * inp_out_ids = build_inp_out_ids();
  13362. GGML_ASSERT(hparams.n_moe_layer_step > 0 && "Ernie 4.5 MoE requires n_moe_layer_step > 0");
  13363. for (int il = 0; il < n_layer; ++il) {
  13364. ggml_tensor * inpSA = inpL;
  13365. // norm
  13366. {
  13367. cur = build_norm(inpL,
  13368. model.layers[il].attn_norm, NULL,
  13369. LLM_NORM_RMS, il);
  13370. cb(cur, "attn_norm", il);
  13371. }
  13372. // self-attention
  13373. {
  13374. // compute Q and K and RoPE them
  13375. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  13376. cb(Qcur, "Qcur", il);
  13377. if (model.layers[il].bq) {
  13378. Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
  13379. cb(Qcur, "Qcur", il);
  13380. }
  13381. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  13382. cb(Kcur, "Kcur", il);
  13383. if (model.layers[il].bk) {
  13384. Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
  13385. cb(Kcur, "Kcur", il);
  13386. }
  13387. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  13388. cb(Vcur, "Vcur", il);
  13389. if (model.layers[il].bv) {
  13390. Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
  13391. cb(Vcur, "Vcur", il);
  13392. }
  13393. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  13394. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  13395. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  13396. Qcur = ggml_rope_ext(
  13397. ctx0, Qcur, inp_pos, nullptr,
  13398. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  13399. ext_factor, attn_factor, beta_fast, beta_slow
  13400. );
  13401. Kcur = ggml_rope_ext(
  13402. ctx0, Kcur, inp_pos, nullptr,
  13403. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  13404. ext_factor, attn_factor, beta_fast, beta_slow
  13405. );
  13406. cb(Qcur, "Qcur", il);
  13407. cb(Kcur, "Kcur", il);
  13408. cb(Vcur, "Vcur", il);
  13409. cur = build_attn(inp_attn,
  13410. model.layers[il].wo, NULL,
  13411. Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  13412. cb(cur, "attn_out", il);
  13413. }
  13414. if (il == n_layer - 1 && inp_out_ids) {
  13415. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  13416. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  13417. }
  13418. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  13419. cb(ffn_inp, "ffn_inp", il);
  13420. // feed-forward network
  13421. bool is_moe_layer = static_cast<uint32_t>(il) >= hparams.n_layer_dense_lead && (il + 1) % hparams.n_moe_layer_step == 0;
  13422. if (!is_moe_layer) {
  13423. cur = build_norm(ffn_inp,
  13424. model.layers[il].ffn_norm, NULL,
  13425. LLM_NORM_RMS, il);
  13426. cb(cur, "ffn_norm", il);
  13427. cur = build_ffn(cur,
  13428. model.layers[il].ffn_up, NULL, NULL,
  13429. model.layers[il].ffn_gate, NULL, NULL,
  13430. model.layers[il].ffn_down, NULL, NULL,
  13431. NULL,
  13432. LLM_FFN_SILU, LLM_FFN_PAR, il);
  13433. cb(cur, "ffn_out", il);
  13434. } else {
  13435. // MoE branch
  13436. cur = build_norm(ffn_inp,
  13437. model.layers[il].ffn_norm, NULL,
  13438. LLM_NORM_RMS, il);
  13439. cb(cur, "ffn_norm", il);
  13440. ggml_tensor * moe_out = build_moe_ffn(cur,
  13441. model.layers[il].ffn_gate_inp,
  13442. model.layers[il].ffn_up_exps,
  13443. model.layers[il].ffn_gate_exps,
  13444. model.layers[il].ffn_down_exps,
  13445. model.layers[il].ffn_exp_probs_b,
  13446. n_expert, n_expert_used,
  13447. LLM_FFN_SILU, true,
  13448. false, 0.0,
  13449. LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
  13450. il);
  13451. cb(moe_out, "ffn_moe_out", il);
  13452. // Shared expert (if present)
  13453. if (hparams.n_ff_shexp > 0) {
  13454. ggml_tensor * ffn_shexp = build_ffn(cur,
  13455. model.layers[il].ffn_up_shexp, NULL, NULL,
  13456. model.layers[il].ffn_gate_shexp, NULL, NULL,
  13457. model.layers[il].ffn_down_shexp, NULL, NULL,
  13458. NULL,
  13459. LLM_FFN_SILU, LLM_FFN_PAR, il);
  13460. cb(ffn_shexp, "ffn_shexp", il);
  13461. cur = ggml_add(ctx0, moe_out, ffn_shexp);
  13462. } else {
  13463. cur = moe_out;
  13464. }
  13465. cb(cur, "ffn_out", il);
  13466. }
  13467. cur = ggml_add(ctx0, cur, ffn_inp);
  13468. cb(cur, "ffn_out", il);
  13469. cur = build_cvec(cur, il);
  13470. cb(cur, "l_out", il);
  13471. // input for next layer
  13472. inpL = cur;
  13473. }
  13474. cur = inpL;
  13475. cur = build_norm(cur,
  13476. model.output_norm, NULL,
  13477. LLM_NORM_RMS, -1);
  13478. cb(cur, "result_norm", -1);
  13479. res->t_embd = cur;
  13480. // lm_head
  13481. cur = build_lora_mm(model.output, cur);
  13482. cb(cur, "result_output", -1);
  13483. res->t_logits = cur;
  13484. ggml_build_forward_expand(gf, cur);
  13485. }
  13486. };
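// Falcon-H1 hybrid graph builder: each layer applies attn_norm to the input, runs an
// attention branch and a Mamba-2 (SSM) branch in parallel, and sums attn_out + ssm_out
// before the residual connection and the SwiGLU FFN.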
  13487. struct llm_build_falcon_h1 : public llm_graph_context_mamba {
  13488. llm_build_falcon_h1(const llama_model & model, const llm_graph_params & params) : llm_graph_context_mamba(params) {
  13489. const int64_t n_embd_head = hparams.n_embd_head_v;
  13490. ggml_tensor * cur;
  13491. ggml_tensor * inpL;
  13492. inpL = build_inp_embd(model.tok_embd);
  13493. // inp_pos - contains the positions
  13494. ggml_tensor * inp_pos = build_inp_pos();
13495. // build the inputs for both the recurrent state and the KV cache
  13496. auto * inp = build_inp_mem_hybrid();
  13497. const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
  13498. ggml_tensor * inp_out_ids = build_inp_out_ids();
  13499. for (int il = 0; il < n_layer; ++il) {
  13500. ggml_tensor * inpSA = inpL;
  13501. cur = build_norm(inpL,
  13502. model.layers[il].attn_norm, NULL,
  13503. LLM_NORM_RMS, il);
  13504. cb(cur, "attn_norm", il);
  13505. // self-attention
  13506. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  13507. cb(Qcur, "Qcur", il);
  13508. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  13509. cb(Kcur, "Kcur", il);
  13510. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  13511. cb(Vcur, "Vcur", il);
  13512. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  13513. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  13514. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  13515. Qcur = ggml_rope_ext(
  13516. ctx0, Qcur, inp_pos, nullptr,
  13517. n_rot, hparams.rope_type, n_ctx_orig, freq_base, freq_scale,
  13518. ext_factor, attn_factor, beta_fast, beta_slow);
  13519. Kcur = ggml_rope_ext(
  13520. ctx0, Kcur, inp_pos, nullptr,
  13521. n_rot, hparams.rope_type, n_ctx_orig, freq_base, freq_scale,
  13522. ext_factor, attn_factor, beta_fast, beta_slow
  13523. );
  13524. cb(Qcur, "Qcur-post-rope", il);
  13525. cb(Kcur, "Kcur-post-rope", il);
  13526. cb(Vcur, "Vcur-post-rope", il);
  13527. ggml_tensor * attn_out = build_attn(inp->get_attn(),
  13528. model.layers[il].wo, NULL,
  13529. Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
  13530. cb(attn_out, "attn_out", il);
  13531. cur = build_norm(inpL,
  13532. model.layers[il].attn_norm, NULL,
  13533. LLM_NORM_RMS, il);
  13534. // Mamba2 layer
  13535. cb(cur, "ssm_in", il);
  13536. ggml_tensor * ssm_out = build_mamba2_layer(inp->get_recr(), cur, model, ubatch, il);
  13537. cb(ssm_out, "ssm_out", il);
13538. // Aggregation
  13539. cur = ggml_add(ctx0, attn_out, ssm_out);
  13540. inpSA = ggml_add(ctx0, cur, inpSA);
  13541. cb(cur, "layer_out", il);
  13542. if (il == n_layer - 1 && inp_out_ids) {
  13543. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  13544. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  13545. }
  13546. ggml_tensor * ffn_inp = inpSA;
  13547. cb(ffn_inp, "ffn_inp", il);
  13548. // feed-forward network
  13549. cur = build_norm(ffn_inp,
  13550. model.layers[il].ffn_norm, NULL,
  13551. LLM_NORM_RMS, il);
  13552. cb(cur, "ffn_norm", il);
  13553. cur = build_ffn(cur,
  13554. model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
  13555. model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
  13556. model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
  13557. NULL,
  13558. LLM_FFN_SILU, LLM_FFN_PAR, il);
  13559. cb(cur, "ffn_out", il);
  13560. cur = ggml_add(ctx0, cur, inpSA);
  13561. cur = build_cvec(cur, il);
  13562. cb(cur, "l_out", il);
  13563. // input for next layer
  13564. inpL = cur;
  13565. }
  13566. cur = inpL;
  13567. cur = build_norm(cur,
  13568. model.output_norm, NULL,
  13569. LLM_NORM_RMS, -1);
  13570. cb(cur, "result_norm", -1);
  13571. res->t_embd = cur;
  13572. // lm_head
  13573. cur = build_lora_mm(model.output, cur);
  13574. cb(cur, "result_output", -1);
  13575. res->t_logits = cur;
  13576. ggml_build_forward_expand(gf, cur);
  13577. }
  13578. };
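// PLaMo-2 hybrid graph builder: hparams.is_recurrent(il) picks, per layer, either the
// Mamba-style SSM mixer or the attention mixer defined below; both are wrapped in
// pre/post mixer RMS norms, and the FFN is a sequential SwiGLU with its own post norm.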
  13579. struct llm_build_plamo2 : public llm_graph_context_mamba {
  13580. llm_build_plamo2(const llama_model & model, const llm_graph_params & params) : llm_graph_context_mamba(params) {
  13581. ggml_tensor * cur;
  13582. ggml_tensor * inpL;
  13583. // {n_embd, n_tokens}
  13584. inpL = build_inp_embd(model.tok_embd);
  13585. cb(inpL, "embedding_output", -1);
  13586. ggml_tensor * inp_pos = build_inp_pos();
  13587. auto * inp_hybrid = build_inp_mem_hybrid();
  13588. ggml_tensor * inp_out_ids = build_inp_out_ids();
  13589. for (int il = 0; il < n_layer; ++il) {
  13590. ggml_tensor * residual = inpL;
  13591. // ggml_graph_add_node(gf, model.layers[il].attn_norm);
  13592. // cb(model.layers[il].attn_norm, "attn_norm", il);
  13593. // pre_mixer_norm
  13594. cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
  13595. // check if this layer is Mamba or Attention
  13596. bool is_mamba_layer = hparams.is_recurrent(il);
  13597. if (is_mamba_layer) {
  13598. // PLaMo-2 Mamba layer
  13599. cur = build_plamo2_mamba_layer(inp_hybrid->get_recr(), cur, model, ubatch, il);
  13600. } else {
  13601. // PLaMo-2 Attention layer
  13602. cur = build_plamo2_attn_layer(inp_hybrid->get_attn(), inp_pos, cur, model, il);
  13603. }
  13604. // post_mixer_norm
  13605. cur = build_norm(cur, model.layers[il].attn_post_norm, NULL, LLM_NORM_RMS, il);
  13606. cb(cur, "attn_post_norm", il);
  13607. // residual connection
  13608. cur = ggml_add(ctx0, cur, residual);
  13609. cb(cur, "attn_residual", il);
  13610. residual = cur;
  13611. // pre-ffn norm
  13612. cur = build_norm(cur, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il);
  13613. cb(cur, "ffn_pre_norm", il);
  13614. // feed-forward network
  13615. cur = build_ffn(cur,
  13616. model.layers[il].ffn_up, NULL, NULL,
  13617. NULL, NULL, NULL,
  13618. model.layers[il].ffn_down, NULL, NULL,
  13619. NULL,
  13620. LLM_FFN_SWIGLU, LLM_FFN_SEQ, il);
  13621. cb(cur, "ffn_out", il);
  13622. // post ffn norm
  13623. cur = build_norm(cur, model.layers[il].ffn_post_norm, NULL, LLM_NORM_RMS, il);
  13624. cb(cur, "ffn_post_norm", il);
  13625. if (il == n_layer - 1 && inp_out_ids) {
  13626. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  13627. residual = ggml_get_rows(ctx0, residual, inp_out_ids);
  13628. }
  13629. // residual connection
  13630. cur = ggml_add(ctx0, cur, residual);
  13631. cb(cur, "ffn_residual", il);
  13632. inpL = cur;
  13633. }
  13634. cur = inpL;
  13635. // final norm
  13636. cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
  13637. cb(cur, "result_norm", -1);
  13638. // lm_head
  13639. cur = build_lora_mm(model.output, cur);
  13640. cb(cur, "result_output", -1);
  13641. // Explicitly mark as output tensor to ensure proper backend assignment
  13642. ggml_set_output(cur);
  13643. res->t_logits = cur;
  13644. ggml_build_forward_expand(gf, cur);
  13645. }
  13646. private:
  13647. ggml_tensor * build_plamo2_attn_layer(
  13648. llm_graph_input_attn_kv * inp,
  13649. ggml_tensor * inp_pos,
  13650. ggml_tensor * cur,
  13651. const llama_model & model,
  13652. int il) {
  13653. // self-attention
  13654. {
  13655. // PLaMo-2 uses combined QKV tensor
  13656. ggml_tensor * qkv = build_lora_mm(model.layers[il].wqkv, cur);
  13657. cb(qkv, "wqkv", il);
  13658. // split QKV tensor into Q, K, V
  13659. const int64_t n_embd_head_q = hparams.n_embd_head_k;
  13660. const int64_t n_embd_head_k = hparams.n_embd_head_k;
  13661. const int64_t n_embd_head_v = hparams.n_embd_head_v;
  13662. int32_t n_head_kv = hparams.n_head_kv(il);
  13663. const int64_t q_offset = 0;
  13664. const int64_t k_offset = n_embd_head_q * n_head;
  13665. const int64_t v_offset = k_offset + n_embd_head_k * n_head_kv;
  13666. ggml_tensor * Qcur = ggml_view_3d(ctx0, qkv, n_embd_head_q, n_head, n_tokens, n_embd_head_q * sizeof(float), qkv->nb[1], q_offset * ggml_element_size(qkv));
  13667. ggml_tensor * Kcur = ggml_view_3d(ctx0, qkv, n_embd_head_k, n_head_kv, n_tokens, n_embd_head_k * sizeof(float), qkv->nb[1], k_offset * ggml_element_size(qkv));
  13668. ggml_tensor * Vcur = ggml_view_3d(ctx0, qkv, n_embd_head_v, n_head_kv, n_tokens, n_embd_head_v * sizeof(float), qkv->nb[1], v_offset * ggml_element_size(qkv));
  13669. cb(Qcur, "Qcur", il);
  13670. cb(Kcur, "Kcur", il);
  13671. cb(Vcur, "Vcur", il);
  13672. Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
  13673. cb(Qcur, "Qcur_normed", il);
  13674. Qcur = ggml_rope_ext(
  13675. ctx0, Qcur, inp_pos, nullptr,
  13676. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  13677. ext_factor, attn_factor, beta_fast, beta_slow
  13678. );
  13679. Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
  13680. cb(Kcur, "Kcur_normed", il);
  13681. Kcur = ggml_rope_ext(
  13682. ctx0, Kcur, inp_pos, nullptr,
  13683. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  13684. ext_factor, attn_factor, beta_fast, beta_slow
  13685. );
  13686. cur = build_attn(inp,
  13687. model.layers[il].wo, NULL,
  13688. Qcur, Kcur, Vcur, NULL, NULL, NULL, 1.0f/sqrtf(float(n_embd_head_v)), il);
  13689. }
  13690. cb(cur, "attn_out", il);
  13691. return cur;
  13692. }
  13693. ggml_tensor * build_plamo2_mamba_layer(
  13694. llm_graph_input_rs * inp,
  13695. ggml_tensor * cur,
  13696. const llama_model & model,
  13697. const llama_ubatch & ubatch,
  13698. int il) {
  13699. const auto * mctx_cur = inp->mctx;
  13700. const auto kv_head = mctx_cur->get_head();
  13701. const int64_t d_conv = hparams.ssm_d_conv;
  13702. const int64_t d_inner = hparams.ssm_d_inner;
  13703. const int64_t d_state = hparams.ssm_d_state;
  13704. const int64_t n_heads = hparams.ssm_dt_rank;
  13705. const int64_t head_dim = d_inner / n_heads;
  13706. const int64_t n_group = hparams.ssm_n_group;
  13707. const int64_t n_seqs = ubatch.n_seqs;
  13708. const int64_t n_seq_tokens = ubatch.n_seq_tokens;
  13709. GGML_ASSERT(n_seqs != 0);
  13710. GGML_ASSERT(ubatch.equal_seqs());
  13711. GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs);
  13712. ggml_tensor * conv_states_all = mctx_cur->get_r_l(il);
  13713. ggml_tensor * ssm_states_all = mctx_cur->get_s_l(il);
  13714. ggml_tensor * conv = build_rs(inp, conv_states_all, hparams.n_embd_r(), n_seqs);
  13715. conv = ggml_reshape_3d(ctx0, conv, d_conv - 1, d_inner + 2*n_group*d_state, n_seqs);
  13716. // {n_embd, n_tokens} => {n_embd, n_seq_tokens, n_seqs}
  13717. cur = ggml_reshape_3d(ctx0, cur, cur->ne[0], n_seq_tokens, n_seqs);
  13718. // in_proj: {n_embd, 2*d_inner} @ {n_embd, n_seq_tokens, n_seqs} => {2*d_inner, n_seq_tokens, n_seqs}
  13719. ggml_tensor * zx = build_lora_mm(model.layers[il].ssm_in, cur);
  13720. cb(zx, "mamba_in_proj", il);
  13721. // {8192, 5, 1, 1} -> {8192, 1, 5, 1}
  13722. zx = ggml_permute(ctx0, zx, 0, 2, 1, 3);
  13723. zx = ggml_cont_4d(ctx0, zx, head_dim * 2, n_heads, n_seq_tokens, n_seqs);
  13724. cb(zx, "mamba_in_proj_out", il);
  13725. // split into z and x
  13726. // => {head_dim * n_heads, n_seq_tokens, n_seqs}
  13727. ggml_tensor * x = ggml_view_4d(ctx0, zx, head_dim, n_heads, n_seq_tokens, n_seqs, zx->nb[1], zx->nb[2], zx->nb[3], head_dim*ggml_element_size(zx));
  13728. x = ggml_cont_3d(ctx0, x, head_dim * n_heads, n_seq_tokens, n_seqs);
  13729. // x = ggml_permute(ctx0, x, 0, 2, 1, 3);
  13730. cb(x, "mamba_x_split", il);
  13731. ggml_tensor * z = ggml_view_4d(ctx0, zx, head_dim, n_heads, n_seq_tokens, n_seqs, zx->nb[1], zx->nb[2], zx->nb[3], 0);
  13732. cb(z, "mamba_z_split", il);
  13733. // conv1d
  13734. {
  13735. // => {d_conv - 1 + n_seq_tokens, d_inner, n_seqs}
  13736. ggml_tensor * conv_x = ggml_concat(ctx0, conv, ggml_transpose(ctx0, x), 0);
  13737. cb(conv_x, "mamba_conv1d_input", il);
  13738. // copy last (d_conv - 1) columns back into the state cache
  13739. ggml_tensor * last_conv = ggml_view_3d(ctx0, conv_x, d_conv - 1, d_inner, n_seqs,
  13740. conv_x->nb[1], conv_x->nb[2], n_seq_tokens*(conv_x->nb[0]));
  13741. ggml_build_forward_expand(gf,
  13742. ggml_cpy(ctx0, last_conv,
  13743. ggml_view_1d(ctx0, conv_states_all,
  13744. (d_conv - 1)*(d_inner + 2*n_group*d_state)*(n_seqs),
  13745. kv_head*(d_conv - 1)*(d_inner + 2*n_group*d_state)*ggml_element_size(conv_states_all))));
  13746. cb(conv_states_all, "mamba_conv1d_state", il);
  13747. // 1D convolution
  13748. x = ggml_ssm_conv(ctx0, conv_x, model.layers[il].ssm_conv1d);
  13749. cb(x, "mamba_conv1d", il);
  13750. x = ggml_silu(ctx0, x);
  13751. cb(x, "mamba_conv1d_silu", il);
  13752. }
  13753. // SSM
  13754. {
  13755. // bcdt_proj: {d_inner, dt_rank + 2*d_state} @ {d_inner, n_seq_tokens, n_seqs} => {dt_rank + 2*d_state, n_seq_tokens, n_seqs}
  13756. ggml_tensor * x_bcdt = build_lora_mm(model.layers[il].ssm_x, x);
  13757. cb(x_bcdt, "mamba_bcdt_proj", il);
  13758. // split into dt, B, C
  13759. const int64_t dt_dim = std::max(64, int(hparams.n_embd / 16));
  13760. ggml_tensor * B = ggml_view_3d(ctx0, x_bcdt, d_state, n_seq_tokens, n_seqs, x_bcdt->nb[1], x_bcdt->nb[2], 0);
  13761. ggml_tensor * C = ggml_view_3d(ctx0, x_bcdt, d_state, n_seq_tokens, n_seqs, x_bcdt->nb[1], x_bcdt->nb[2], ggml_element_size(x_bcdt)*d_state);
  13762. ggml_tensor * dt = ggml_view_3d(ctx0, x_bcdt, dt_dim, n_seq_tokens, n_seqs, x_bcdt->nb[1], x_bcdt->nb[2], ggml_element_size(x_bcdt)*(2*d_state));
  13763. cb(B, "mamba_B_raw", il);
  13764. cb(C, "mamba_C_raw", il);
  13765. cb(dt, "mamba_dt_raw", il);
  13766. // Apply RMS norm to dt, B, C (PLaMo-2 specific)
  13767. B = build_norm(B, model.layers[il].ssm_b_norm, NULL, LLM_NORM_RMS, il);
  13768. C = build_norm(C, model.layers[il].ssm_c_norm, NULL, LLM_NORM_RMS, il);
  13769. dt = build_norm(dt, model.layers[il].ssm_dt_norm, NULL, LLM_NORM_RMS, il);
  13770. cb(B, "mamba_B_normed", il);
  13771. cb(C, "mamba_C_normed", il);
  13772. cb(dt, "mamba_dt_normed", il);
  13773. // dt_proj: {dt_rank, d_inner} @ {dt_rank, n_seq_tokens, n_seqs} => {d_inner, n_seq_tokens, n_seqs}
  13774. dt = build_lora_mm(model.layers[il].ssm_dt, dt);
  13775. dt = ggml_add(ctx0, dt, model.layers[il].ssm_dt_b);
  13776. cb(dt, "mamba_dt_proj", il);
  13777. ggml_tensor * A = ggml_reshape_2d(ctx0, model.layers[il].ssm_a, 1, n_heads);
  13778. cb(A, "mamba_A", il);
  13779. x = ggml_view_4d(ctx0, x, head_dim, n_heads, n_seq_tokens, n_seqs, head_dim * ggml_element_size(x), head_dim * n_heads * ggml_element_size(x), head_dim * n_heads * n_seq_tokens * ggml_element_size(x), 0);
  13780. B = ggml_view_4d(ctx0, B, d_state, 1, n_seq_tokens, n_seqs, d_state * B->nb[0], B->nb[1], B->nb[2], 0);
  13781. C = ggml_view_4d(ctx0, C, d_state, 1, n_seq_tokens, n_seqs, d_state * C->nb[0], C->nb[1], C->nb[2], 0);
13782. // use the states and the indices provided by build_rs
13783. // (this is necessary in order to use the states before they are overwritten,
13784. // while avoiding unnecessary copies of the states)
  13785. auto get_ssm_rows = [&](ggml_context * ctx, ggml_tensor * states, ggml_tensor * ids) {
  13786. ggml_tensor * ssm = ggml_reshape_4d(ctx, states, d_state, head_dim, n_heads, mctx_cur->get_size());
  13787. // Custom operator to optimize the parallel associative scan
13788. // as described in Annex D of the Mamba paper.
  13789. // => {d_inner, n_seq_tokens, n_seqs} and {d_state, d_inner, n_seqs}
  13790. return ggml_ssm_scan(ctx, ssm, x, dt, A, B, C, ids);
  13791. };
  13792. ggml_tensor * y_ssm = build_rs(inp, ssm_states_all, hparams.n_embd_s(), ubatch.n_seqs, get_ssm_rows);
  13793. cb(y_ssm, "mamba_ssm_scan", il);
  13794. // store last states
  13795. ggml_build_forward_expand(gf,
  13796. ggml_cpy(ctx0,
  13797. ggml_view_1d(ctx0, y_ssm, n_heads*head_dim*d_state*n_seqs, n_heads*head_dim*n_seq_tokens*n_seqs*ggml_element_size(y_ssm)),
  13798. ggml_view_1d(ctx0, ssm_states_all, n_heads*head_dim*d_state*n_seqs, kv_head*n_seqs*n_heads*head_dim*d_state*ggml_element_size(ssm_states_all))));
  13799. cb(ssm_states_all, "mamba_ssm_states", il);
  13800. ggml_tensor * y = ggml_view_4d(ctx0, y_ssm, head_dim, n_heads, n_seq_tokens, n_seqs, head_dim * ggml_element_size(x), head_dim * n_heads * ggml_element_size(x), head_dim * n_heads * n_seq_tokens * ggml_element_size(x), 0);
  13801. cb(y, "mamba_y_view", il);
  13802. // Add D parameter and apply gating with z
  13803. // {d_inner, n_seq_tokens, n_seqs} * {d_inner} => {d_inner, n_seq_tokens, n_seqs}
  13804. ggml_tensor * D = ggml_reshape_2d(ctx0, model.layers[il].ssm_d, 1, n_heads);
  13805. y = ggml_add(ctx0, y, ggml_mul(ctx0, x, D));
  13806. cb(y, "mamba_y_add_d", il);
  13807. y = ggml_swiglu_split(ctx0, ggml_cont(ctx0, z), y);
  13808. cb(y, "mamba_y_swiglu_z", il);
  13809. // out_proj: {d_inner, n_embd} @ {d_inner, n_seq_tokens, n_seqs} => {n_embd, n_seq_tokens, n_seqs}
  13810. y = ggml_view_3d(ctx0, y, head_dim * n_heads, n_seq_tokens, n_seqs, y->nb[2], y->nb[3], 0);
  13811. cur = build_lora_mm(model.layers[il].ssm_out, y);
  13812. cb(cur, "mamba_out_proj", il);
  13813. }
  13814. // {n_embd, n_seq_tokens, n_seqs} => {n_embd, n_tokens}
  13815. cur = ggml_reshape_2d(ctx0, cur, cur->ne[0], n_seq_tokens * n_seqs);
  13816. cb(cur, "mamba_out", il);
  13817. return cur;
  13818. }
  13819. };
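// Arcee graph builder: llama-style attention (with optional rope freq factors), but the
// FFN uses squared ReLU (LLM_FFN_RELU_SQR) applied sequentially instead of gated SiLU.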
  13820. struct llm_build_arcee : public llm_graph_context {
  13821. llm_build_arcee(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  13822. const int64_t n_embd_head = hparams.n_embd_head_v;
  13823. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  13824. GGML_ASSERT(n_embd_head == hparams.n_rot);
  13825. ggml_tensor * cur;
  13826. ggml_tensor * inpL;
  13827. inpL = build_inp_embd(model.tok_embd);
  13828. // inp_pos - contains the positions
  13829. ggml_tensor * inp_pos = build_inp_pos();
  13830. auto * inp_attn = build_attn_inp_kv();
  13831. const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
  13832. ggml_tensor * inp_out_ids = build_inp_out_ids();
  13833. for (int il = 0; il < n_layer; ++il) {
  13834. ggml_tensor * inpSA = inpL;
  13835. // norm
  13836. cur = build_norm(inpL,
  13837. model.layers[il].attn_norm, NULL,
  13838. LLM_NORM_RMS, il);
  13839. cb(cur, "attn_norm", il);
  13840. // self-attention
  13841. {
  13842. // rope freq factors for llama3; may return nullptr for llama2 and other models
  13843. ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
  13844. // compute Q and K and RoPE them
  13845. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  13846. cb(Qcur, "Qcur", il);
  13847. if (model.layers[il].bq) {
  13848. Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
  13849. cb(Qcur, "Qcur", il);
  13850. }
  13851. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  13852. cb(Kcur, "Kcur", il);
  13853. if (model.layers[il].bk) {
  13854. Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
  13855. cb(Kcur, "Kcur", il);
  13856. }
  13857. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  13858. cb(Vcur, "Vcur", il);
  13859. if (model.layers[il].bv) {
  13860. Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
  13861. cb(Vcur, "Vcur", il);
  13862. }
  13863. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  13864. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  13865. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  13866. Qcur = ggml_rope_ext(
  13867. ctx0, Qcur, inp_pos, rope_factors,
  13868. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  13869. ext_factor, attn_factor, beta_fast, beta_slow
  13870. );
  13871. Kcur = ggml_rope_ext(
  13872. ctx0, Kcur, inp_pos, rope_factors,
  13873. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  13874. ext_factor, attn_factor, beta_fast, beta_slow
  13875. );
  13876. cb(Qcur, "Qcur", il);
  13877. cb(Kcur, "Kcur", il);
  13878. cb(Vcur, "Vcur", il);
  13879. cur = build_attn(inp_attn,
  13880. model.layers[il].wo, model.layers[il].bo,
  13881. Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
  13882. cb(cur, "attn_out", il);
  13883. }
  13884. if (il == n_layer - 1 && inp_out_ids) {
  13885. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  13886. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  13887. }
  13888. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  13889. cb(ffn_inp, "ffn_inp", il);
  13890. // feed-forward network
  13891. // ARCEE uses relu^2 instead of silu
  13892. cur = build_norm(ffn_inp,
  13893. model.layers[il].ffn_norm, NULL,
  13894. LLM_NORM_RMS, il);
  13895. cb(cur, "ffn_norm", il);
  13896. cur = build_ffn(cur,
  13897. model.layers[il].ffn_up, NULL, NULL,
  13898. NULL, NULL, NULL,
  13899. model.layers[il].ffn_down, NULL, NULL,
  13900. NULL,
  13901. LLM_FFN_RELU_SQR, LLM_FFN_SEQ, il);
  13902. cb(cur, "ffn_out", il);
  13903. cur = ggml_add(ctx0, cur, ffn_inp);
  13904. cb(cur, "ffn_out", il);
  13905. cur = build_cvec(cur, il);
  13906. cb(cur, "l_out", il);
  13907. // input for next layer
  13908. inpL = cur;
  13909. }
  13910. cur = inpL;
  13911. cur = build_norm(cur,
  13912. model.output_norm, NULL,
  13913. LLM_NORM_RMS, -1);
  13914. cb(cur, "result_norm", -1);
  13915. res->t_embd = cur;
  13916. // lm_head
  13917. cur = build_lora_mm(model.output, cur);
  13918. cb(cur, "result_output", -1);
  13919. res->t_logits = cur;
  13920. ggml_build_forward_expand(gf, cur);
  13921. }
  13922. };
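// Hunyuan MoE graph builder: Q and K are RMS-normalized after RoPE, and every layer adds
// a shared-expert FFN (the *_shexp tensors) to the softmax-routed expert output.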
  13923. struct llm_build_hunyuan_moe : public llm_graph_context {
  13924. llm_build_hunyuan_moe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  13925. const int64_t n_embd_head = hparams.n_embd_head_v;
  13926. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  13927. GGML_ASSERT(n_embd_head == hparams.n_rot);
  13928. ggml_tensor * cur;
  13929. ggml_tensor * inpL;
  13930. inpL = build_inp_embd(model.tok_embd);
  13931. // inp_pos - contains the positions
  13932. ggml_tensor * inp_pos = build_inp_pos();
  13933. auto * inp_attn = build_attn_inp_kv();
  13934. const float kq_scale = 1.0f / sqrtf(float(n_embd_head));
  13935. ggml_tensor * inp_out_ids = build_inp_out_ids();
  13936. for (int il = 0; il < n_layer; ++il) {
  13937. ggml_tensor * inpSA = inpL;
  13938. // norm
  13939. cur = build_norm(inpL,
  13940. model.layers[il].attn_norm, NULL,
  13941. LLM_NORM_RMS, il);
  13942. cb(cur, "attn_norm", il);
  13943. // self-attention
  13944. {
  13945. // rope freq factors for llama3; may return nullptr for llama2 and other models
  13946. ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
  13947. // compute Q and K and RoPE them
  13948. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  13949. cb(Qcur, "Qcur", il);
  13950. if (model.layers[il].bq) {
  13951. Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
  13952. cb(Qcur, "Qcur", il);
  13953. }
  13954. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  13955. cb(Kcur, "Kcur", il);
  13956. if (model.layers[il].bk) {
  13957. Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
  13958. cb(Kcur, "Kcur", il);
  13959. }
  13960. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  13961. cb(Vcur, "Vcur", il);
  13962. if (model.layers[il].bv) {
  13963. Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
  13964. cb(Vcur, "Vcur", il);
  13965. }
  13966. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  13967. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  13968. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  13969. Qcur = ggml_rope_ext(
  13970. ctx0, Qcur, inp_pos, rope_factors,
  13971. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  13972. ext_factor, attn_factor, beta_fast, beta_slow
  13973. );
  13974. cb(Qcur, "Qcur", il);
  13975. cb(Kcur, "Kcur", il);
  13976. cb(Vcur, "Vcur", il);
  13977. Kcur = ggml_rope_ext(
  13978. ctx0, Kcur, inp_pos, rope_factors,
  13979. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  13980. ext_factor, attn_factor, beta_fast, beta_slow
  13981. );
  13982. Kcur = build_norm(Kcur,
  13983. model.layers[il].attn_k_norm, nullptr,
  13984. LLM_NORM_RMS, il);
  13985. cb(Kcur, "Kcur_norm", il);
  13986. Qcur = build_norm(Qcur,
  13987. model.layers[il].attn_q_norm, nullptr,
  13988. LLM_NORM_RMS, il);
  13989. cb(Qcur, "Qcur_norm", il);
  13990. cur = build_attn(inp_attn,
  13991. model.layers[il].wo, model.layers[il].bo,
  13992. Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
  13993. cb(cur, "attn_out", il);
  13994. }
  13995. if (il == n_layer - 1 && inp_out_ids) {
  13996. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  13997. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  13998. }
  13999. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  14000. cb(ffn_inp, "ffn_inp", il);
  14001. cur = build_norm(ffn_inp,
  14002. model.layers[il].ffn_norm, NULL,
  14003. LLM_NORM_RMS, il);
  14004. cb(cur, "ffn_norm", il);
14005. // shared-expert feed-forward network (dense branch run alongside the MoE experts)
  14006. ggml_tensor * cur_mlp = build_ffn(cur,
  14007. model.layers[il].ffn_up_shexp, NULL, NULL,
  14008. model.layers[il].ffn_gate_shexp, NULL, NULL,
  14009. model.layers[il].ffn_down_shexp, NULL, NULL,
  14010. NULL,
  14011. LLM_FFN_SILU, LLM_FFN_PAR, il);
  14012. cb(cur_mlp, "ffn_mlp", il);
  14013. // MoE branch
  14014. ggml_tensor * cur_moe = build_moe_ffn(cur,
  14015. model.layers[il].ffn_gate_inp,
  14016. model.layers[il].ffn_up_exps,
  14017. model.layers[il].ffn_gate_exps,
  14018. model.layers[il].ffn_down_exps,
  14019. nullptr,
  14020. n_expert, n_expert_used,
  14021. LLM_FFN_SILU,
  14022. true, // norm_topk_prob
  14023. false,
  14024. 0.0,
  14025. LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
  14026. il);
  14027. cb(cur_moe, "ffn_moe_out", il);
  14028. ggml_tensor * ffn_out = ggml_add(ctx0, cur_moe, cur_mlp);
  14029. cb(ffn_out, "ffn_out", il);
  14030. cur = ggml_add(ctx0, ffn_out, ffn_inp);
  14031. cur = build_cvec(cur, il);
  14032. cb(cur, "l_out", il);
  14033. // input for next layer
  14034. inpL = cur;
  14035. }
  14036. cur = inpL;
  14037. cur = build_norm(cur,
  14038. model.output_norm, NULL,
  14039. LLM_NORM_RMS, -1);
  14040. cb(cur, "result_norm", -1);
  14041. res->t_embd = cur;
  14042. // lm_head
  14043. cur = build_lora_mm(model.output, cur);
  14044. cb(cur, "result_output", -1);
  14045. res->t_logits = cur;
  14046. ggml_build_forward_expand(gf, cur);
  14047. }
  14048. };
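// Hunyuan dense graph builder: same attention path as the MoE variant (RoPE followed by
// Q/K RMS norm) with a single gated-SiLU FFN per layer instead of experts.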
  14049. struct llm_build_hunyuan_dense : public llm_graph_context {
  14050. llm_build_hunyuan_dense(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  14051. const int64_t n_embd_head = hparams.n_embd_head_v;
  14052. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  14053. GGML_ASSERT(n_embd_head == hparams.n_rot);
  14054. ggml_tensor * cur;
  14055. ggml_tensor * inpL;
  14056. inpL = build_inp_embd(model.tok_embd);
  14057. // inp_pos - contains the positions
  14058. ggml_tensor * inp_pos = build_inp_pos();
  14059. auto * inp_attn = build_attn_inp_kv();
  14060. const float kq_scale = 1.0f / sqrtf(float(n_embd_head));
  14061. ggml_tensor * inp_out_ids = build_inp_out_ids();
  14062. for (int il = 0; il < n_layer; ++il) {
  14063. ggml_tensor * inpSA = inpL;
  14064. // norm
  14065. cur = build_norm(inpL,
  14066. model.layers[il].attn_norm, NULL,
  14067. LLM_NORM_RMS, il);
  14068. cb(cur, "attn_norm", il);
  14069. // self-attention
  14070. {
  14071. // rope freq factors for llama3; may return nullptr for llama2 and other models
  14072. ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
  14073. // compute Q and K and RoPE them
  14074. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  14075. cb(Qcur, "Qcur", il);
  14076. if (model.layers[il].bq) {
  14077. Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
  14078. cb(Qcur, "Qcur", il);
  14079. }
  14080. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  14081. cb(Kcur, "Kcur", il);
  14082. if (model.layers[il].bk) {
  14083. Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
  14084. cb(Kcur, "Kcur", il);
  14085. }
  14086. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  14087. cb(Vcur, "Vcur", il);
  14088. if (model.layers[il].bv) {
  14089. Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
  14090. cb(Vcur, "Vcur", il);
  14091. }
  14092. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  14093. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  14094. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  14095. Qcur = ggml_rope_ext(
  14096. ctx0, Qcur, inp_pos, rope_factors,
  14097. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  14098. ext_factor, attn_factor, beta_fast, beta_slow
  14099. );
  14100. cb(Qcur, "Qcur", il);
  14101. cb(Kcur, "Kcur", il);
  14102. cb(Vcur, "Vcur", il);
  14103. Kcur = ggml_rope_ext(
  14104. ctx0, Kcur, inp_pos, rope_factors,
  14105. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  14106. ext_factor, attn_factor, beta_fast, beta_slow
  14107. );
  14108. Kcur = build_norm(Kcur,
  14109. model.layers[il].attn_k_norm, nullptr,
  14110. LLM_NORM_RMS, il);
  14111. cb(Kcur, "Kcur_norm", il);
  14112. Qcur = build_norm(Qcur,
  14113. model.layers[il].attn_q_norm, nullptr,
  14114. LLM_NORM_RMS, il);
  14115. cb(Qcur, "Qcur_norm", il);
  14116. cur = build_attn(inp_attn,
  14117. model.layers[il].wo, model.layers[il].bo,
  14118. Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
  14119. cb(cur, "attn_out", il);
  14120. }
  14121. if (il == n_layer - 1 && inp_out_ids) {
  14122. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  14123. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  14124. }
  14125. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  14126. cb(ffn_inp, "ffn_inp", il);
  14127. cur = build_norm(ffn_inp,
  14128. model.layers[il].ffn_norm, NULL,
  14129. LLM_NORM_RMS, il);
  14130. cb(cur, "ffn_norm", il);
  14131. // feed-forward network (non-MoE)
  14132. ggml_tensor * cur_mlp = build_ffn(cur,
  14133. model.layers[il].ffn_up, NULL, NULL,
  14134. model.layers[il].ffn_gate, NULL, NULL,
  14135. model.layers[il].ffn_down, NULL, NULL,
  14136. NULL,
  14137. LLM_FFN_SILU, LLM_FFN_PAR, il);
  14138. cb(cur_mlp, "ffn_out", il);
  14139. cur = ggml_add(ctx0, cur_mlp, ffn_inp);
  14140. cur = build_cvec(cur, il);
  14141. cb(cur, "l_out", il);
  14142. // input for next layer
  14143. inpL = cur;
  14144. }
  14145. cur = inpL;
  14146. cur = build_norm(cur,
  14147. model.output_norm, NULL,
  14148. LLM_NORM_RMS, -1);
  14149. cb(cur, "result_norm", -1);
  14150. res->t_embd = cur;
  14151. // lm_head
  14152. cur = build_lora_mm(model.output, cur);
  14153. cb(cur, "result_output", -1);
  14154. res->t_logits = cur;
  14155. ggml_build_forward_expand(gf, cur);
  14156. }
  14157. };
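// SmolLM3 graph builder: llama-style layers, except that RoPE is skipped on every
// n_no_rope_layer_step-th layer (see use_rope below), leaving Q/K unrotated there.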
  14158. struct llm_build_smollm3 : public llm_graph_context {
  14159. llm_build_smollm3(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  14160. const int64_t n_embd_head = hparams.n_embd_head_v;
  14161. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  14162. GGML_ASSERT(n_embd_head == hparams.n_rot);
  14163. ggml_tensor * cur;
  14164. ggml_tensor * inpL;
  14165. inpL = build_inp_embd(model.tok_embd);
  14166. // inp_pos - contains the positions
  14167. ggml_tensor * inp_pos = build_inp_pos();
  14168. auto * inp_attn = build_attn_inp_kv();
  14169. const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
  14170. ggml_tensor * inp_out_ids = build_inp_out_ids();
  14171. for (int il = 0; il < n_layer; ++il) {
  14172. ggml_tensor * inpSA = inpL;
  14173. const bool use_rope = (il + 1) % hparams.n_no_rope_layer_step != 0;
  14174. // norm
  14175. cur = build_norm(inpL,
  14176. model.layers[il].attn_norm, NULL,
  14177. LLM_NORM_RMS, il);
  14178. cb(cur, "attn_norm", il);
  14179. // self-attention
  14180. {
  14181. // compute Q and K and RoPE them
  14182. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  14183. cb(Qcur, "Qcur", il);
  14184. if (model.layers[il].bq) {
  14185. Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
  14186. cb(Qcur, "Qcur", il);
  14187. }
  14188. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  14189. cb(Kcur, "Kcur", il);
  14190. if (model.layers[il].bk) {
  14191. Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
  14192. cb(Kcur, "Kcur", il);
  14193. }
  14194. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  14195. cb(Vcur, "Vcur", il);
  14196. if (model.layers[il].bv) {
  14197. Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
  14198. cb(Vcur, "Vcur", il);
  14199. }
  14200. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  14201. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  14202. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  14203. if (use_rope) {
  14204. Qcur = ggml_rope_ext(
  14205. ctx0, Qcur, inp_pos, nullptr,
  14206. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  14207. ext_factor, attn_factor, beta_fast, beta_slow
  14208. );
  14209. Kcur = ggml_rope_ext(
  14210. ctx0, Kcur, inp_pos, nullptr,
  14211. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  14212. ext_factor, attn_factor, beta_fast, beta_slow
  14213. );
  14214. }
  14215. cb(Qcur, "Qcur", il);
  14216. cb(Kcur, "Kcur", il);
  14217. cb(Vcur, "Vcur", il);
  14218. cur = build_attn(inp_attn,
  14219. model.layers[il].wo, model.layers[il].bo,
  14220. Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
  14221. cb(cur, "attn_out", il);
  14222. }
  14223. if (il == n_layer - 1 && inp_out_ids) {
  14224. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  14225. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  14226. }
  14227. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  14228. cb(ffn_inp, "ffn_inp", il);
  14229. // feed-forward network
  14230. {
  14231. cur = build_norm(ffn_inp,
  14232. model.layers[il].ffn_norm, NULL,
  14233. LLM_NORM_RMS, il);
  14234. cb(cur, "ffn_norm", il);
  14235. cur = build_ffn(cur,
  14236. model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
  14237. model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
  14238. model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
  14239. NULL,
  14240. LLM_FFN_SILU, LLM_FFN_PAR, il);
  14241. cb(cur, "ffn_out", il);
  14242. }
  14243. cur = ggml_add(ctx0, cur, ffn_inp);
  14244. cb(cur, "ffn_out", il);
  14245. cur = build_cvec(cur, il);
  14246. cb(cur, "l_out", il);
  14247. // input for next layer
  14248. inpL = cur;
  14249. }
  14250. cur = inpL;
  14251. cur = build_norm(cur,
  14252. model.output_norm, NULL,
  14253. LLM_NORM_RMS, -1);
  14254. cb(cur, "result_norm", -1);
  14255. res->t_embd = cur;
  14256. // lm_head
  14257. cur = build_lora_mm(model.output, cur);
  14258. cb(cur, "result_output", -1);
  14259. res->t_logits = cur;
  14260. ggml_build_forward_expand(gf, cur);
  14261. }
  14262. };
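// OpenAI MoE graph builder with interleaved sliding-window attention (iswa): attention
// uses per-layer sinks (attn_sinks) and a post-attention RMS norm, and the MoE FFN
// carries per-expert biases with the SWIGLU_OAI activation and softmax-weight gating.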
  14263. struct llm_build_openai_moe_iswa : public llm_graph_context {
  14264. llm_build_openai_moe_iswa(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  14265. ggml_tensor * cur;
  14266. ggml_tensor * inpL;
  14267. inpL = build_inp_embd(model.tok_embd);
  14268. // inp_pos - contains the positions
  14269. ggml_tensor * inp_pos = build_inp_pos();
  14270. auto * inp_attn = build_attn_inp_kv_iswa();
  14271. for (int il = 0; il < n_layer; ++il) {
  14272. ggml_tensor * inpSA = inpL;
  14273. // norm
  14274. cur = build_norm(inpL,
  14275. model.layers[il].attn_norm, nullptr,
  14276. LLM_NORM_RMS, il);
  14277. cb(cur, "attn_norm", il);
  14278. // self-attention
  14279. {
  14280. // compute Q and K and RoPE them
  14281. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  14282. cb(Qcur, "Qcur", il);
  14283. if (model.layers[il].bq) {
  14284. Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
  14285. cb(Qcur, "Qcur", il);
  14286. }
  14287. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  14288. cb(Kcur, "Kcur", il);
  14289. if (model.layers[il].bk) {
  14290. Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
  14291. cb(Kcur, "Kcur", il);
  14292. }
  14293. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  14294. cb(Vcur, "Vcur", il);
  14295. if (model.layers[il].bv) {
  14296. Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
  14297. cb(Vcur, "Vcur", il);
  14298. }
  14299. Qcur = ggml_reshape_3d(ctx0, Qcur, n_rot, n_head, n_tokens);
  14300. Kcur = ggml_reshape_3d(ctx0, Kcur, n_rot, n_head_kv, n_tokens);
  14301. Vcur = ggml_reshape_3d(ctx0, Vcur, n_rot, n_head_kv, n_tokens);
  14302. Qcur = ggml_rope_ext(
  14303. ctx0, Qcur, inp_pos, nullptr,
  14304. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  14305. ext_factor, attn_factor, beta_fast, beta_slow
  14306. );
  14307. Kcur = ggml_rope_ext(
  14308. ctx0, Kcur, inp_pos, nullptr,
  14309. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  14310. ext_factor, attn_factor, beta_fast, beta_slow
  14311. );
  14312. cb(Qcur, "Qcur", il);
  14313. cb(Kcur, "Kcur", il);
  14314. cb(Vcur, "Vcur", il);
  14315. cur = build_attn(inp_attn,
  14316. model.layers[il].wo, model.layers[il].bo,
  14317. Qcur, Kcur, Vcur, nullptr, model.layers[il].attn_sinks, nullptr, 1.0f/sqrtf(float(n_rot)), il);
  14318. cb(cur, "attn_out", il);
  14319. }
  14320. if (il == n_layer - 1) {
  14321. // skip computing output for unused tokens
  14322. ggml_tensor * inp_out_ids = build_inp_out_ids();
  14323. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  14324. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  14325. }
  14326. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  14327. cb(ffn_inp, "ffn_inp", il);
  14328. cur = ffn_inp;
  14329. cur = build_norm(cur,
  14330. model.layers[il].attn_post_norm, nullptr,
  14331. LLM_NORM_RMS, il);
  14332. cb(cur, "attn_post_norm", il);
  14333. // MoE branch
  14334. cur = build_moe_ffn(cur,
  14335. model.layers[il].ffn_gate_inp, model.layers[il].ffn_gate_inp_b,
  14336. model.layers[il].ffn_up_exps, model.layers[il].ffn_up_exps_b,
  14337. model.layers[il].ffn_gate_exps, model.layers[il].ffn_gate_exps_b,
  14338. model.layers[il].ffn_down_exps, model.layers[il].ffn_down_exps_b,
  14339. nullptr,
  14340. n_expert, n_expert_used,
  14341. LLM_FFN_SWIGLU_OAI_MOE, false,
  14342. false, 0.0,
  14343. LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX_WEIGHT,
  14344. il);
  14345. cb(cur, "ffn_moe_out", il);
  14346. cur = ggml_add(ctx0, cur, ffn_inp);
  14347. cur = build_cvec(cur, il);
  14348. cb(cur, "l_out", il);
  14349. // input for next layer
  14350. inpL = cur;
  14351. }
  14352. cur = inpL;
  14353. cur = build_norm(cur,
  14354. model.output_norm, NULL,
  14355. LLM_NORM_RMS, -1);
  14356. cb(cur, "result_norm", -1);
  14357. res->t_embd = cur;
  14358. // lm_head
  14359. cur = build_lora_mm(model.output, cur);
  14360. cb(cur, "result_output", -1);
  14361. res->t_logits = cur;
  14362. ggml_build_forward_expand(gf, cur);
  14363. }
  14364. };
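// LFM2 graph builder: recurrent layers use a gated short-convolution block backed by a
// small rolling conv cache, the remaining layers use attention with Q/K RMS norm and
// RoPE; every layer then adds a gated-SiLU feed-forward block.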
  14365. struct llm_build_lfm2 : public llm_graph_context {
  14366. const llama_model & model;
  14367. llm_build_lfm2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params), model(model) {
  14368. ggml_tensor * cur = build_inp_embd(model.tok_embd);
  14369. cb(cur, "model.embed_tokens", -1);
  14370. ggml_tensor * inp_pos = build_inp_pos();
  14371. auto * inp_hybrid = build_inp_mem_hybrid();
  14372. ggml_tensor * inp_out_ids = build_inp_out_ids();
  14373. for (int il = 0; il < n_layer; ++il) {
  14374. auto * prev_cur = cur;
  14375. cur = build_norm(cur, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
  14376. cb(cur, "model.layers.{}.operator_norm", il);
  14377. cur = hparams.is_recurrent(il) ?
  14378. build_shortconv_block(cur, inp_hybrid->get_recr(), il) :
14379. build_attn_block(cur, inp_pos, inp_hybrid->get_attn(), il);
  14380. if (il == n_layer - 1 && inp_out_ids) {
  14381. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  14382. prev_cur = ggml_get_rows(ctx0, prev_cur, inp_out_ids);
  14383. }
  14384. cur = ggml_add(ctx0, prev_cur, cur);
  14385. cur = ggml_add(ctx0, cur, build_feed_forward(cur, il));
  14386. }
  14387. cur = build_norm(cur, model.tok_norm, NULL, LLM_NORM_RMS, -1);
  14388. cb(cur, "model.embedding_norm", -1);
  14389. res->t_embd = cur;
  14390. cur = build_lora_mm(model.output, cur);
  14391. cb(cur, "lm_head", -1);
  14392. res->t_logits = cur;
  14393. ggml_build_forward_expand(gf, cur);
  14394. }
  14395. ggml_tensor * build_feed_forward(ggml_tensor * cur,
  14396. int il) const {
  14397. cur = build_norm(cur, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il);
  14398. cb(cur, "model.layers.{}.ffn_norm", il);
  14399. GGML_ASSERT(!model.layers[il].ffn_up_b);
  14400. GGML_ASSERT(!model.layers[il].ffn_gate_b);
  14401. GGML_ASSERT(!model.layers[il].ffn_down_b);
  14402. cur = build_ffn(cur,
  14403. model.layers[il].ffn_up, NULL, NULL,
  14404. model.layers[il].ffn_gate, NULL, NULL,
  14405. model.layers[il].ffn_down, NULL, NULL,
  14406. NULL,
  14407. LLM_FFN_SILU, LLM_FFN_PAR, il);
  14408. cb(cur, "model.layers.{}.feed_forward.w2", il);
  14409. return cur;
  14410. }
  14411. ggml_tensor * build_attn_block(ggml_tensor * cur,
  14412. ggml_tensor * inp_pos,
  14413. llm_graph_input_attn_kv * inp_attn,
  14414. int il) const {
  14415. GGML_ASSERT(hparams.n_embd_v_gqa(il) == hparams.n_embd_k_gqa(il));
  14416. auto const n_embd_head = hparams.n_embd_head_v;
  14417. auto const n_head_kv = hparams.n_head_kv(il);
  14418. auto * q = build_lora_mm(model.layers[il].wq, cur);
  14419. cb(q, "model.layers.{}.self_attn.q_proj", il);
  14420. auto * k = build_lora_mm(model.layers[il].wk, cur);
  14421. cb(k, "model.layers.{}.self_attn.k_proj", il);
  14422. auto * v = build_lora_mm(model.layers[il].wv, cur);
  14423. cb(v, "model.layers.{}.self_attn.v_proj", il);
  14424. q = ggml_reshape_3d(ctx0, q, n_embd_head, n_head, n_tokens);
  14425. k = ggml_reshape_3d(ctx0, k, n_embd_head, n_head_kv, n_tokens);
  14426. v = ggml_reshape_3d(ctx0, v, n_embd_head, n_head_kv, n_tokens);
  14427. // qk norm
  14428. q = build_norm(q, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
  14429. cb(q, "model.layers.{}.self_attn.q_layernorm", il);
  14430. k = build_norm(k, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
  14431. cb(k, "model.layers.{}.self_attn.k_layernorm", il);
  14432. // RoPE
  14433. q = ggml_rope_ext(
  14434. ctx0, q, inp_pos, nullptr,
  14435. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  14436. ext_factor, attn_factor, beta_fast, beta_slow
  14437. );
  14438. k = ggml_rope_ext(
  14439. ctx0, k, inp_pos, nullptr,
  14440. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  14441. ext_factor, attn_factor, beta_fast, beta_slow
  14442. );
  14443. cur = build_attn(inp_attn, model.layers[il].wo, NULL,
  14444. q, k, v, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  14445. cb(cur, "model.layers.{}.self_attn.out_proj", il);
  14446. return cur;
  14447. }
  14448. ggml_tensor * build_shortconv_block(ggml_tensor * cur,
  14449. llm_graph_input_rs * inp_recr,
  14450. int il) {
  14451. const auto * mctx_cur = static_cast<const llama_memory_hybrid_context *>(mctx)->get_recr();
  14452. const uint32_t kv_head = mctx_cur->get_head();
  14453. const int64_t n_seq_tokens = ubatch.n_seq_tokens;
  14454. const int64_t n_seqs = ubatch.n_seqs;
  14455. GGML_ASSERT(n_seqs != 0);
  14456. GGML_ASSERT(ubatch.equal_seqs());
  14457. GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs);
  14458. GGML_ASSERT(hparams.n_shortconv_l_cache > 1);
  14459. const uint32_t d_conv = hparams.n_shortconv_l_cache - 1;
  14460. // {n_embd, n_tokens} => {n_embd, n_seq_tokens, n_seqs}
  14461. cur = ggml_reshape_3d(ctx0, cur, cur->ne[0], n_seq_tokens, n_seqs);
  14462. auto * bcx = build_lora_mm(model.layers[il].shortconv.in_proj, cur);
  14463. cb(bcx, "model.layers.{}.conv.in_proj", il);
  14464. constexpr auto n_chunks = 3;
  14465. GGML_ASSERT(bcx->ne[0] % n_chunks == 0);
  14466. auto const chunk_size = bcx->ne[0] / n_chunks;
  14467. auto * b = ggml_view_3d(ctx0, bcx, chunk_size, bcx->ne[1], bcx->ne[2], bcx->nb[1], bcx->nb[2], 0*chunk_size*ggml_element_size(bcx));
  14468. auto * c = ggml_view_3d(ctx0, bcx, chunk_size, bcx->ne[1], bcx->ne[2], bcx->nb[1], bcx->nb[2], 1*chunk_size*ggml_element_size(bcx));
  14469. auto * x = ggml_view_3d(ctx0, bcx, chunk_size, bcx->ne[1], bcx->ne[2], bcx->nb[1], bcx->nb[2], 2*chunk_size*ggml_element_size(bcx));
  14470. auto * bx = ggml_transpose(ctx0, ggml_mul(ctx0, b, x));
  14471. // read conv state
  14472. auto * conv_state = mctx_cur->get_r_l(il);
  14473. auto * conv_rs = build_rs(inp_recr, conv_state, hparams.n_embd_r(), n_seqs);
  14474. auto * conv = ggml_reshape_3d(ctx0, conv_rs, d_conv, hparams.n_embd, n_seqs);
  14475. bx = ggml_concat(ctx0, conv, bx, 0);
  14476. GGML_ASSERT(bx->ne[0] > conv->ne[0]);
14477. // the last d_conv columns are the new conv state
  14478. auto * new_conv = ggml_view_3d(ctx0, bx, conv->ne[0], bx->ne[1], bx->ne[2], bx->nb[1], bx->nb[2], (bx->ne[0] - conv->ne[0])*ggml_element_size(bx));
  14479. GGML_ASSERT(ggml_are_same_shape(conv, new_conv));
14480. // write the new conv state back
  14481. ggml_build_forward_expand(
  14482. gf,
  14483. ggml_cpy(
  14484. ctx0,
  14485. new_conv,
  14486. ggml_view_1d(
  14487. ctx0,
  14488. conv_state,
  14489. ggml_nelements(new_conv),
  14490. kv_head*d_conv*n_embd*ggml_element_size(new_conv)
  14491. )
  14492. )
  14493. );
  14494. auto * conv_kernel = model.layers[il].shortconv.conv;
  14495. auto * conv_out = ggml_ssm_conv(ctx0, bx, conv_kernel);
  14496. cb(conv_out, "model.layers.{}.conv.conv", il);
  14497. auto * y = ggml_mul(ctx0, c, conv_out);
  14498. y = build_lora_mm(model.layers[il].shortconv.out_proj, y);
  14499. cb(y, "model.layers.{}.conv.out_proj", il);
  14500. // {n_embd, n_seq_tokens, n_seqs} => {n_embd, n_tokens}
  14501. y = ggml_reshape_2d(ctx0, y, y->ne[0], n_seq_tokens * n_seqs);
  14502. return y;
  14503. }
  14504. };
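// Seed-OSS graph builder: llama-style attention, with attn_post_norm applied to the
// residual sum in place of the usual ffn_norm before the gated-SiLU FFN.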
  14505. struct llm_build_seed_oss : public llm_graph_context {
  14506. llm_build_seed_oss(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  14507. const int64_t n_embd_head = hparams.n_embd_head_v;
  14508. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  14509. GGML_ASSERT(n_embd_head == hparams.n_rot);
  14510. ggml_tensor * cur;
  14511. ggml_tensor * inpL;
  14512. inpL = build_inp_embd(model.tok_embd);
  14513. // inp_pos - contains the positions
  14514. ggml_tensor * inp_pos = build_inp_pos();
  14515. auto * inp_attn = build_attn_inp_kv();
  14516. const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
  14517. ggml_tensor * inp_out_ids = build_inp_out_ids();
  14518. for (int il = 0; il < n_layer; ++il) {
  14519. ggml_tensor * inpSA = inpL;
  14520. // norm
  14521. cur = build_norm(inpL,
  14522. model.layers[il].attn_norm, NULL,
  14523. LLM_NORM_RMS, il);
  14524. cb(cur, "attn_norm", il);
  14525. // self-attention
  14526. {
  14527. // compute Q and K and RoPE them
  14528. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  14529. cb(Qcur, "Qcur", il);
  14530. if (model.layers[il].bq) {
  14531. Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
  14532. cb(Qcur, "Qcur", il);
  14533. }
  14534. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  14535. cb(Kcur, "Kcur", il);
  14536. if (model.layers[il].bk) {
  14537. Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
  14538. cb(Kcur, "Kcur", il);
  14539. }
  14540. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  14541. cb(Vcur, "Vcur", il);
  14542. if (model.layers[il].bv) {
  14543. Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
  14544. cb(Vcur, "Vcur", il);
  14545. }
  14546. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  14547. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  14548. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  14549. Qcur = ggml_rope_ext(
  14550. ctx0, Qcur, inp_pos, nullptr,
  14551. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  14552. ext_factor, attn_factor, beta_fast, beta_slow
  14553. );
  14554. Kcur = ggml_rope_ext(
  14555. ctx0, Kcur, inp_pos, nullptr,
  14556. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  14557. ext_factor, attn_factor, beta_fast, beta_slow
  14558. );
  14559. cb(Qcur, "Qcur", il);
  14560. cb(Kcur, "Kcur", il);
  14561. cb(Vcur, "Vcur", il);
  14562. cur = build_attn(inp_attn,
  14563. model.layers[il].wo, model.layers[il].bo,
  14564. Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
  14565. cb(cur, "attn_out", il);
  14566. }
  14567. if (il == n_layer - 1 && inp_out_ids) {
  14568. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  14569. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  14570. }
  14571. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  14572. cb(ffn_inp, "ffn_inp", il);
  14573. // feed-forward network
  14574. cur = build_norm(ffn_inp,
  14575. model.layers[il].attn_post_norm, NULL,
  14576. LLM_NORM_RMS, il);
  14577. cb(cur, "attn_post_norm", il);
  14578. cur = build_ffn(cur,
  14579. model.layers[il].ffn_up, NULL, NULL,
  14580. model.layers[il].ffn_gate, NULL, NULL,
  14581. model.layers[il].ffn_down, NULL, NULL,
  14582. NULL,
  14583. LLM_FFN_SILU, LLM_FFN_PAR, il);
  14584. cb(cur, "ffn_out", il);
  14585. cur = ggml_add(ctx0, cur, ffn_inp);
  14586. cb(cur, "ffn_out", il);
  14587. cur = build_cvec(cur, il);
  14588. cb(cur, "l_out", il);
  14589. // input for next layer
  14590. inpL = cur;
  14591. }
  14592. cur = inpL;
  14593. cur = build_norm(cur,
  14594. model.output_norm, NULL,
  14595. LLM_NORM_RMS, -1);
  14596. cb(cur, "result_norm", -1);
  14597. res->t_embd = cur;
  14598. // lm_head
  14599. cur = build_lora_mm(model.output, cur);
  14600. cb(cur, "result_output", -1);
  14601. res->t_logits = cur;
  14602. ggml_build_forward_expand(gf, cur);
  14603. }
  14604. };
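// SmallThinker graph builder, templated on iswa (sliding-window vs. full KV attention):
// router logits are computed from the pre-norm hidden state and forwarded to
// build_moe_ffn, and RoPE is skipped on layers whose index is a multiple of
// n_no_rope_layer_step (unless that step equals n_layer).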
  14605. template <bool iswa>
  14606. struct llm_build_smallthinker : public llm_graph_context{
  14607. llm_build_smallthinker(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params){
  14608. const int64_t n_embd_head = hparams.n_embd_head_v;
  14609. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  14610. GGML_ASSERT(n_embd_head == hparams.n_rot);
  14611. ggml_tensor * cur;
  14612. ggml_tensor * inpL;
  14613. inpL = build_inp_embd(model.tok_embd);
  14614. // inp_pos - contains the positions
  14615. ggml_tensor * inp_pos = build_inp_pos();
  14616. using inp_attn_type = std::conditional_t<iswa, llm_graph_input_attn_kv_iswa, llm_graph_input_attn_kv>;
  14617. inp_attn_type * inp_attn = nullptr;
  14618. if constexpr (iswa) {
  14619. inp_attn = build_attn_inp_kv_iswa();
  14620. } else {
  14621. inp_attn = build_attn_inp_kv();
  14622. }
  14623. ggml_tensor * inp_out_ids = build_inp_out_ids();
  14624. for (int il = 0; il < n_layer; ++il) {
  14625. ggml_tensor * inpSA = inpL;
  14626. ggml_tensor * probs = nullptr;
  14627. probs = build_lora_mm(model.layers[il].ffn_gate_inp, inpL); // [n_expert, n_tokens]
  14628. cb(probs, "ffn_moe_logits", il);
  14629. // norm
  14630. cur = build_norm(inpL,model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
  14631. cb(cur, "attn_norm", il);
14632. // self-attention
  14633. {
  14634. // compute Q and K and RoPE them
  14635. struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  14636. cb(Qcur, "Qcur", il);
  14637. struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  14638. cb(Kcur, "Kcur", il);
  14639. struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  14640. cb(Vcur, "Vcur", il);
  14641. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  14642. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  14643. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  14644. if (hparams.n_no_rope_layer_step == n_layer || il % hparams.n_no_rope_layer_step != 0) {
  14645. Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  14646. ext_factor, attn_factor, beta_fast, beta_slow);
  14647. Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  14648. ext_factor, attn_factor, beta_fast, beta_slow);
  14649. }
  14650. cb(Qcur, "Qcur", il);
  14651. cb(Kcur, "Kcur", il);
  14652. cur = build_attn(inp_attn,
  14653. model.layers[il].wo, model.layers[il].bo,
  14654. Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
  14655. }
  14656. if (il == n_layer - 1 && inp_out_ids) {
  14657. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  14658. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  14659. probs = ggml_get_rows(ctx0, probs, inp_out_ids);
  14660. }
  14661. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  14662. cb(ffn_inp, "ffn_inp", il);
  14663. // MoE branch
  14664. cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il);
  14665. cb(cur, "ffn_norm", il);
  14666. ggml_tensor * ffn_out =
  14667. build_moe_ffn(cur,
  14668. nullptr,
  14669. model.layers[il].ffn_up_exps,
  14670. model.layers[il].ffn_gate_exps,
  14671. model.layers[il].ffn_down_exps,
  14672. nullptr,
  14673. n_expert, n_expert_used,
  14674. LLM_FFN_RELU, true,
  14675. false, 0.0,
  14676. static_cast<llama_expert_gating_func_type>(hparams.expert_gating_func),
  14677. il, probs);
  14678. cb(ffn_out, "ffn_out", il);
  14679. cur = ffn_out;
  14680. cur = ggml_add(ctx0, cur, ffn_inp);
  14681. cur = build_cvec(cur, il);
  14682. cb(cur, "l_out", il);
  14683. // input for next layer
  14684. inpL = cur;
  14685. }
  14686. cur = inpL;
  14687. cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
  14688. cb(cur, "result_norm", -1);
  14689. // lm_head
  14690. cur = build_lora_mm(model.output, cur);
  14691. cb(cur, "result_output", -1);
  14692. res->t_logits = cur;
  14693. ggml_build_forward_expand(gf, cur);
  14694. }
  14695. };
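
// Overview of create_memory() below: the architecture decides which memory implementation backs
// the context. In rough terms (the branches below are authoritative):
//
//   - cache-less models (BERT-style encoders, WavTokenizer, diffusion models)  -> res = nullptr
//   - recurrent models (llm_arch_is_recurrent)                                 -> llama_memory_recurrent
//   - hybrid attention + recurrent models (llm_arch_is_hybrid)                 -> llama_memory_hybrid
//   - everything else: llama_kv_cache_iswa when a sliding window is configured,
//     otherwise the plain llama_kv_cache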
llama_memory_i * llama_model::create_memory(const llama_memory_params & params, llama_cparams & cparams) const {
    llama_memory_i * res;

    switch (arch) {
        // Models that need specific instantiation should be handled in the
        // switch statement
        case LLM_ARCH_BERT:
        case LLM_ARCH_JINA_BERT_V2:
        case LLM_ARCH_JINA_BERT_V3:
        case LLM_ARCH_NOMIC_BERT:
        case LLM_ARCH_NOMIC_BERT_MOE:
        case LLM_ARCH_NEO_BERT:
        case LLM_ARCH_WAVTOKENIZER_DEC:
        //case LLM_ARCH_GEMMA_EMBEDDING: // TODO: disabled until the cacheless SWA logic is fixed [TAG_NO_CACHE_ISWA]
        case LLM_ARCH_DREAM:
        case LLM_ARCH_LLADA:
            {
                res = nullptr;
            } break;
        // Models that need standard caching should rely on recurrent/hybrid
        // checks
        default:
            {
                if (llm_arch_is_recurrent(arch)) {
                    res = new llama_memory_recurrent(
                            *this,
                            GGML_TYPE_F32,
                            GGML_TYPE_F32,
                            cparams.offload_kqv,
                            std::max((uint32_t) 1, cparams.n_seq_max),
                            cparams.n_seq_max,
                            nullptr);
                } else if (llm_arch_is_hybrid(arch)) {
                    // The main difference between hybrid architectures is the
                    // layer filters, so pick the right one here
                    llama_memory_hybrid::layer_filter_cb filter_attn = nullptr;
                    llama_memory_hybrid::layer_filter_cb filter_recr = nullptr;

                    if (arch == LLM_ARCH_FALCON_H1) {
                        filter_attn = [&](int32_t) { return true; };
                        filter_recr = [&](int32_t) { return true; };
                    } else if (arch == LLM_ARCH_NEMOTRON_H) {
                        filter_attn = [&](int32_t il) {
                            return !hparams.is_recurrent(il) && hparams.n_ff(il) == 0;
                        };
                        filter_recr = [&](int32_t il) {
                            return hparams.is_recurrent(il) && hparams.n_ff(il) == 0;
                        };
                    }

                    const auto padding = llama_kv_cache::get_padding(cparams);

                    cparams.n_ctx = GGML_PAD(cparams.n_ctx, padding);

                    res = new llama_memory_hybrid(
                            /* model             */ *this,
                            /* attn_type_k       */ params.type_k,
                            /* attn_type_v       */ params.type_v,
                            /* attn_v_trans      */ !cparams.flash_attn,
                            /* attn_kv_size      */ cparams.n_ctx,
                            /* attn_n_pad        */ padding,
                            /* attn_n_swa        */ hparams.n_swa,
                            /* attn_swa_type     */ hparams.swa_type,
                            /* recurrent_type_k  */ GGML_TYPE_F32,
                            /* recurrent_type_v  */ GGML_TYPE_F32,
                            /* recurrent_kv_size */ std::max((uint32_t) 1, cparams.n_seq_max),
                            /* n_seq_max         */ cparams.n_seq_max,
                            /* offload           */ cparams.offload_kqv,
                            /* unified           */ cparams.kv_unified,
                            /* filter_attn       */ std::move(filter_attn),
                            /* filter_recr       */ std::move(filter_recr));
                } else {
                    const auto padding = llama_kv_cache::get_padding(cparams);

                    uint32_t n_ctx_per_stream = cparams.n_ctx;

                    if (!cparams.kv_unified) {
                        n_ctx_per_stream = (cparams.n_ctx + cparams.n_seq_max - 1)/cparams.n_seq_max;
                        n_ctx_per_stream = GGML_PAD(n_ctx_per_stream, padding);

                        cparams.n_ctx = n_ctx_per_stream*cparams.n_seq_max;
                    } else {
                        n_ctx_per_stream = GGML_PAD(n_ctx_per_stream, padding);

                        cparams.n_ctx = n_ctx_per_stream;
                    }

                    LLAMA_LOG_DEBUG("%s: n_ctx = %u (padded)\n", __func__, cparams.n_ctx);

                    llama_memory_i::layer_reuse_cb reuse = nullptr;

                    if (arch == LLM_ARCH_GEMMA3N) {
                        reuse = [&](int32_t il) {
                            if (il >= (int32_t) hparams.n_layer_kv_from_start) {
                                return (int32_t) hparams.n_layer_kv_from_start - (hparams.is_swa(il) ? 2 : 1);
                            }

                            return -1;
                        };
                    }

                    if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
                        GGML_ASSERT(hparams.is_swa_any());

                        res = new llama_kv_cache_iswa(
                                *this,
                                params.type_k,
                                params.type_v,
                                !cparams.flash_attn,
                                cparams.offload_kqv,
                                params.swa_full,
                                cparams.kv_unified,
                                n_ctx_per_stream,
                                cparams.n_seq_max,
                                cparams.n_ubatch,
                                padding,
                                nullptr,
                                reuse);
                    } else {
                        GGML_ASSERT(!hparams.is_swa_any());

                        res = new llama_kv_cache(
                                *this,
                                params.type_k,
                                params.type_v,
                                !cparams.flash_attn,
                                cparams.offload_kqv,
                                cparams.kv_unified,
                                n_ctx_per_stream,
                                cparams.n_seq_max,
                                padding,
                                hparams.n_swa,
                                hparams.swa_type,
                                nullptr,
                                nullptr);
                    }
                }
            }
    }

    return res;
}
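
// build_graph() dispatches on the architecture enum and instantiates the matching llm_build_*
// graph context. A few architectures select a template specialization based on hparams (e.g.
// sliding-window vs. full attention for PHI3, EXAONE4 and SMALLTHINKER) or on the requested
// graph type (T5 encoder vs. decoder). The pooling layer is appended afterwards and the
// finished ggml graph is returned.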
ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
    std::unique_ptr<llm_graph_context> llm;

    switch (arch) {
        case LLM_ARCH_LLAMA:
            {
                llm = std::make_unique<llm_build_llama>(*this, params);
            } break;
        case LLM_ARCH_LLAMA4:
            {
                llm = std::make_unique<llm_build_llama_iswa>(*this, params);
            } break;
        case LLM_ARCH_DECI:
            {
                llm = std::make_unique<llm_build_deci>(*this, params);
            } break;
        case LLM_ARCH_BAICHUAN:
            {
                llm = std::make_unique<llm_build_baichuan>(*this, params);
            } break;
        case LLM_ARCH_FALCON:
            {
                llm = std::make_unique<llm_build_falcon>(*this, params);
            } break;
        case LLM_ARCH_GROK:
            {
                llm = std::make_unique<llm_build_grok>(*this, params);
            } break;
        case LLM_ARCH_STARCODER:
            {
                llm = std::make_unique<llm_build_starcoder>(*this, params);
            } break;
        case LLM_ARCH_REFACT:
            {
                llm = std::make_unique<llm_build_refact>(*this, params);
            } break;
        case LLM_ARCH_BERT:
        case LLM_ARCH_JINA_BERT_V2:
        case LLM_ARCH_JINA_BERT_V3:
        case LLM_ARCH_NOMIC_BERT:
        case LLM_ARCH_NOMIC_BERT_MOE:
            {
                llm = std::make_unique<llm_build_bert>(*this, params);
            } break;
        case LLM_ARCH_NEO_BERT:
            {
                llm = std::make_unique<llm_build_neo_bert>(*this, params);
            } break;
        case LLM_ARCH_BLOOM:
            {
                llm = std::make_unique<llm_build_bloom>(*this, params);
            } break;
        case LLM_ARCH_MPT:
            {
                llm = std::make_unique<llm_build_mpt>(*this, params);
            } break;
        case LLM_ARCH_STABLELM:
            {
                llm = std::make_unique<llm_build_stablelm>(*this, params);
            } break;
        case LLM_ARCH_QWEN:
            {
                llm = std::make_unique<llm_build_qwen>(*this, params);
            } break;
        case LLM_ARCH_QWEN2:
            {
                llm = std::make_unique<llm_build_qwen2>(*this, params);
            } break;
        case LLM_ARCH_DREAM:
            {
                llm = std::make_unique<llm_build_dream>(*this, params);
            }
            break;
        case LLM_ARCH_LLADA:
            {
                llm = std::make_unique<llm_build_llada>(*this, params);
            }
            break;
        case LLM_ARCH_QWEN2VL:
            {
                llm = std::make_unique<llm_build_qwen2vl>(*this, params);
            } break;
        case LLM_ARCH_QWEN2MOE:
            {
                llm = std::make_unique<llm_build_qwen2moe>(*this, params);
            } break;
        case LLM_ARCH_QWEN3:
            {
                llm = std::make_unique<llm_build_qwen3>(*this, params);
            } break;
        case LLM_ARCH_QWEN3MOE:
            {
                llm = std::make_unique<llm_build_qwen3moe>(*this, params);
            } break;
        case LLM_ARCH_PHI2:
            {
                llm = std::make_unique<llm_build_phi2>(*this, params);
            } break;
        case LLM_ARCH_PHI3:
        case LLM_ARCH_PHIMOE:
            {
                if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
                    llm = std::make_unique<llm_build_phi3<true>> (*this, params);
                } else {
                    llm = std::make_unique<llm_build_phi3<false>>(*this, params);
                }
            } break;
        case LLM_ARCH_PLAMO:
            {
                llm = std::make_unique<llm_build_plamo>(*this, params);
            } break;
        case LLM_ARCH_PLAMO2:
            {
                llm = std::make_unique<llm_build_plamo2>(*this, params);
            } break;
        case LLM_ARCH_GPT2:
            {
                llm = std::make_unique<llm_build_gpt2>(*this, params);
            } break;
        case LLM_ARCH_CODESHELL:
            {
                llm = std::make_unique<llm_build_codeshell>(*this, params);
            } break;
        case LLM_ARCH_ORION:
            {
                llm = std::make_unique<llm_build_orion>(*this, params);
            } break;
        case LLM_ARCH_INTERNLM2:
            {
                llm = std::make_unique<llm_build_internlm2>(*this, params);
            } break;
        case LLM_ARCH_MINICPM3:
            {
                llm = std::make_unique<llm_build_minicpm3>(*this, params);
            } break;
        case LLM_ARCH_GEMMA:
            {
                llm = std::make_unique<llm_build_gemma>(*this, params);
            } break;
        case LLM_ARCH_GEMMA2:
            {
                llm = std::make_unique<llm_build_gemma2_iswa>(*this, params);
            } break;
        case LLM_ARCH_GEMMA3:
            {
                llm = std::make_unique<llm_build_gemma3_iswa>(*this, params);
            } break;
        case LLM_ARCH_GEMMA3N:
            {
                llm = std::make_unique<llm_build_gemma3n_iswa>(*this, params);
            } break;
        case LLM_ARCH_GEMMA_EMBEDDING:
            {
                llm = std::make_unique<llm_build_gemma_embedding_iswa>(*this, params);
            } break;
        case LLM_ARCH_STARCODER2:
            {
                llm = std::make_unique<llm_build_starcoder2>(*this, params);
            } break;
        case LLM_ARCH_MAMBA:
        case LLM_ARCH_MAMBA2:
            {
                llm = std::make_unique<llm_build_mamba>(*this, params);
            } break;
        case LLM_ARCH_JAMBA:
            {
                llm = std::make_unique<llm_build_jamba>(*this, params);
            } break;
        case LLM_ARCH_XVERSE:
            {
                llm = std::make_unique<llm_build_xverse>(*this, params);
            } break;
        case LLM_ARCH_COMMAND_R:
            {
                llm = std::make_unique<llm_build_command_r>(*this, params);
            } break;
        case LLM_ARCH_COHERE2:
            {
                llm = std::make_unique<llm_build_cohere2_iswa>(*this, params);
            } break;
        case LLM_ARCH_DBRX:
            {
                llm = std::make_unique<llm_build_dbrx>(*this, params);
            } break;
        case LLM_ARCH_OLMO:
            {
                llm = std::make_unique<llm_build_olmo>(*this, params);
            } break;
        case LLM_ARCH_OLMO2:
            {
                llm = std::make_unique<llm_build_olmo2>(*this, params);
            } break;
        case LLM_ARCH_OLMOE:
            {
                llm = std::make_unique<llm_build_olmoe>(*this, params);
            } break;
        case LLM_ARCH_OPENELM:
            {
                llm = std::make_unique<llm_build_openelm>(*this, params);
            } break;
        case LLM_ARCH_GPTNEOX:
            {
                llm = std::make_unique<llm_build_gptneox>(*this, params);
            } break;
        case LLM_ARCH_ARCTIC:
            {
                llm = std::make_unique<llm_build_arctic>(*this, params);
            } break;
        case LLM_ARCH_DEEPSEEK:
            {
                llm = std::make_unique<llm_build_deepseek>(*this, params);
            } break;
        case LLM_ARCH_DEEPSEEK2:
            {
                llm = std::make_unique<llm_build_deepseek2>(*this, params);
            } break;
        case LLM_ARCH_CHATGLM:
            {
                llm = std::make_unique<llm_build_chatglm>(*this, params);
            } break;
        case LLM_ARCH_GLM4:
            {
                llm = std::make_unique<llm_build_glm4>(*this, params);
            } break;
        case LLM_ARCH_GLM4_MOE:
            {
                llm = std::make_unique<llm_build_glm4_moe>(*this, params);
            } break;
        case LLM_ARCH_BITNET:
            {
                llm = std::make_unique<llm_build_bitnet>(*this, params);
            } break;
        case LLM_ARCH_T5:
            {
                switch (params.gtype) {
                    case LLM_GRAPH_TYPE_ENCODER:
                        llm = std::make_unique<llm_build_t5_enc>(*this, params);
                        break;
                    case LLM_GRAPH_TYPE_DEFAULT:
                    case LLM_GRAPH_TYPE_DECODER:
                        llm = std::make_unique<llm_build_t5_dec>(*this, params);
                        break;
                    default:
                        GGML_ABORT("invalid graph type");
                };
            } break;
        case LLM_ARCH_T5ENCODER:
            {
                llm = std::make_unique<llm_build_t5_enc>(*this, params);
            }
            break;
        case LLM_ARCH_JAIS:
            {
                llm = std::make_unique<llm_build_jais>(*this, params);
            } break;
        case LLM_ARCH_NEMOTRON:
            {
                llm = std::make_unique<llm_build_nemotron>(*this, params);
            } break;
        case LLM_ARCH_NEMOTRON_H:
            {
                llm = std::make_unique<llm_build_nemotron_h>(*this, params);
            } break;
        case LLM_ARCH_EXAONE:
            {
                llm = std::make_unique<llm_build_exaone>(*this, params);
            } break;
        case LLM_ARCH_EXAONE4:
            {
                if (hparams.swa_type == LLAMA_SWA_TYPE_STANDARD) {
                    llm = std::make_unique<llm_build_exaone4<true>>(*this, params);
                } else {
                    llm = std::make_unique<llm_build_exaone4<false>>(*this, params);
                }
            } break;
        case LLM_ARCH_RWKV6:
            {
                llm = std::make_unique<llm_build_rwkv6>(*this, params);
            } break;
        case LLM_ARCH_RWKV6QWEN2:
            {
                llm = std::make_unique<llm_build_rwkv6qwen2>(*this, params);
            } break;
        case LLM_ARCH_RWKV7:
            {
                llm = std::make_unique<llm_build_rwkv7>(*this, params);
            } break;
        case LLM_ARCH_ARWKV7:
            {
                llm = std::make_unique<llm_build_arwkv7>(*this, params);
            } break;
        case LLM_ARCH_GRANITE:
        case LLM_ARCH_GRANITE_MOE:
        case LLM_ARCH_MINICPM:
            {
                llm = std::make_unique<llm_build_granite>(*this, params);
            } break;
        case LLM_ARCH_GRANITE_HYBRID:
            {
                llm = std::make_unique<llm_build_granite_hybrid>(*this, params);
            } break;
        case LLM_ARCH_CHAMELEON:
            {
                llm = std::make_unique<llm_build_chameleon>(*this, params);
            } break;
        case LLM_ARCH_WAVTOKENIZER_DEC:
            {
                llm = std::make_unique<llm_build_wavtokenizer_dec>(*this, params);
            } break;
        case LLM_ARCH_PLM:
            {
                llm = std::make_unique<llm_build_plm>(*this, params);
            } break;
        case LLM_ARCH_BAILINGMOE:
            {
                llm = std::make_unique<llm_build_bailingmoe>(*this, params);
            } break;
        case LLM_ARCH_SEED_OSS:
            {
                llm = std::make_unique<llm_build_seed_oss>(*this, params);
            } break;
        case LLM_ARCH_DOTS1:
            {
                llm = std::make_unique<llm_build_dots1>(*this, params);
            } break;
        case LLM_ARCH_ARCEE:
            {
                llm = std::make_unique<llm_build_arcee>(*this, params);
            } break;
        case LLM_ARCH_ERNIE4_5:
            {
                llm = std::make_unique<llm_build_ernie4_5>(*this, params);
            } break;
        case LLM_ARCH_ERNIE4_5_MOE:
            {
                llm = std::make_unique<llm_build_ernie4_5_moe>(*this, params);
            } break;
        case LLM_ARCH_HUNYUAN_MOE:
            {
                llm = std::make_unique<llm_build_hunyuan_moe>(*this, params);
            } break;
        case LLM_ARCH_HUNYUAN_DENSE:
            {
                llm = std::make_unique<llm_build_hunyuan_dense>(*this, params);
            } break;
        case LLM_ARCH_SMOLLM3:
            {
                llm = std::make_unique<llm_build_smollm3>(*this, params);
            } break;
        case LLM_ARCH_OPENAI_MOE:
            {
                llm = std::make_unique<llm_build_openai_moe_iswa>(*this, params);
            } break;
        case LLM_ARCH_FALCON_H1:
            {
                llm = std::make_unique<llm_build_falcon_h1>(*this, params);
            } break;
        case LLM_ARCH_LFM2:
            {
                llm = std::make_unique<llm_build_lfm2>(*this, params);
            } break;
        case LLM_ARCH_SMALLTHINKER:
            {
                if (hparams.swa_type == LLAMA_SWA_TYPE_STANDARD) {
                    llm = std::make_unique<llm_build_smallthinker<true>> (*this, params);
                } else {
                    llm = std::make_unique<llm_build_smallthinker<false>>(*this, params);
                }
            } break;
        default:
            GGML_ABORT("fatal error");
    }

    // add on pooling layer
    llm->build_pooling(cls, cls_b, cls_out, cls_out_b);

    return llm->res->get_gf();
}
//
// interface implementation
//

llama_model_params llama_model_default_params() {
    llama_model_params result = {
        /*.devices                     =*/ nullptr,
        /*.tensor_buft_overrides       =*/ nullptr,
        /*.n_gpu_layers                =*/ 999,
        /*.split_mode                  =*/ LLAMA_SPLIT_MODE_LAYER,
        /*.main_gpu                    =*/ 0,
        /*.tensor_split                =*/ nullptr,
        /*.progress_callback           =*/ nullptr,
        /*.progress_callback_user_data =*/ nullptr,
        /*.kv_overrides                =*/ nullptr,
        /*.vocab_only                  =*/ false,
        /*.use_mmap                    =*/ true,
        /*.use_mlock                   =*/ false,
        /*.check_tensors               =*/ false,
        /*.use_extra_bufts             =*/ true,
    };

    return result;
}
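
// Example (sketch): typical client-side use of the defaults above. The model path, the loader
// call and the tweaked fields are illustrative only, not something this file prescribes.
//
//   llama_model_params mparams = llama_model_default_params();
//   mparams.n_gpu_layers = 0;     // e.g. force CPU-only
//   llama_model * model = llama_model_load_from_file("model.gguf", mparams);
//   // ... use the model ...
//   llama_model_free(model);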
const llama_vocab * llama_model_get_vocab(const llama_model * model) {
    return &model->vocab;
}

void llama_free_model(llama_model * model) {
    llama_model_free(model);
}

void llama_model_free(llama_model * model) {
    delete model;
}
int32_t llama_model_n_ctx_train(const llama_model * model) {
    return model->hparams.n_ctx_train;
}

int32_t llama_model_n_embd(const llama_model * model) {
    return model->hparams.n_embd;
}

int32_t llama_model_n_layer(const llama_model * model) {
    return model->hparams.n_layer;
}

int32_t llama_model_n_head(const llama_model * model) {
    return model->hparams.n_head();
}

int32_t llama_model_n_head_kv(const llama_model * model) {
    return model->hparams.n_head_kv();
}

int32_t llama_model_n_swa(const llama_model * model) {
    return model->hparams.n_swa;
}

uint32_t llama_model_n_cls_out(const struct llama_model * model) {
    return model->hparams.n_cls_out;
}

const char * llama_model_cls_label(const struct llama_model * model, uint32_t i) {
    if (i < model->classifier_labels.size()) {
        return model->classifier_labels[i].c_str();
    }

    return nullptr;
}

// deprecated
int32_t llama_n_ctx_train(const llama_model * model) {
    return llama_model_n_ctx_train(model);
}

// deprecated
int32_t llama_n_embd(const llama_model * model) {
    return llama_model_n_embd(model);
}

// deprecated
int32_t llama_n_layer(const llama_model * model) {
    return llama_model_n_layer(model);
}

// deprecated
int32_t llama_n_head(const llama_model * model) {
    return llama_model_n_head(model);
}
llama_rope_type llama_model_rope_type(const llama_model * model) {
    switch (model->arch) {
        // these models do not use RoPE
        case LLM_ARCH_GPT2:
        case LLM_ARCH_GPTJ:
        case LLM_ARCH_MPT:
        case LLM_ARCH_REFACT:
        case LLM_ARCH_BLOOM:
        case LLM_ARCH_MAMBA:
        case LLM_ARCH_MAMBA2:
        case LLM_ARCH_JAMBA:
        case LLM_ARCH_JINA_BERT_V2:
        case LLM_ARCH_T5:
        case LLM_ARCH_T5ENCODER:
        case LLM_ARCH_JAIS:
        case LLM_ARCH_RWKV6:
        case LLM_ARCH_RWKV6QWEN2:
        case LLM_ARCH_RWKV7:
        case LLM_ARCH_ARWKV7:
        case LLM_ARCH_WAVTOKENIZER_DEC:
        case LLM_ARCH_NEMOTRON_H:
            return LLAMA_ROPE_TYPE_NONE;

        // use what we call a normal RoPE, operating on pairs of consecutive head values
        case LLM_ARCH_LLAMA:
        case LLM_ARCH_LLADA:
        case LLM_ARCH_LLAMA4:
        case LLM_ARCH_DECI:
        case LLM_ARCH_BAICHUAN:
        case LLM_ARCH_STARCODER:
        case LLM_ARCH_INTERNLM2:
        case LLM_ARCH_MINICPM:
        case LLM_ARCH_XVERSE:
        case LLM_ARCH_COMMAND_R:
        case LLM_ARCH_COHERE2:
        case LLM_ARCH_OLMO:
        case LLM_ARCH_ARCTIC:
        case LLM_ARCH_DEEPSEEK:
        case LLM_ARCH_DEEPSEEK2:
        case LLM_ARCH_PLM:
        case LLM_ARCH_CHATGLM:
        case LLM_ARCH_GLM4:
        case LLM_ARCH_GRANITE:
        case LLM_ARCH_GRANITE_MOE:
        case LLM_ARCH_GRANITE_HYBRID:
        case LLM_ARCH_CHAMELEON:
        case LLM_ARCH_BAILINGMOE:
        case LLM_ARCH_NEO_BERT:
        case LLM_ARCH_SMOLLM3:
        case LLM_ARCH_ARCEE:
        case LLM_ARCH_ERNIE4_5:
        case LLM_ARCH_ERNIE4_5_MOE:
            return LLAMA_ROPE_TYPE_NORM;

        // the pairs of head values are offset by n_rot/2
        case LLM_ARCH_FALCON:
        case LLM_ARCH_FALCON_H1:
        case LLM_ARCH_GROK:
        case LLM_ARCH_DBRX:
        case LLM_ARCH_BERT:
        case LLM_ARCH_JINA_BERT_V3:
        case LLM_ARCH_NOMIC_BERT:
        case LLM_ARCH_NOMIC_BERT_MOE:
        case LLM_ARCH_STABLELM:
        case LLM_ARCH_BITNET:
        case LLM_ARCH_QWEN:
        case LLM_ARCH_QWEN2:
        case LLM_ARCH_DREAM:
        case LLM_ARCH_QWEN2MOE:
        case LLM_ARCH_QWEN3:
        case LLM_ARCH_QWEN3MOE:
        case LLM_ARCH_OLMO2:
        case LLM_ARCH_OLMOE:
        case LLM_ARCH_PHI2:
        case LLM_ARCH_PHI3:
        case LLM_ARCH_PHIMOE:
        case LLM_ARCH_PLAMO:
        case LLM_ARCH_PLAMO2:
        case LLM_ARCH_GEMMA:
        case LLM_ARCH_GEMMA2:
        case LLM_ARCH_GEMMA3:
        case LLM_ARCH_GEMMA3N:
        case LLM_ARCH_GEMMA_EMBEDDING:
        case LLM_ARCH_STARCODER2:
        case LLM_ARCH_OPENELM:
        case LLM_ARCH_GPTNEOX:
        case LLM_ARCH_CODESHELL:
        case LLM_ARCH_ORION:
        case LLM_ARCH_NEMOTRON:
        case LLM_ARCH_EXAONE:
        case LLM_ARCH_EXAONE4:
        case LLM_ARCH_MINICPM3:
        case LLM_ARCH_DOTS1:
        case LLM_ARCH_HUNYUAN_MOE:
        case LLM_ARCH_OPENAI_MOE:
        case LLM_ARCH_HUNYUAN_DENSE:
        case LLM_ARCH_LFM2:
        case LLM_ARCH_SMALLTHINKER:
        case LLM_ARCH_GLM4_MOE:
        case LLM_ARCH_SEED_OSS:
            return LLAMA_ROPE_TYPE_NEOX;

        case LLM_ARCH_QWEN2VL:
            return LLAMA_ROPE_TYPE_MROPE;

        // all model arches should be listed explicitly here
        case LLM_ARCH_UNKNOWN:
            GGML_ABORT("unknown architecture");
    }

    return LLAMA_ROPE_TYPE_NONE;
}
float llama_model_rope_freq_scale_train(const llama_model * model) {
    return model->hparams.rope_freq_scale_train;
}

int32_t llama_model_meta_val_str(const llama_model * model, const char * key, char * buf, size_t buf_size) {
    const auto & it = model->gguf_kv.find(key);
    if (it == model->gguf_kv.end()) {
        if (buf_size > 0) {
            buf[0] = '\0';
        }
        return -1;
    }

    return snprintf(buf, buf_size, "%s", it->second.c_str());
}

int32_t llama_model_meta_count(const llama_model * model) {
    return (int)model->gguf_kv.size();
}

int32_t llama_model_meta_key_by_index(const llama_model * model, int i, char * buf, size_t buf_size) {
    if (i < 0 || i >= (int)model->gguf_kv.size()) {
        if (buf_size > 0) {
            buf[0] = '\0';
        }
        return -1;
    }

    auto it = model->gguf_kv.begin();
    std::advance(it, i);

    return snprintf(buf, buf_size, "%s", it->first.c_str());
}

int32_t llama_model_meta_val_str_by_index(const llama_model * model, int32_t i, char * buf, size_t buf_size) {
    if (i < 0 || i >= (int)model->gguf_kv.size()) {
        if (buf_size > 0) {
            buf[0] = '\0';
        }
        return -1;
    }

    auto it = model->gguf_kv.begin();
    std::advance(it, i);

    return snprintf(buf, buf_size, "%s", it->second.c_str());
}
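
// Example (sketch): enumerating all GGUF metadata through the three functions above from client
// code — the buffer sizes are illustrative; callers should size them for their own needs.
//
//   char key[256];
//   char val[2048];
//   const int32_t n_kv = llama_model_meta_count(model);
//   for (int32_t i = 0; i < n_kv; ++i) {
//       if (llama_model_meta_key_by_index    (model, i, key, sizeof(key)) >= 0 &&
//           llama_model_meta_val_str_by_index(model, i, val, sizeof(val)) >= 0) {
//           printf("%s = %s\n", key, val);
//       }
//   }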
int32_t llama_model_desc(const llama_model * model, char * buf, size_t buf_size) {
    return snprintf(buf, buf_size, "%s", model->desc().c_str());
}

uint64_t llama_model_size(const llama_model * model) {
    return model->size();
}

const char * llama_model_chat_template(const llama_model * model, const char * name) {
    const auto key = name ? LLM_KV(model->arch, name)(LLM_KV_TOKENIZER_CHAT_TEMPLATE)
                          : LLM_KV(model->arch)(LLM_KV_TOKENIZER_CHAT_TEMPLATE);
    const auto & it = model->gguf_kv.find(key);
    if (it == model->gguf_kv.end()) {
        // one-off fix for very popular models (so we are not flooded with issues)
        // do not extend this list unless absolutely necessary
        // Mistral-Small-2503 does not have built-in chat template
        llama_vocab_pre_type pre_type = model->vocab.get_pre_type();
        if (!name && pre_type == LLAMA_VOCAB_PRE_TYPE_TEKKEN && model->layers.size() == 40) {
            return "mistral-v7-tekken";
        }

        return nullptr;
    }

    return it->second.c_str();
}
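
// Example (sketch): fetching the default chat template (nullptr name) from client code; a null
// result means the GGUF ships no template and the caller has to supply its own.
//
//   const char * tmpl = llama_model_chat_template(model, /*name=*/ nullptr);
//   if (tmpl == nullptr) {
//       // fall back to a user-provided template
//   }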
uint64_t llama_model_n_params(const llama_model * model) {
    return model->n_elements();
}

bool llama_model_has_encoder(const llama_model * model) {
    switch (model->arch) {
        case LLM_ARCH_T5:        return true;
        case LLM_ARCH_T5ENCODER: return true;
        default:                 return false;
    }
}

bool llama_model_has_decoder(const llama_model * model) {
    switch (model->arch) {
        case LLM_ARCH_T5ENCODER: return false;
        default:                 return true;
    }
}

llama_token llama_model_decoder_start_token(const llama_model * model) {
    return model->hparams.dec_start_token_id;
}
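
// Example (sketch): how an encoder-decoder model such as T5 is typically driven with the queries
// above — this mirrors the pattern used by the examples rather than prescribing an API contract.
//
//   if (llama_model_has_encoder(model)) {
//       llama_encode(ctx, batch);  // run the encoder on the prompt first
//       llama_token dec_start = llama_model_decoder_start_token(model);
//       // seed the decoder batch with dec_start, then call llama_decode() as usual
//   }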
bool llama_model_is_recurrent(const llama_model * model) {
    return llm_arch_is_recurrent(model->arch);
}

bool llama_model_is_diffusion(const llama_model * model) {
    return llm_arch_is_diffusion(model->arch);
}

const std::vector<std::pair<std::string, ggml_tensor *>> & llama_internal_get_tensor_map(const llama_model * model) {
    return model->tensors_by_name;
}