#include "llama-model.h"

#include "llama-impl.h"
#include "llama-mmap.h"
#include "llama-batch.h"
#include "llama-cparams.h"
#include "llama-model-loader.h"
#include "llama-kv-cache.h"
#include "llama-kv-cache-iswa.h"
#include "llama-memory-hybrid.h"
#include "llama-memory-recurrent.h"
#include "ggml-cpp.h"

#include "models/llm_graph_context_mamba.h"
#include "models/llm_build_arcee.h"
#include "models/llm_build_arctic.h"
#include "models/llm_build_baichuan.h"
#include "models/llm_build_bailingmoe.h"
#include "models/llm_build_bert.h"
#include "models/llm_build_bitnet.h"
#include "models/llm_build_bloom.h"
#include "models/llm_build_chameleon.h"
#include "models/llm_build_chatglm.h"
#include "models/llm_build_codeshell.h"
#include "models/llm_build_cohere2_iswa.h"
#include "models/llm_build_command_r.h"
#include "models/llm_build_dbrx.h"
#include "models/llm_build_deci.h"
#include "models/llm_build_deepseek.h"
#include "models/llm_build_deepseek2.h"
#include "models/llm_build_dots1.h"
#include "models/llm_build_dream.h"
#include "models/llm_build_ernie4_5.h"
#include "models/llm_build_ernie4_5_moe.h"
#include "models/llm_build_exaone.h"
#include "models/llm_build_falcon.h"
#include "models/llm_build_falcon_h1.h"
#include "models/llm_build_gemma.h"
#include "models/llm_build_gemma2_iswa.h"
#include "models/llm_build_gemma3_iswa.h"
#include "models/llm_build_gemma3n_iswa.h"
#include "models/llm_build_gemma_embedding_iswa.h"
#include "models/llm_build_glm4.h"
#include "models/llm_build_glm4_moe.h"
#include "models/llm_build_gpt2.h"
#include "models/llm_build_gptneox.h"
#include "models/llm_build_granite.h"
#include "models/llm_build_granite_hybrid.h"
#include "models/llm_build_grok.h"
#include "models/llm_build_hunyuan_dense.h"
#include "models/llm_build_hunyuan_moe.h"
#include "models/llm_build_internlm2.h"
#include "models/llm_build_jais.h"
#include "models/llm_build_jamba.h"
#include "models/llm_build_lfm2.h"
#include "models/llm_build_llada.h"
#include "models/llm_build_llada_moe.h"
#include "models/llm_build_llama.h"
#include "models/llm_build_llama_iswa.h"
#include "models/llm_build_mamba.h"
#include "models/llm_build_minicpm3.h"
#include "models/llm_build_mpt.h"
#include "models/llm_build_nemotron.h"
#include "models/llm_build_nemotron_h.h"
#include "models/llm_build_neo_bert.h"
#include "models/llm_build_olmo.h"
#include "models/llm_build_olmoe.h"
#include "models/llm_build_openai_moe_iswa.h"
#include "models/llm_build_openelm.h"
#include "models/llm_build_orion.h"
#include "models/llm_build_phi2.h"
#include "models/llm_build_plamo.h"
#include "models/llm_build_plamo2.h"
#include "models/llm_build_plm.h"
#include "models/llm_build_qwen.h"
#include "models/llm_build_qwen2.h"
#include "models/llm_build_qwen2moe.h"
#include "models/llm_build_qwen2vl.h"
#include "models/llm_build_qwen3.h"
#include "models/llm_build_qwen3moe.h"
#include "models/llm_build_qwen3next.h"
#include "models/llm_build_refact.h"
#include "models/llm_build_rwkv_base.h"
#include "models/llm_build_rwkv6.h"
#include "models/llm_build_rwkv6qwen2.h"
#include "models/llm_build_rwkv7.h"
#include "models/llm_build_arwkv7.h"
#include "models/llm_build_seed_oss.h"
#include "models/llm_build_smollm3.h"
#include "models/llm_build_stablelm.h"
#include "models/llm_build_starcoder.h"
#include "models/llm_build_starcoder2.h"
#include "models/llm_build_t5_dec.h"
#include "models/llm_build_t5_enc.h"
#include "models/llm_build_wavtokenizer_dec.h"
#include "models/llm_build_xverse.h"
#include "models/llm_build_exaone4.h"
#include "models/llm_build_olmo2.h"
#include "models/llm_build_smallthinker.h"
#include "models/llm_build_phi3.h"

#include <algorithm>
#include <cassert>
#include <cmath>
#include <cfloat>
#include <cstring>
#include <functional>
#include <map>
#include <regex>
#include <sstream>
#include <stdexcept>

const char * llm_type_name(llm_type type) {
    switch (type) {
        case LLM_TYPE_14M: return "14M";
        case LLM_TYPE_17M: return "17M";
        case LLM_TYPE_22M: return "22M";
        case LLM_TYPE_33M: return "33M";
        case LLM_TYPE_60M: return "60M";
        case LLM_TYPE_70M: return "70M";
        case LLM_TYPE_80M: return "80M";
        case LLM_TYPE_109M: return "109M";
        case LLM_TYPE_137M: return "137M";
        case LLM_TYPE_140M: return "140M";
        case LLM_TYPE_160M: return "160M";
        case LLM_TYPE_190M: return "190M";
        case LLM_TYPE_220M: return "220M";
        case LLM_TYPE_250M: return "250M";
        case LLM_TYPE_256M: return "256M";
        case LLM_TYPE_270M: return "270M";
        case LLM_TYPE_335M: return "335M";
        case LLM_TYPE_350M: return "350M";
        case LLM_TYPE_360M: return "360M";
        case LLM_TYPE_410M: return "410M";
        case LLM_TYPE_450M: return "450M";
        case LLM_TYPE_475M: return "475M";
        case LLM_TYPE_558M: return "558M";
        case LLM_TYPE_700M: return "700M";
        case LLM_TYPE_770M: return "770M";
        case LLM_TYPE_780M: return "780M";
        case LLM_TYPE_950M: return "950M";
        case LLM_TYPE_0_3B: return "0.3B";
        case LLM_TYPE_0_5B: return "0.5B";
        case LLM_TYPE_0_6B: return "0.6B";
        case LLM_TYPE_1B: return "1B";
        case LLM_TYPE_1_2B: return "1.2B";
        case LLM_TYPE_1_3B: return "1.3B";
        case LLM_TYPE_1_4B: return "1.4B";
        case LLM_TYPE_1_5B: return "1.5B";
        case LLM_TYPE_1_6B: return "1.6B";
        case LLM_TYPE_1_7B: return "1.7B";
        case LLM_TYPE_1_8B: return "1.8B";
        case LLM_TYPE_2B: return "2B";
        case LLM_TYPE_2_8B: return "2.8B";
        case LLM_TYPE_2_9B: return "2.9B";
        case LLM_TYPE_3B: return "3B";
        case LLM_TYPE_4B: return "4B";
        case LLM_TYPE_6B: return "6B";
        case LLM_TYPE_6_9B: return "6.9B";
        case LLM_TYPE_7B: return "7B";
        case LLM_TYPE_8B: return "8B";
        case LLM_TYPE_9B: return "9B";
        case LLM_TYPE_11B: return "11B";
        case LLM_TYPE_12B: return "12B";
        case LLM_TYPE_13B: return "13B";
        case LLM_TYPE_14B: return "14B";
        case LLM_TYPE_15B: return "15B";
        case LLM_TYPE_16B: return "16B";
        case LLM_TYPE_20B: return "20B";
        case LLM_TYPE_27B: return "27B";
        case LLM_TYPE_30B: return "30B";
        case LLM_TYPE_32B: return "32B";
        case LLM_TYPE_34B: return "34B";
        case LLM_TYPE_35B: return "35B";
        case LLM_TYPE_36B: return "36B";
        case LLM_TYPE_40B: return "40B";
        case LLM_TYPE_65B: return "65B";
        case LLM_TYPE_70B: return "70B";
        case LLM_TYPE_120B: return "120B";
        case LLM_TYPE_142B: return "142B";
        case LLM_TYPE_236B: return "236B";
        case LLM_TYPE_290B: return "290B";
        case LLM_TYPE_314B: return "314B";
        case LLM_TYPE_405B: return "405B";
        case LLM_TYPE_671B: return "671B";
        case LLM_TYPE_SMALL: return "0.1B";
        case LLM_TYPE_MEDIUM: return "0.4B";
        case LLM_TYPE_LARGE: return "0.8B";
        case LLM_TYPE_XL: return "1.5B";
        case LLM_TYPE_A1_7B: return "A1.7B";
        case LLM_TYPE_A2_7B: return "A2.7B";
        case LLM_TYPE_8x7B: return "8x7B";
        case LLM_TYPE_8x22B: return "8x22B";
        case LLM_TYPE_16x12B: return "16x12B";
        case LLM_TYPE_16x3_8B: return "16x3.8B";
        case LLM_TYPE_10B_128x3_66B: return "10B+128x3.66B";
        case LLM_TYPE_57B_A14B: return "57B.A14B";
        case LLM_TYPE_17B_16E: return "17Bx16E (Scout)";
        case LLM_TYPE_17B_128E: return "17Bx128E (Maverick)";
        case LLM_TYPE_A13B: return "A13B";
        case LLM_TYPE_21B_A3B: return "21B.A3B";
        case LLM_TYPE_30B_A3B: return "30B.A3B";
        case LLM_TYPE_80B_A3B: return "80B.A3B";
        case LLM_TYPE_106B_A12B: return "106B.A12B";
        case LLM_TYPE_235B_A22B: return "235B.A22B";
        case LLM_TYPE_300B_A47B: return "300B.A47B";
        case LLM_TYPE_355B_A32B: return "355B.A32B";
        case LLM_TYPE_E2B: return "E2B";
        case LLM_TYPE_E4B: return "E4B";
        default: return "?B";
    }
}

static const char * llama_expert_gating_func_name(llama_expert_gating_func_type type) {
    switch (type) {
        case LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX: return "softmax";
        case LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID: return "sigmoid";
        default: return "unknown";
    }
}
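
// mapping between the rope scaling enum values and the names used for them in GGUF metadata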
static const std::map<llama_rope_scaling_type, const char *> LLAMA_ROPE_SCALING_TYPES = {
    { LLAMA_ROPE_SCALING_TYPE_NONE,     "none"     },
    { LLAMA_ROPE_SCALING_TYPE_LINEAR,   "linear"   },
    { LLAMA_ROPE_SCALING_TYPE_YARN,     "yarn"     },
    { LLAMA_ROPE_SCALING_TYPE_LONGROPE, "longrope" },
};

std::string llama_rope_scaling_type_name(llama_rope_scaling_type rope_scaling_type) {
    return LLAMA_ROPE_SCALING_TYPES.at(rope_scaling_type);
}

static llama_rope_scaling_type llama_rope_scaling_type_from_string(const std::string & name) {
    for (const auto & kv : LLAMA_ROPE_SCALING_TYPES) {
        if (kv.second == name) {
            return (llama_rope_scaling_type) kv.first;
        }
    }

    return LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
}

// checks if the weight tensor can be used with the specified buffer type and device
static bool weight_buft_supported(const llama_hparams & hparams, ggml_tensor * w, ggml_op op, ggml_backend_buffer_type_t buft, ggml_backend_dev_t dev) {
    GGML_ASSERT(w != nullptr);
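
    // the check builds a dummy tensor for the op in a metadata-only (no_alloc) context,
    // temporarily assigns the weight a zero-size buffer of the candidate type, and then
    // asks the device whether it supports the resulting op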
    if (op == GGML_OP_NONE) {
        return true;
    }

    ggml_init_params params = {
        /*.mem_size   =*/ ggml_tensor_overhead()*8,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ true,
    };

    ggml_context_ptr ctx_ptr { ggml_init(params) };
    if (!ctx_ptr) {
        throw std::runtime_error(format("failed to create ggml context"));
    }
    ggml_context * ctx = ctx_ptr.get();

    ggml_tensor * op_tensor = nullptr;

    switch (op) {
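        // the sizes used below (e.g. 512 tokens, 3 sequences) are arbitrary placeholders:
        // the dummy op is only inspected by supports_op and is never actually computed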
        case GGML_OP_GET_ROWS:
            {
                ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 512);
                op_tensor = ggml_get_rows(ctx, w, b);
            } break;
        case GGML_OP_MUL_MAT:
            {
                ggml_tensor * b = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], 512, w->ne[2], w->ne[3]);
                op_tensor = ggml_mul_mat(ctx, w, b);
            } break;
        case GGML_OP_MUL_MAT_ID:
            {
                int n_expert_used = hparams.n_expert_used;
                ggml_tensor * b = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, w->ne[0], n_expert_used, 512);
                ggml_tensor * ids = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, n_expert_used, 512);
                op_tensor = ggml_mul_mat_id(ctx, w, b, ids);
            } break;
        case GGML_OP_ADD:
            {
                ggml_tensor * a = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], w->ne[1], w->ne[2], w->ne[3]);
                op_tensor = ggml_add(ctx, a, w);
            } break;
        case GGML_OP_ADD_ID:
            {
                int n_expert_used = hparams.n_expert_used;
                ggml_tensor * a = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, w->ne[0], n_expert_used, 512);
                ggml_tensor * c = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, n_expert_used, 512);
                op_tensor = ggml_add_id(ctx, a, w, c);
            } break;
        case GGML_OP_MUL:
            {
                ggml_tensor * a = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], w->ne[1], w->ne[2], w->ne[3]);
                op_tensor = ggml_mul(ctx, a, w);
            } break;
        case GGML_OP_DIV:
            {
                ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, w->ne[0]);
                op_tensor = ggml_div(ctx, a, w);
            } break;
        case GGML_OP_ROPE:
            {
                int n_embd_head = hparams.n_embd_head_v;
                int n_head = hparams.n_head();
                ggml_tensor * a = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_embd_head, n_head, 512);
                ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 512);
                op_tensor = ggml_rope_ext(
                    ctx, a, b, w,
                    0, 0, 0, 0, 0,
                    0, 0, 0, 0
                );
            } break;
        case GGML_OP_SSM_CONV:
            {
                const int64_t n_seq_tokens = 512;
                const int64_t n_seqs = 3;
                ggml_tensor * conv_x = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, w->ne[0] - 1 + n_seq_tokens, w->ne[1], n_seqs);
                op_tensor = ggml_ssm_conv(ctx, conv_x, w);
            } break;
        case GGML_OP_SSM_SCAN:
            {
                // w is ssm_a, which is used to distinguish Mamba-1 and Mamba-2
                const int64_t d_state = w->ne[0] == 1 ? hparams.ssm_d_state : w->ne[0];
                const int64_t n_head = w->ne[1];
                const int64_t head_dim = hparams.ssm_d_inner / n_head;
                const int64_t n_group = hparams.ssm_n_group ? hparams.ssm_n_group : 1;
                const int64_t n_seq_tokens = 512;
                const int64_t n_seqs = 3;
                ggml_tensor * s = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, d_state, head_dim, n_head, n_seqs);
                ggml_tensor * x = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, head_dim, n_head, n_seq_tokens, n_seqs);
                ggml_tensor * dt = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_head, n_seq_tokens, n_seqs);
                ggml_tensor * B = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, d_state, n_group, n_seq_tokens, n_seqs);
                ggml_tensor * C = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, d_state, n_group, n_seq_tokens, n_seqs);
                ggml_tensor * ids = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_seqs);
                op_tensor = ggml_ssm_scan(ctx, s, x, dt, w, B, C, ids);
            } break;
        case GGML_OP_RWKV_WKV6:
            {
                // FIXME
                const int64_t S = 123;
                const int64_t H = 123;
                const int64_t n_tokens = 123;
                const int64_t n_seqs = 123;
                ggml_tensor * k = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens);
                ggml_tensor * v = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens);
                ggml_tensor * r = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens);
                ggml_tensor * tf = w;
                ggml_tensor * td = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens);
                ggml_tensor * state = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, S, n_seqs, S, H);
                op_tensor = ggml_rwkv_wkv6(ctx, k, v, r, tf, td, state);
            } break;
        case GGML_OP_IM2COL:
            {
                const int n_embd = hparams.n_embd;
                ggml_tensor * b = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, n_embd, w->ne[1], 1, 1);
                op_tensor = ggml_im2col(ctx, w, b, 1, 0, 0, 0, 1, 0, false, GGML_TYPE_F16);
            } break;
        case GGML_OP_SCALE:
            {
                op_tensor = ggml_scale(ctx, w, 1.0f);
            } break;
        default:
            GGML_ABORT("%s: missing test for op %s for tensor %s", __func__, ggml_op_name(op), w->name);
    }

    // create a temporary dummy buffer for the weight so that supports_op can check the buffer type
    GGML_ASSERT(w->buffer == nullptr);
    w->buffer = ggml_backend_buft_alloc_buffer(buft, 0);
    bool op_supported = ggml_backend_dev_supports_op(dev, op_tensor);
    ggml_backend_buffer_free(w->buffer);
    w->buffer = nullptr;

    return op_supported;
}

// lists of buffer types used for each layer
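// entries are ordered by preference: the first (device, buffer type) pair that supports a given weight is used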
using buft_list_t = std::vector<std::pair<ggml_backend_dev_t, ggml_backend_buffer_type_t>>;

// find the first buffer type in the list that can use the tensor
static ggml_backend_buffer_type_t select_weight_buft(const llama_hparams & hparams, ggml_tensor * tensor, ggml_op op, const buft_list_t & buft_list) {
    GGML_ASSERT(!buft_list.empty());
    for (const auto & cur : buft_list) {
        ggml_backend_dev_t cur_dev = cur.first;
        ggml_backend_buffer_type_t cur_buft = cur.second;
        if (weight_buft_supported(hparams, tensor, op, cur_buft, cur_dev)) {
            return cur_buft;
        }
    }

    return nullptr;
}

// CPU: ACCEL -> GPU host -> CPU extra -> CPU
static buft_list_t make_cpu_buft_list(const std::vector<ggml_backend_dev_t> & devices, bool use_extra_bufts) {
    buft_list_t buft_list;

    // add ACCEL buffer types
    for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
        ggml_backend_dev_t dev = ggml_backend_dev_get(i);
        if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_ACCEL) {
            auto * buft = ggml_backend_dev_buffer_type(dev);
            // skip
            if (buft != ggml_backend_cpu_buffer_type()) {
                buft_list.emplace_back(dev, buft);
            }
        }
    }

    // add a host buffer type
    // storing the tensors in a host buffer is useful when the processing of large batches
    // is offloaded to a GPU device, since it reduces the time spent on data transfers
    // generally, this will be done using the first device in the list
    // a better approach would be to handle this on a weight-by-weight basis using the offload_op
    // function of the device to determine if it would benefit from being stored in a host buffer
    for (auto * dev : devices) {
        ggml_backend_buffer_type_t buft = ggml_backend_dev_host_buffer_type(dev);
        if (buft) {
            buft_list.emplace_back(dev, buft);
            break;
        }
    }

    // add extra buffer types
    if (use_extra_bufts) {
        auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
        if (cpu_dev == nullptr) {
            throw std::runtime_error(format("%s: no CPU backend found", __func__));
        }

        auto * cpu_reg = ggml_backend_dev_backend_reg(cpu_dev);
        auto ggml_backend_dev_get_extra_bufts_fn = (ggml_backend_dev_get_extra_bufts_t)
            ggml_backend_reg_get_proc_address(cpu_reg, "ggml_backend_dev_get_extra_bufts");
        if (ggml_backend_dev_get_extra_bufts_fn) {
            ggml_backend_buffer_type_t * extra_bufts = ggml_backend_dev_get_extra_bufts_fn(cpu_dev);
            while (extra_bufts && *extra_bufts) {
                buft_list.emplace_back(cpu_dev, *extra_bufts);
                ++extra_bufts;
            }
        }
    }

    // add the CPU buffer type
    for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
        ggml_backend_dev_t dev = ggml_backend_dev_get(i);
        if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_CPU) {
            buft_list.emplace_back(dev, ggml_backend_dev_buffer_type(dev));
        }
    }

    return buft_list;
}

// GPU: split if LLAMA_SPLIT_MODE_ROW -> GPU
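// in row split mode, weight matrices are partitioned across the devices of the backend according to
// tensor_split, using the backend's split buffer type when it provides one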
static buft_list_t make_gpu_buft_list(ggml_backend_dev_t dev, llama_split_mode split_mode, const float * tensor_split) {
    buft_list_t buft_list;

    // add the device split buffer type if requested and available
    if (split_mode == LLAMA_SPLIT_MODE_ROW) {
        ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev);
        auto ggml_backend_split_buffer_type_fn = (ggml_backend_split_buffer_type_t)
            ggml_backend_reg_get_proc_address(reg, "ggml_backend_split_buffer_type");
        if (ggml_backend_split_buffer_type_fn) {
            size_t dev_index = [&]() {
                auto * reg = ggml_backend_dev_backend_reg(dev);
                for (size_t i = 0; i < ggml_backend_reg_dev_count(reg); ++i) {
                    if (ggml_backend_reg_dev_get(reg, i) == dev) {
                        return i;
                    }
                }
                throw std::runtime_error(format("device %s not found in its backend reg", ggml_backend_dev_name(dev)));
            }();
            auto * buft = ggml_backend_split_buffer_type_fn(dev_index, tensor_split);
            if (buft != nullptr) {
                buft_list.emplace_back(dev, buft);
            }
        }
    }

    // add the device default buffer type
    buft_list.emplace_back(dev, ggml_backend_dev_buffer_type(dev));

    return buft_list;
}
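
// private implementation of llama_model: owns the tensor-metadata contexts, the backend buffers
// holding the tensor data, the memory mappings/locks, and the per-layer device assignments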
struct llama_model::impl {
    impl() {}
    ~impl() {}

    uint64_t n_elements = 0;
    size_t n_bytes = 0;

    std::string desc_str;

    // model memory mapped files
    llama_mmaps mappings;

    // objects representing data potentially being locked in memory
    llama_mlocks mlock_bufs;
    llama_mlocks mlock_mmaps;

    // contexts where the model tensors metadata is stored
    std::vector<ggml_context_ptr> ctxs;

    // the model memory buffers for the tensor data
    std::vector<ggml_backend_buffer_ptr> bufs;

    buft_list_t cpu_buft_list;
    std::map<ggml_backend_dev_t, buft_list_t> gpu_buft_list;

    struct layer_dev {
        ggml_backend_dev_t dev;
        buft_list_t * buft_list;
    };

    layer_dev dev_input = {};
    layer_dev dev_output = {};
    std::vector<layer_dev> dev_layer;

    bool has_tensor_overrides;
};

llama_model::llama_model(const llama_model_params & params) : params(params), pimpl(std::make_unique<impl>()) {
    pimpl->has_tensor_overrides = params.tensor_buft_overrides && params.tensor_buft_overrides[0].pattern;
}

llama_model::~llama_model() {}

void llama_model::load_stats(llama_model_loader & ml) {
    pimpl->n_elements = ml.n_elements;
    pimpl->n_bytes = ml.n_bytes;
}

void llama_model::load_arch(llama_model_loader & ml) {
    arch = ml.get_arch();
    if (arch == LLM_ARCH_UNKNOWN) {
        throw std::runtime_error("unknown model architecture: '" + ml.get_arch_name() + "'");
    }
}

void llama_model::load_hparams(llama_model_loader & ml) {
    const gguf_context * ctx = ml.meta.get();

    // get metadata as string
    for (int i = 0; i < gguf_get_n_kv(ctx); i++) {
        gguf_type type = gguf_get_kv_type(ctx, i);
        if (type == GGUF_TYPE_ARRAY) {
            continue;
        }
        const char * name = gguf_get_key(ctx, i);
        const std::string value = gguf_kv_to_str(ctx, i);
        gguf_kv.emplace(name, value);
    }

    // get general kv
    ml.get_key(LLM_KV_GENERAL_NAME, name, false);

    // everything past this point is not vocab-related
    if (hparams.vocab_only) {
        return;
    }

    ml.get_key(LLM_KV_CONTEXT_LENGTH, hparams.n_ctx_train);
    ml.get_key(LLM_KV_EMBEDDING_LENGTH, hparams.n_embd);
    ml.get_key(LLM_KV_BLOCK_COUNT, hparams.n_layer);
    ml.get_key(LLM_KV_EXPERT_COUNT, hparams.n_expert, false);
    ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used, false);

    if (arch == LLM_ARCH_WAVTOKENIZER_DEC) {
        ml.get_key(LLM_KV_FEATURES_LENGTH, hparams.n_embd_features);

        ml.get_key(LLM_KV_POSNET_EMBEDDING_LENGTH, hparams.posnet.n_embd);
        ml.get_key(LLM_KV_POSNET_BLOCK_COUNT,      hparams.posnet.n_layer);

        ml.get_key(LLM_KV_CONVNEXT_EMBEDDING_LENGTH, hparams.convnext.n_embd);
        ml.get_key(LLM_KV_CONVNEXT_BLOCK_COUNT,      hparams.convnext.n_layer);
    }

    GGML_ASSERT(hparams.n_expert <= LLAMA_MAX_EXPERTS);
    GGML_ASSERT(hparams.n_expert_used <= hparams.n_expert);
    if (hparams.n_expert > 0) {
        GGML_ASSERT(hparams.n_expert_used > 0);
    } else {
        GGML_ASSERT(hparams.n_expert_used == 0);
    }

    std::fill(hparams.n_head_arr.begin(),    hparams.n_head_arr.end(),    0);
    std::fill(hparams.n_head_kv_arr.begin(), hparams.n_head_kv_arr.end(), 0);
    std::fill(hparams.n_ff_arr.begin(),      hparams.n_ff_arr.end(),      0);
    std::fill(
        hparams.recurrent_layer_arr.begin(),
        hparams.recurrent_layer_arr.end(),
        llm_arch_is_recurrent(ml.get_arch()));

    std::fill(hparams.rope_sections.begin(), hparams.rope_sections.end(), 0);
    std::fill(hparams.swa_layers.begin(), hparams.swa_layers.end(), 0);

    ml.get_key_or_arr(LLM_KV_FEED_FORWARD_LENGTH,  hparams.n_ff_arr,   hparams.n_layer, false);
    ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head_arr, hparams.n_layer, false);

    // n_head_kv is optional, default to n_head
    hparams.n_head_kv_arr = hparams.n_head_arr;
    ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT_KV, hparams.n_head_kv_arr, hparams.n_layer, false);

    bool rope_finetuned = false;
    ml.get_key(LLM_KV_ROPE_SCALING_FINETUNED, rope_finetuned, false);
    hparams.rope_finetuned = rope_finetuned;

    hparams.n_ctx_orig_yarn = hparams.n_ctx_train;
    ml.get_key(LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, hparams.n_ctx_orig_yarn, false);

    // rope_freq_base (optional)
    hparams.rope_freq_base_train = 10000.0f;
    ml.get_key(LLM_KV_ROPE_FREQ_BASE, hparams.rope_freq_base_train, false);

    std::string rope_scaling("linear");
    ml.get_key(LLM_KV_ROPE_SCALING_TYPE, rope_scaling, false);
    hparams.rope_scaling_type_train = llama_rope_scaling_type_from_string(rope_scaling);
    GGML_ASSERT(hparams.rope_scaling_type_train != LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED);

    // rope_freq_scale (inverse of the kv) is optional
    float ropescale = 0.0f;
    if (!ml.get_key(LLM_KV_ROPE_SCALING_FACTOR, ropescale, false)) {
        // try the old key name
        ml.get_key(LLM_KV_ROPE_SCALE_LINEAR, ropescale, false);
    }
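    // the GGUF key stores the context scaling factor; freq_scale is its reciprocal
    // (e.g. a factor of 4.0 gives freq_scale 0.25), and 0.0 means no scaling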
    hparams.rope_freq_scale_train = ropescale == 0.0f ? 1.0f : 1.0f/ropescale;

    // by default assume that the sliding-window layers use the same scaling type as the non-sliding-window layers
    hparams.rope_freq_base_train_swa  = hparams.rope_freq_base_train;
    hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;

    ml.get_key(LLM_KV_ROPE_SCALING_ATTN_FACTOR, hparams.rope_attn_factor, false);

    // non-transformer models do not have attention heads
    if (hparams.n_head() > 0) {
        // gpt-neox n_rot = rotary_pct * (n_embd / n_head)
        // gpt-j n_rot = rotary_dim

        hparams.n_embd_head_k = hparams.n_embd / hparams.n_head();
        ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH, hparams.n_embd_head_k, false);

        hparams.n_embd_head_v = hparams.n_embd / hparams.n_head();
        ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH, hparams.n_embd_head_v, false);

        // sanity check for n_rot (optional)
        hparams.n_rot = hparams.n_embd_head_k;
        ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot, false);

        if (arch == LLM_ARCH_LLAMA || arch == LLM_ARCH_DECI || arch == LLM_ARCH_FALCON) {
            if (hparams.n_rot != hparams.n_embd_head_k) {
                throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd_head_k));
            }
        }
    } else {
        hparams.n_rot = 0;
        hparams.n_embd_head_k = 0;
        hparams.n_embd_head_v = 0;
    }

    // for differentiating model types
    uint32_t n_vocab = 0;
    ml.get_key(LLM_KV_VOCAB_SIZE, n_vocab, false) || ml.get_arr_n(LLM_KV_TOKENIZER_LIST, n_vocab, false);

    // for classifier models
    ml.get_arr(LLM_KV_CLASSIFIER_OUTPUT_LABELS, classifier_labels, false);
    if (!classifier_labels.empty()) {
        hparams.n_cls_out = classifier_labels.size();
    }

    // arch-specific KVs
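    // each architecture reads its specific keys and infers the model size label (type), in most cases from the layer count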
  602. switch (arch) {
  603. case LLM_ARCH_LLAMA:
  604. {
  605. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  606. if (hparams.n_expert == 8) {
  607. switch (hparams.n_layer) {
  608. case 32: type = LLM_TYPE_8x7B; break;
  609. case 56: type = LLM_TYPE_8x22B; break;
  610. default: type = LLM_TYPE_UNKNOWN;
  611. }
  612. } else {
  613. switch (hparams.n_layer) {
  614. case 16: type = LLM_TYPE_1B; break; // Llama 3.2 1B
  615. case 22: type = LLM_TYPE_1B; break;
  616. case 26: type = LLM_TYPE_3B; break;
  617. case 28: type = LLM_TYPE_3B; break; // Llama 3.2 3B
  618. case 30: type = LLM_TYPE_256M; break; // smoldocling 256M
  619. // granite uses a vocab with len 49152
  620. case 32: type = n_vocab == 49152 ? LLM_TYPE_3B : (n_vocab < 40000 ? LLM_TYPE_7B : LLM_TYPE_8B); break;
  621. case 36: type = LLM_TYPE_8B; break; // granite
  622. case 40: type = LLM_TYPE_13B; break;
  623. case 48: type = LLM_TYPE_34B; break;
  624. case 60: type = LLM_TYPE_30B; break;
  625. case 80: type = hparams.n_head() == hparams.n_head_kv() ? LLM_TYPE_65B : LLM_TYPE_70B; break;
  626. default: type = LLM_TYPE_UNKNOWN;
  627. }
  628. }
  629. } break;
  630. case LLM_ARCH_LLAMA4:
  631. {
  632. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  633. ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
  634. ml.get_key(LLM_KV_INTERLEAVE_MOE_LAYER_STEP, hparams.n_moe_layer_step);
  635. const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
  636. if (found_swa && hparams.n_swa == 0) {
  637. hparams.swa_type = LLAMA_SWA_TYPE_NONE;
  638. hparams.n_no_rope_layer_step = hparams.n_layer; // always use rope
  639. } else {
  640. hparams.swa_type = LLAMA_SWA_TYPE_CHUNKED;
  641. hparams.n_swa = 8192;
  642. hparams.set_swa_pattern(4); // pattern: 3 chunked - 1 full
  643. }
  644. switch (hparams.n_expert) {
  645. case 0: {
  646. // MobileLLM (no MoE)
  647. switch (hparams.n_embd) {
  648. case 2048: type = LLM_TYPE_140M; break;
  649. case 4096: type = LLM_TYPE_360M; break;
  650. case 6144: type = LLM_TYPE_950M; break;
  651. default: type = LLM_TYPE_UNKNOWN;
  652. }
  653. } break;
  654. case 16: type = LLM_TYPE_17B_16E; break;
  655. case 128: type = LLM_TYPE_17B_128E; break;
  656. default: type = LLM_TYPE_UNKNOWN;
  657. }
  658. hparams.use_kq_norm = type != LLM_TYPE_17B_128E;
  659. } break;
  660. case LLM_ARCH_ARCEE:
  661. {
  662. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  663. // Arcee uses the same structure as Llama
  664. switch (hparams.n_layer) {
  665. case 36: type = LLM_TYPE_4B; break;
  666. default: type = LLM_TYPE_UNKNOWN;
  667. }
  668. } break;
  669. case LLM_ARCH_DECI:
  670. {
  671. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  672. switch (hparams.n_layer) {
  673. case 32: type = LLM_TYPE_7B; break;
  674. case 80: type = LLM_TYPE_70B; break;
  675. case 162: type = LLM_TYPE_405B; break;
  676. default: type = LLM_TYPE_UNKNOWN;
  677. }
  678. } break;
  679. case LLM_ARCH_MINICPM:
  680. {
  681. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  682. ml.get_key(LLM_KV_EMBEDDING_SCALE, hparams.f_embedding_scale);
  683. ml.get_key(LLM_KV_RESIDUAL_SCALE, hparams.f_residual_scale);
  684. ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
685. // MiniCPM uses rope by default, unlike Granite, which uses rope_finetuned as a switch
  686. hparams.rope_finetuned = true;
  687. switch (hparams.n_layer) {
  688. case 52: type = LLM_TYPE_1B; break;
  689. case 40: type = LLM_TYPE_2B; break;
  690. default: type = LLM_TYPE_UNKNOWN;
  691. }
  692. } break;
  693. case LLM_ARCH_MINICPM3:
  694. {
  695. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  696. ml.get_key(LLM_KV_ATTENTION_Q_LORA_RANK, hparams.n_lora_q);
  697. ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK, hparams.n_lora_kv);
  698. switch (hparams.n_layer) {
  699. case 62: type = LLM_TYPE_4B; break;
  700. default: type = LLM_TYPE_UNKNOWN;
  701. }
  702. } break;
  703. case LLM_ARCH_GROK:
  704. {
  705. // defaults for old GGUFs
  706. hparams.yarn_beta_fast = 8.0f;
  707. hparams.f_logit_scale = 0.5773502691896257f;
  708. hparams.f_embedding_scale = 78.38367176906169f;
  709. hparams.f_attn_out_scale = 0.08838834764831845f;
  710. hparams.f_attn_logit_softcapping = 30.0f;
  711. hparams.f_router_logit_softcapping = 30.0f;
  712. // no final_logit_softcapping in grok-1
  713. hparams.f_final_logit_softcapping = 0.0f;
  714. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  715. ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
  716. ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale, false);
  717. ml.get_key(LLM_KV_EMBEDDING_SCALE, hparams.f_embedding_scale, false);
  718. ml.get_key(LLM_KV_ATTENTION_OUTPUT_SCALE, hparams.f_attn_out_scale, false);
  719. ml.get_key(LLM_KV_ATTN_LOGIT_SOFTCAPPING, hparams.f_attn_logit_softcapping, false);
  720. ml.get_key(LLM_KV_ROUTER_LOGIT_SOFTCAPPING, hparams.f_router_logit_softcapping, false);
  721. ml.get_key(LLM_KV_FINAL_LOGIT_SOFTCAPPING, hparams.f_final_logit_softcapping, false);
  722. ml.get_key(LLM_KV_ATTENTION_TEMPERATURE_LENGTH, hparams.attn_temp_length, false);
  723. ml.get_key(LLM_KV_ROPE_SCALING_YARN_EXT_FACTOR, hparams.yarn_ext_factor, false);
  724. ml.get_key(LLM_KV_ROPE_SCALING_YARN_ATTN_FACTOR, hparams.yarn_attn_factor, false);
  725. ml.get_key(LLM_KV_ROPE_SCALING_YARN_BETA_FAST, hparams.yarn_beta_fast, false);
  726. ml.get_key(LLM_KV_ROPE_SCALING_YARN_BETA_SLOW, hparams.yarn_beta_slow, false);
  727. switch (hparams.n_layer) {
  728. case 64: type = LLM_TYPE_314B; break;
  729. default: type = LLM_TYPE_UNKNOWN;
  730. }
  731. } break;
  732. case LLM_ARCH_FALCON:
  733. {
  734. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  735. switch (hparams.n_layer) {
  736. case 32: type = LLM_TYPE_7B; break;
  737. case 60: type = LLM_TYPE_40B; break;
  738. default: type = LLM_TYPE_UNKNOWN;
  739. }
  740. } break;
  741. case LLM_ARCH_BAICHUAN:
  742. {
  743. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  744. switch (hparams.n_layer) {
  745. case 32: type = LLM_TYPE_7B; break;
  746. case 40: type = LLM_TYPE_13B; break;
  747. default: type = LLM_TYPE_UNKNOWN;
  748. }
  749. if (type == LLM_TYPE_13B) {
  750. // TODO: become GGUF KV parameter
  751. hparams.f_max_alibi_bias = 8.0f;
  752. }
  753. } break;
  754. case LLM_ARCH_STARCODER:
  755. {
  756. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  757. switch (hparams.n_layer) {
  758. case 24: type = LLM_TYPE_1B; break;
  759. case 36: type = LLM_TYPE_3B; break;
  760. case 42: type = LLM_TYPE_7B; break;
  761. case 40: type = LLM_TYPE_15B; break;
  762. default: type = LLM_TYPE_UNKNOWN;
  763. }
  764. } break;
  765. case LLM_ARCH_REFACT:
  766. {
  767. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  768. switch (hparams.n_layer) {
  769. case 32: type = LLM_TYPE_1B; break;
  770. default: type = LLM_TYPE_UNKNOWN;
  771. }
  772. // TODO: become GGUF KV parameter
  773. hparams.f_max_alibi_bias = 8.0f;
  774. } break;
  775. case LLM_ARCH_BERT:
  776. {
  777. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  778. ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
  779. ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
  780. switch (hparams.n_layer) {
  781. case 3:
  782. type = LLM_TYPE_17M; break; // bge-micro
  783. case 6:
  784. type = LLM_TYPE_22M; break; // MiniLM-L6
  785. case 12:
  786. switch (hparams.n_embd) {
  787. case 384: type = LLM_TYPE_33M; break; // MiniLM-L12, bge-small
  788. case 768: type = LLM_TYPE_109M; break; // bge-base
  789. default: type = LLM_TYPE_UNKNOWN;
  790. } break;
  791. case 24:
  792. type = LLM_TYPE_335M; break; // bge-large
  793. default: type = LLM_TYPE_UNKNOWN;
  794. }
  795. } break;
  796. case LLM_ARCH_JINA_BERT_V2:
  797. {
  798. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  799. ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
  800. ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
  801. hparams.f_max_alibi_bias = 8.0f;
  802. switch (hparams.n_layer) {
  803. case 4: type = LLM_TYPE_33M; break; // jina-embeddings-small
  804. case 12: type = LLM_TYPE_137M; break; // jina-embeddings-base
  805. default: type = LLM_TYPE_UNKNOWN;
  806. }
  807. } break;
  808. case LLM_ARCH_JINA_BERT_V3:
  809. {
  810. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  811. ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
  812. ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
  813. switch (hparams.n_layer) {
  814. case 24:
  815. type = LLM_TYPE_558M; break;
  816. default: type = LLM_TYPE_UNKNOWN;
  817. }
  818. } break;
  819. case LLM_ARCH_NOMIC_BERT:
  820. case LLM_ARCH_NOMIC_BERT_MOE:
  821. {
  822. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  823. ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
  824. ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
  825. ml.get_key(LLM_KV_MOE_EVERY_N_LAYERS, hparams.moe_every_n_layers, 0);
  826. if (hparams.n_layer == 12 && hparams.n_embd == 768) {
  827. if (arch == LLM_ARCH_NOMIC_BERT) {
  828. type = LLM_TYPE_137M;
  829. } else if (arch == LLM_ARCH_NOMIC_BERT_MOE && hparams.moe_every_n_layers == 2) {
  830. type = LLM_TYPE_475M;
  831. }
  832. }
  833. } break;
  834. case LLM_ARCH_NEO_BERT:
  835. {
  836. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  837. ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
  838. ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
  839. if (hparams.n_layer == 28) {
  840. type = LLM_TYPE_250M;
  841. }
  842. } break;
  843. case LLM_ARCH_BLOOM:
  844. {
  845. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  846. switch (hparams.n_layer) {
  847. case 24: type = LLM_TYPE_1B; break;
  848. case 30:
  849. switch (hparams.n_embd) {
  850. case 2560: type = LLM_TYPE_3B; break;
  851. case 4096: type = LLM_TYPE_7B; break;
  852. default: type = LLM_TYPE_UNKNOWN;
  853. } break;
  854. default: type = LLM_TYPE_UNKNOWN;
  855. }
  856. // TODO: become GGUF KV parameter
  857. hparams.f_max_alibi_bias = 8.0f;
  858. } break;
  859. case LLM_ARCH_MPT:
  860. {
  861. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  862. ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV, hparams.f_clamp_kqv, false);
  863. ml.get_key(LLM_KV_ATTENTION_MAX_ALIBI_BIAS, hparams.f_max_alibi_bias);
  864. switch (hparams.n_layer) {
  865. case 32: type = LLM_TYPE_7B; break;
  866. case 48: type = LLM_TYPE_30B; break;
  867. default: type = LLM_TYPE_UNKNOWN;
  868. }
  869. } break;
  870. case LLM_ARCH_STABLELM:
  871. {
  872. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  873. switch (hparams.n_layer) {
  874. case 24: type = LLM_TYPE_1B; break;
  875. case 32: type = LLM_TYPE_3B; break;
  876. case 40: type = LLM_TYPE_12B; break;
  877. default: type = LLM_TYPE_UNKNOWN;
  878. }
  879. } break;
  880. case LLM_ARCH_QWEN:
  881. {
  882. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  883. switch (hparams.n_layer) {
  884. case 32: type = LLM_TYPE_7B; break;
  885. case 40: type = LLM_TYPE_13B; break;
  886. default: type = LLM_TYPE_UNKNOWN;
  887. }
  888. } break;
  889. case LLM_ARCH_QWEN2VL:
  890. {
  891. ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, true);
  892. }
  893. // fall through
  894. case LLM_ARCH_QWEN2:
  895. {
  896. ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
  897. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  898. switch (hparams.n_layer) {
  899. case 24: type = hparams.n_embd == 1024 ? LLM_TYPE_0_5B : LLM_TYPE_1B; break;
  900. case 28: type = hparams.n_embd == 1536 ? LLM_TYPE_1_5B : LLM_TYPE_7B; break;
  901. case 32: type = LLM_TYPE_7B; break;
  902. case 36: type = LLM_TYPE_3B; break;
  903. case 40: type = hparams.n_head() == 20 ? LLM_TYPE_4B : LLM_TYPE_13B; break;
  904. case 48: type = LLM_TYPE_14B; break;
  905. case 64: type = LLM_TYPE_32B; break;
  906. case 80: type = LLM_TYPE_70B; break;
  907. default: type = LLM_TYPE_UNKNOWN;
  908. }
  909. } break;
  910. case LLM_ARCH_DREAM:
  911. {
  912. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  913. // Dream models are primarily 7B with 28 layers
  914. switch (hparams.n_layer) {
  915. case 28:
  916. type = LLM_TYPE_7B;
  917. break;
  918. default:
  919. type = LLM_TYPE_UNKNOWN;
  920. }
  921. // Set non-causal attention for diffusion models
  922. hparams.causal_attn = false;
  923. }
  924. break;
  925. case LLM_ARCH_LLADA:
  926. {
  927. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
928. // LLaDA-8B has 32 layers; it follows the LLaMA layout but is a diffusion model
  929. switch (hparams.n_layer) {
  930. case 32:
  931. type = LLM_TYPE_8B;
  932. break;
  933. default:
  934. type = LLM_TYPE_UNKNOWN;
  935. }
  936. // Set non-causal attention for diffusion models
  937. hparams.causal_attn = false;
  938. }
  939. break;
  940. case LLM_ARCH_LLADA_MOE:
  941. {
  942. ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
  943. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  944. // diffusion language model uses non-causal attention
  945. hparams.causal_attn = false;
  946. switch (hparams.n_layer) {
  947. case 16: type = LLM_TYPE_A1_7B; break;
  948. default: type = LLM_TYPE_UNKNOWN;
  949. }
  950. } break;
  951. case LLM_ARCH_QWEN2MOE:
  952. {
  953. ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
  954. ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false);
  955. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  956. switch (hparams.n_layer) {
  957. case 24: type = LLM_TYPE_A2_7B; break;
  958. case 28: type = LLM_TYPE_57B_A14B; break;
  959. default: type = LLM_TYPE_UNKNOWN;
  960. }
  961. } break;
  962. case LLM_ARCH_QWEN3:
  963. {
  964. ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
  965. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  966. switch (hparams.n_layer) {
  967. case 28: type = hparams.n_embd == 1024 ? LLM_TYPE_0_6B : LLM_TYPE_1_7B; break;
  968. case 36: type = hparams.n_embd == 2560 ? LLM_TYPE_4B : LLM_TYPE_8B; break;
  969. case 40: type = LLM_TYPE_14B; break;
  970. case 64: type = LLM_TYPE_32B; break;
  971. default: type = LLM_TYPE_UNKNOWN;
  972. }
  973. } break;
  974. case LLM_ARCH_QWEN3MOE:
  975. {
  976. ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
  977. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  978. switch (hparams.n_layer) {
  979. case 48: type = LLM_TYPE_30B_A3B; break;
  980. case 94: type = LLM_TYPE_235B_A22B; break;
  981. default: type = LLM_TYPE_UNKNOWN;
  982. }
  983. } break;
  984. case LLM_ARCH_PHI2:
  985. {
  986. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  987. switch (hparams.n_layer) {
  988. case 24: type = LLM_TYPE_1B; break;
  989. case 32: type = LLM_TYPE_3B; break;
  990. default: type = LLM_TYPE_UNKNOWN;
  991. }
  992. } break;
  993. case LLM_ARCH_PHI3:
  994. {
  995. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  996. switch (hparams.n_layer) {
  997. case 24: type = LLM_TYPE_1B; break;
  998. case 32: type = LLM_TYPE_3B; break;
  999. case 40: type = LLM_TYPE_14B; break;
  1000. default: type = LLM_TYPE_UNKNOWN;
  1001. }
  1002. const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
  1003. if (found_swa && hparams.n_swa > 0) {
  1004. LLAMA_LOG_WARN("%s: Phi SWA is currently disabled - results might be suboptimal for some models (see %s)\n",
  1005. __func__, "https://github.com/ggml-org/llama.cpp/pull/13676");
  1006. // TODO: fix conversion scripts to correctly populate `n_swa` and `n_swa_pattern`
  1007. hparams.swa_type = LLAMA_SWA_TYPE_NONE;
  1008. hparams.n_swa = 0;
  1009. hparams.set_swa_pattern(1);
  1010. }
  1011. } break;
  1012. case LLM_ARCH_PHIMOE:
  1013. {
  1014. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1015. switch (hparams.n_layer) {
  1016. case 32: type = LLM_TYPE_16x3_8B; break;
  1017. default: type = LLM_TYPE_UNKNOWN;
  1018. }
  1019. } break;
  1020. case LLM_ARCH_PLAMO:
  1021. {
  1022. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1023. switch (hparams.n_layer) {
  1024. case 40: type = LLM_TYPE_13B; break;
  1025. default: type = LLM_TYPE_UNKNOWN;
  1026. }
  1027. } break;
  1028. case LLM_ARCH_PLAMO2:
  1029. {
  1030. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1031. // Load Mamba SSM parameters
  1032. ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv);
  1033. ml.get_key(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner);
  1034. ml.get_key(LLM_KV_SSM_STATE_SIZE, hparams.ssm_d_state);
  1035. ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
  1036. ml.get_key(LLM_KV_SSM_GROUP_COUNT, hparams.ssm_n_group);
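// as with the other hybrid architectures below, a layer is treated as recurrent
// (Mamba) when it has no KV heads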
  1037. for (uint32_t i = 0; i < hparams.n_layer; ++i) {
  1038. hparams.recurrent_layer_arr[i] = hparams.n_head_kv(i) == 0;
  1039. }
  1040. switch (hparams.n_layer) {
  1041. case 16: type = LLM_TYPE_1B; break;
  1042. case 32:
  1043. if (hparams.n_embd == 2048) {
  1044. type = LLM_TYPE_2B;
  1045. } else if (hparams.n_embd == 4096) {
  1046. type = LLM_TYPE_8B;
  1047. }
  1048. break;
  1049. default: type = LLM_TYPE_UNKNOWN;
  1050. }
  1051. } break;
  1052. case LLM_ARCH_GPT2:
  1053. {
  1054. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  1055. switch (hparams.n_layer) {
  1056. case 12: type = LLM_TYPE_SMALL; break;
  1057. case 24: type = LLM_TYPE_MEDIUM; break;
  1058. case 36: type = LLM_TYPE_LARGE; break;
  1059. case 48: type = LLM_TYPE_XL; break;
  1060. default: type = LLM_TYPE_UNKNOWN;
  1061. }
  1062. } break;
  1063. case LLM_ARCH_CODESHELL:
  1064. {
  1065. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  1066. switch (hparams.n_layer) {
  1067. case 42: type = LLM_TYPE_7B; break;
  1068. default: type = LLM_TYPE_UNKNOWN;
  1069. }
  1070. } break;
  1071. case LLM_ARCH_ORION:
  1072. {
  1073. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  1074. switch (hparams.n_layer) {
  1075. case 40: type = LLM_TYPE_14B; break;
  1076. default: type = LLM_TYPE_UNKNOWN;
  1077. }
  1078. } break;
  1079. case LLM_ARCH_INTERNLM2:
  1080. {
  1081. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1082. switch (hparams.n_layer) {
  1083. case 32: type = LLM_TYPE_7B; break;
  1084. case 48: type = LLM_TYPE_20B; break;
  1085. default: type = LLM_TYPE_UNKNOWN;
  1086. }
  1087. } break;
  1088. case LLM_ARCH_GEMMA:
  1089. {
  1090. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1091. switch (hparams.n_layer) {
  1092. case 18: type = LLM_TYPE_2B; break;
  1093. case 28: type = LLM_TYPE_7B; break;
  1094. default: type = LLM_TYPE_UNKNOWN;
  1095. }
  1096. } break;
  1097. case LLM_ARCH_GEMMA2:
  1098. {
  1099. hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
  1100. hparams.n_swa = 4096; // default value of gemma 2
  1101. hparams.set_swa_pattern(2);
  1102. hparams.attn_soft_cap = true;
  1103. ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
  1104. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1105. ml.get_key(LLM_KV_ATTN_LOGIT_SOFTCAPPING, hparams.f_attn_logit_softcapping, false);
  1106. ml.get_key(LLM_KV_FINAL_LOGIT_SOFTCAPPING, hparams.f_final_logit_softcapping, false);
  1107. switch (hparams.n_layer) {
  1108. case 26: type = LLM_TYPE_2B; break;
  1109. case 42: type = LLM_TYPE_9B; break;
  1110. case 46: type = LLM_TYPE_27B; break;
  1111. default: type = LLM_TYPE_UNKNOWN;
  1112. }
  1113. // ref: https://github.com/google/gemma_pytorch/blob/014acb7ac4563a5f77c76d7ff98f31b568c16508/gemma/config.py#L173
  1114. hparams.f_attention_scale = type == LLM_TYPE_27B
  1115. ? 1.0f / std::sqrt(float(hparams.n_embd / hparams.n_head(0)))
  1116. : 1.0f / std::sqrt(float(hparams.n_embd_head_k));
  1117. } break;
  1118. case LLM_ARCH_GEMMA3:
  1119. {
  1120. hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
  1121. hparams.set_swa_pattern(6);
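// following the pattern convention noted for the chunked case above (n-1 SWA
// layers then 1 full layer), a pattern of 6 means 5 sliding-window layers per
// full-attention layer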
  1122. hparams.rope_freq_base_train_swa = 10000.0f;
  1123. hparams.rope_freq_scale_train_swa = 1.0f;
  1124. ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
  1125. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1126. switch (hparams.n_layer) {
  1127. case 18: type = LLM_TYPE_270M; break;
  1128. case 26: type = LLM_TYPE_1B; break;
  1129. case 34: type = LLM_TYPE_4B; break;
  1130. case 48: type = LLM_TYPE_12B; break;
  1131. case 62: type = LLM_TYPE_27B; break;
  1132. default: type = LLM_TYPE_UNKNOWN;
  1133. }
  1134. // ref: https://github.com/google/gemma_pytorch/blob/014acb7ac4563a5f77c76d7ff98f31b568c16508/gemma/config.py#L289
  1135. hparams.f_attention_scale = type == LLM_TYPE_27B
  1136. ? 1.0f / std::sqrt(float(hparams.n_embd / hparams.n_head(0)))
  1137. : 1.0f / std::sqrt(float(hparams.n_embd_head_k));
  1138. } break;
  1139. case LLM_ARCH_GEMMA3N:
  1140. {
  1141. hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
  1142. hparams.set_swa_pattern(5);
  1143. hparams.n_layer_kv_from_start = 20;
  1144. hparams.rope_freq_base_train_swa = 10000.0f;
  1145. hparams.rope_freq_scale_train_swa = 1.0f;
  1146. hparams.f_attention_scale = 1.0f;
  1147. ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
  1148. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1149. switch (hparams.n_layer) {
  1150. case 30: type = LLM_TYPE_E2B; break;
  1151. case 35: type = LLM_TYPE_E4B; break;
  1152. default: type = LLM_TYPE_UNKNOWN;
  1153. }
  1154. } break;
  1155. case LLM_ARCH_GEMMA_EMBEDDING:
  1156. {
  1157. hparams.swa_type = LLAMA_SWA_TYPE_SYMMETRIC;
  1158. hparams.set_swa_pattern(6);
  1159. hparams.causal_attn = false; // embeddings do not use causal attention
  1160. hparams.rope_freq_base_train_swa = 10000.0f;
  1161. hparams.rope_freq_scale_train_swa = 1.0f;
  1162. ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
  1163. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1164. ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
  1165. switch (hparams.n_layer) {
  1166. case 24: type = LLM_TYPE_0_3B; break;
  1167. default: type = LLM_TYPE_UNKNOWN;
  1168. }
  1169. hparams.f_attention_scale = 1.0f / std::sqrt(float(hparams.n_embd_head_k));
  1170. } break;
  1171. case LLM_ARCH_STARCODER2:
  1172. {
  1173. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  1174. switch (hparams.n_layer) {
  1175. case 30: type = LLM_TYPE_3B; break;
  1176. case 32: type = LLM_TYPE_7B; break;
  1177. case 40: type = LLM_TYPE_15B; break;
  1178. case 52: type = LLM_TYPE_20B; break; // granite
  1179. case 88: type = LLM_TYPE_34B; break; // granite
  1180. default: type = LLM_TYPE_UNKNOWN;
  1181. }
  1182. } break;
  1183. case LLM_ARCH_MAMBA:
  1184. {
  1185. ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv);
  1186. ml.get_key(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner);
  1187. ml.get_key(LLM_KV_SSM_STATE_SIZE, hparams.ssm_d_state);
  1188. ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
  1189. ml.get_key(LLM_KV_SSM_DT_B_C_RMS, hparams.ssm_dt_b_c_rms, false);
  1190. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1191. switch (hparams.n_layer) {
  1192. case 24:
  1193. switch (hparams.n_embd) {
  1194. case 768: type = LLM_TYPE_SMALL; break;
  1195. default: type = LLM_TYPE_UNKNOWN;
  1196. } break;
  1197. case 48:
  1198. switch (hparams.n_embd) {
  1199. case 1024: type = LLM_TYPE_MEDIUM; break;
  1200. case 1536: type = LLM_TYPE_LARGE; break;
  1201. case 2048: type = LLM_TYPE_XL; break;
  1202. default: type = LLM_TYPE_UNKNOWN;
  1203. } break;
  1204. case 64:
  1205. switch (hparams.n_embd) {
  1206. case 2560: type = LLM_TYPE_3B; break;
  1207. default: type = LLM_TYPE_UNKNOWN;
  1208. } break;
  1209. default: type = LLM_TYPE_UNKNOWN;
  1210. }
  1211. } break;
  1212. case LLM_ARCH_MAMBA2:
  1213. {
  1214. ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv);
  1215. ml.get_key(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner);
  1216. ml.get_key(LLM_KV_SSM_STATE_SIZE, hparams.ssm_d_state);
  1217. ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
  1218. ml.get_key(LLM_KV_SSM_GROUP_COUNT, hparams.ssm_n_group);
  1219. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1220. switch (hparams.n_layer) {
  1221. case 24:
  1222. switch (hparams.n_embd) {
  1223. case 768: type = LLM_TYPE_SMALL; break;
  1224. default: type = LLM_TYPE_UNKNOWN;
  1225. } break;
  1226. case 48:
  1227. switch (hparams.n_embd) {
  1228. case 1024: type = LLM_TYPE_MEDIUM; break;
  1229. case 1536: type = LLM_TYPE_LARGE; break;
  1230. case 2048: type = LLM_TYPE_XL; break;
  1231. default: type = LLM_TYPE_UNKNOWN;
  1232. } break;
  1233. case 64:
  1234. switch (hparams.n_embd) {
  1235. case 2560: type = LLM_TYPE_3B; break;
  1236. case 4096: type = LLM_TYPE_7B; break;
  1237. default: type = LLM_TYPE_UNKNOWN;
  1238. } break;
  1239. default: type = LLM_TYPE_UNKNOWN;
  1240. }
  1241. } break;
  1242. case LLM_ARCH_JAMBA:
  1243. {
  1244. ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv);
  1245. ml.get_key(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner);
  1246. ml.get_key(LLM_KV_SSM_STATE_SIZE, hparams.ssm_d_state);
  1247. ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
  1248. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1249. for (uint32_t i = 0; i < hparams.n_layer; ++i) {
  1250. hparams.recurrent_layer_arr[i] = hparams.n_head_kv(i) == 0;
  1251. }
  1252. switch (hparams.n_layer) {
1253. // TODO: Jamba layers are a bit heterogeneous, so naming this is hard.
  1254. case 12: // 900M 8x???M
  1255. case 32: // 51B 16x?B
  1256. default: type = LLM_TYPE_UNKNOWN;
  1257. }
  1258. } break;
  1259. case LLM_ARCH_XVERSE:
  1260. {
  1261. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1262. switch (hparams.n_layer) {
  1263. case 32: type = LLM_TYPE_7B; break;
  1264. case 40: type = LLM_TYPE_13B; break;
  1265. case 80: type = LLM_TYPE_65B; break;
  1266. default: type = LLM_TYPE_UNKNOWN;
  1267. }
  1268. } break;
  1269. case LLM_ARCH_COMMAND_R:
  1270. {
  1271. ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
  1272. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  1273. switch (hparams.n_layer) {
  1274. case 40: type = LLM_TYPE_35B; break;
  1275. default: type = LLM_TYPE_UNKNOWN;
  1276. }
  1277. } break;
  1278. case LLM_ARCH_COHERE2:
  1279. {
  1280. hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
  1281. hparams.set_swa_pattern(4);
  1282. ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
  1283. ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
  1284. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  1285. switch (hparams.n_layer) {
  1286. case 32: type = LLM_TYPE_8B; break;
  1287. default: type = LLM_TYPE_UNKNOWN;
  1288. }
  1289. } break;
  1290. case LLM_ARCH_DBRX:
  1291. {
  1292. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  1293. ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV, hparams.f_clamp_kqv);
  1294. switch (hparams.n_layer) {
  1295. case 40: type = LLM_TYPE_16x12B; break;
  1296. default: type = LLM_TYPE_UNKNOWN;
  1297. }
  1298. } break;
  1299. case LLM_ARCH_OLMO:
  1300. {
  1301. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  1302. ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV, hparams.f_clamp_kqv, false);
  1303. switch (hparams.n_layer) {
  1304. case 22: type = LLM_TYPE_1B; break;
  1305. case 32: type = LLM_TYPE_7B; break;
  1306. case 80: type = LLM_TYPE_70B; break;
  1307. default: type = LLM_TYPE_UNKNOWN;
  1308. }
  1309. } break;
  1310. case LLM_ARCH_OLMO2:
  1311. {
  1312. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1313. const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
  1314. if (found_swa && hparams.n_swa > 0) {
  1315. hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
  1316. hparams.set_swa_pattern(4);
  1317. } else {
  1318. hparams.swa_type = LLAMA_SWA_TYPE_NONE;
  1319. }
  1320. switch (hparams.n_layer) {
  1321. case 16: type = LLM_TYPE_1B; break;
  1322. case 32: type = LLM_TYPE_7B; break;
  1323. case 40: type = LLM_TYPE_13B; break;
  1324. case 64: type = LLM_TYPE_32B; break;
  1325. default: type = LLM_TYPE_UNKNOWN;
  1326. }
  1327. } break;
  1328. case LLM_ARCH_SEED_OSS:
  1329. {
  1330. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1331. switch (hparams.n_layer) {
  1332. case 64: type = LLM_TYPE_36B; break;
  1333. default: type = LLM_TYPE_UNKNOWN;
  1334. }
  1335. } break;
  1336. case LLM_ARCH_OLMOE:
  1337. {
  1338. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1339. switch (hparams.n_layer) {
  1340. case 16: type = LLM_TYPE_A1_7B; break;
  1341. default: type = LLM_TYPE_UNKNOWN;
  1342. }
  1343. } break;
  1344. case LLM_ARCH_OPENELM:
  1345. {
  1346. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1347. switch (hparams.n_layer) {
  1348. case 16: type = LLM_TYPE_270M; break;
  1349. case 20: type = LLM_TYPE_450M; break;
  1350. case 28: type = LLM_TYPE_1B; break;
  1351. case 36: type = LLM_TYPE_3B; break;
  1352. default: type = LLM_TYPE_UNKNOWN;
  1353. }
  1354. } break;
  1355. case LLM_ARCH_GPTNEOX:
  1356. {
  1357. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  1358. ml.get_key(LLM_KV_USE_PARALLEL_RESIDUAL, hparams.use_par_res);
  1359. switch (hparams.n_layer) {
  1360. case 6:
  1361. switch (hparams.n_ff()) {
  1362. case 512: type = LLM_TYPE_14M; break;
  1363. case 2048: type = LLM_TYPE_70M; break;
  1364. default: type = LLM_TYPE_UNKNOWN;
  1365. } break;
  1366. case 12:
  1367. switch (hparams.n_ff()) {
  1368. case 3072: type = LLM_TYPE_160M; break;
  1369. default: type = LLM_TYPE_UNKNOWN;
  1370. } break;
  1371. case 16:
  1372. switch (hparams.n_ff()) {
  1373. case 8192: type = LLM_TYPE_1B; break;
  1374. default: type = LLM_TYPE_UNKNOWN;
  1375. } break;
  1376. case 24:
  1377. switch (hparams.n_ff()) {
  1378. case 4096: type = LLM_TYPE_410M; break;
  1379. case 8192: type = LLM_TYPE_1_4B; break;
  1380. default: type = LLM_TYPE_UNKNOWN;
  1381. } break;
  1382. case 32:
  1383. switch (hparams.n_ff()) {
  1384. case 10240: type = LLM_TYPE_2_8B; break;
  1385. case 16384: type = LLM_TYPE_6_9B; break;
  1386. default: type = LLM_TYPE_UNKNOWN;
  1387. } break;
  1388. case 36:
  1389. switch (hparams.n_ff()) {
  1390. case 20480: type = LLM_TYPE_12B; break;
  1391. default: type = LLM_TYPE_UNKNOWN;
  1392. } break;
  1393. case 44:
  1394. switch (hparams.n_ff()) {
  1395. case 24576: type = LLM_TYPE_20B; break;
  1396. default: type = LLM_TYPE_UNKNOWN;
  1397. } break;
  1398. default: type = LLM_TYPE_UNKNOWN;
  1399. }
  1400. } break;
  1401. case LLM_ARCH_ARCTIC:
  1402. {
  1403. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1404. if (hparams.n_expert == 128) {
  1405. switch (hparams.n_layer) {
  1406. case 35: type = LLM_TYPE_10B_128x3_66B; break;
  1407. default: type = LLM_TYPE_UNKNOWN;
  1408. }
  1409. } else {
  1410. type = LLM_TYPE_UNKNOWN;
  1411. }
  1412. } break;
  1413. case LLM_ARCH_DEEPSEEK:
  1414. {
  1415. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1416. ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
  1417. ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
  1418. ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
  1419. ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
  1420. switch (hparams.n_layer) {
  1421. case 28: type = LLM_TYPE_20B; break;
  1422. default: type = LLM_TYPE_UNKNOWN;
  1423. }
  1424. } break;
  1425. case LLM_ARCH_DEEPSEEK2:
  1426. {
  1427. bool is_lite = (hparams.n_layer == 27);
  1428. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1429. ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
  1430. if (!is_lite) {
  1431. ml.get_key(LLM_KV_ATTENTION_Q_LORA_RANK, hparams.n_lora_q);
  1432. }
  1433. ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK, hparams.n_lora_kv);
  1434. ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH_MLA, hparams.n_embd_head_k_mla, false);
  1435. ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH_MLA, hparams.n_embd_head_v_mla, false);
  1436. ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
  1437. ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
  1438. ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
  1439. ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
  1440. ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false);
  1441. if (hparams.expert_gating_func == LLAMA_EXPERT_GATING_FUNC_TYPE_NONE) {
  1442. // for compatibility with existing DeepSeek V2 and V2.5 GGUFs
  1443. // that have no expert_gating_func model parameter set
  1444. hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX;
  1445. }
  1446. ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul, false);
  1447. switch (hparams.n_layer) {
  1448. case 27: type = LLM_TYPE_16B; break;
  1449. case 60: type = LLM_TYPE_236B; break;
  1450. case 61: type = LLM_TYPE_671B; break;
  1451. default: type = LLM_TYPE_UNKNOWN;
  1452. }
  1453. } break;
  1454. case LLM_ARCH_PLM:
  1455. {
  1456. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1457. ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK, hparams.n_lora_kv);
  1458. switch (hparams.n_layer) {
  1459. case 32: type = LLM_TYPE_1_8B; break;
  1460. default: type = LLM_TYPE_UNKNOWN;
  1461. }
  1462. } break;
  1463. case LLM_ARCH_CHATGLM:
  1464. {
  1465. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1466. switch (hparams.n_layer) {
  1467. case 28: {
  1468. if (hparams.n_head(0) == 16) {
  1469. type = LLM_TYPE_1_5B;
  1470. } else {
  1471. type = LLM_TYPE_6B;
  1472. }
  1473. } break;
  1474. case 40: {
  1475. if (hparams.n_head(0) == 24) {
  1476. type = LLM_TYPE_4B;
  1477. } else {
  1478. type = LLM_TYPE_9B;
  1479. }
  1480. } break;
  1481. default: type = LLM_TYPE_UNKNOWN;
  1482. }
  1483. } break;
  1484. case LLM_ARCH_GLM4:
  1485. {
  1486. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1487. switch (hparams.n_layer) {
  1488. case 40: type = LLM_TYPE_9B; break;
  1489. case 61: type = LLM_TYPE_32B; break;
  1490. default: type = LLM_TYPE_UNKNOWN;
  1491. }
  1492. } break;
  1493. case LLM_ARCH_GLM4_MOE:
  1494. {
  1495. ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
  1496. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1497. // MoE parameters
  1498. ml.get_key(LLM_KV_EXPERT_COUNT, hparams.n_expert);
  1499. ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used);
  1500. ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
  1501. ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead, false);
  1502. ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
  1503. ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
  1504. // Expert gating function (GLM-4.5 uses sigmoid)
  1505. ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false);
  1506. if (hparams.expert_gating_func == LLAMA_EXPERT_GATING_FUNC_TYPE_NONE) {
  1507. hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID;
  1508. }
  1509. // NextN/MTP parameters
  1510. ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.nextn_predict_layers, false);
1511. // TODO: this may need to be updated once MTP is implemented
  1512. hparams.n_layer_kv_from_start = hparams.n_layer - hparams.nextn_predict_layers;
  1513. switch (hparams.n_layer) {
  1514. case 47: type = LLM_TYPE_106B_A12B; break; // GLM-4.5-Air (46 layers + 1 NextN layer)
  1515. case 93: type = LLM_TYPE_355B_A32B; break; // GLM-4.5 (92 layers + 1 NextN layer)
  1516. default: type = LLM_TYPE_UNKNOWN;
  1517. }
  1518. } break;
  1519. case LLM_ARCH_BITNET:
  1520. {
  1521. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1522. switch (hparams.n_layer) {
  1523. case 26: type = LLM_TYPE_3B; break;
  1524. default: type = LLM_TYPE_UNKNOWN;
  1525. }
  1526. } break;
  1527. case LLM_ARCH_T5:
  1528. {
  1529. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1530. ml.get_key(LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, hparams.n_rel_attn_bkts);
  1531. uint32_t dec_start_token_id;
  1532. if (ml.get_key(LLM_KV_DECODER_START_TOKEN_ID, dec_start_token_id, false)) {
  1533. hparams.dec_start_token_id = dec_start_token_id;
  1534. }
  1535. hparams.dec_n_layer = hparams.n_layer;
  1536. ml.get_key(LLM_KV_DECODER_BLOCK_COUNT, hparams.dec_n_layer, false);
  1537. switch (hparams.n_layer) {
  1538. case 6: type = LLM_TYPE_60M; break; // t5-small
  1539. case 8: type = LLM_TYPE_80M; break; // flan-t5-small
  1540. case 12:
  1541. switch (hparams.n_ff()) {
  1542. case 3072: type = LLM_TYPE_220M; break; // t5-base
  1543. case 2048: type = LLM_TYPE_250M; break; // flan-t5-base
  1544. default: type = LLM_TYPE_UNKNOWN;
  1545. } break;
  1546. case 24:
  1547. switch (hparams.n_ff()) {
  1548. case 4096: type = LLM_TYPE_770M; break; // t5-large
  1549. case 2816: type = LLM_TYPE_780M; break; // flan-t5-large
  1550. case 16384: type = LLM_TYPE_3B; break; // t5-3b
  1551. case 5120: type = LLM_TYPE_3B; break; // flan-t5-xl
  1552. case 65536: type = LLM_TYPE_11B; break; // t5-11b
  1553. case 10240: type = LLM_TYPE_11B; break; // flan-t5-xxl
  1554. default: type = LLM_TYPE_UNKNOWN;
  1555. } break;
  1556. default: type = LLM_TYPE_UNKNOWN;
  1557. }
  1558. } break;
  1559. case LLM_ARCH_T5ENCODER:
  1560. {
  1561. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1562. ml.get_key(LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, hparams.n_rel_attn_bkts);
  1563. type = LLM_TYPE_UNKNOWN;
  1564. } break;
  1565. case LLM_ARCH_JAIS:
  1566. {
  1567. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  1568. ml.get_key(LLM_KV_ATTENTION_MAX_ALIBI_BIAS, hparams.f_max_alibi_bias);
  1569. switch (hparams.n_layer) {
  1570. case 24: type = LLM_TYPE_1_3B; break;
  1571. case 40: type = LLM_TYPE_13B; break;
  1572. /* TODO: add variants */
  1573. default: type = LLM_TYPE_UNKNOWN;
  1574. }
  1575. } break;
  1576. case LLM_ARCH_NEMOTRON:
  1577. {
  1578. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  1579. switch (hparams.n_layer) {
  1580. case 32: type = LLM_TYPE_4B; break;
  1581. default: type = LLM_TYPE_UNKNOWN;
  1582. }
  1583. } break;
  1584. case LLM_ARCH_NEMOTRON_H:
  1585. {
  1586. ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv);
  1587. ml.get_key(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner);
  1588. ml.get_key(LLM_KV_SSM_STATE_SIZE, hparams.ssm_d_state);
  1589. ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
  1590. ml.get_key(LLM_KV_SSM_GROUP_COUNT, hparams.ssm_n_group);
  1591. // A layer is recurrent IFF the n_head_kv value is set to 0 and
  1592. // the n_ff value is set to 0
  1593. for (uint32_t i = 0; i < hparams.n_layer; ++i) {
  1594. hparams.recurrent_layer_arr[i] = (hparams.n_head_kv(i) == 0 && hparams.n_ff(i) == 0);
  1595. }
  1596. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1597. switch (hparams.n_layer) {
  1598. case 56: type = LLM_TYPE_9B; break;
  1599. default: type = LLM_TYPE_UNKNOWN;
  1600. }
  1601. } break;
  1602. case LLM_ARCH_EXAONE:
  1603. {
  1604. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1605. switch (hparams.n_layer) {
  1606. case 32: type = LLM_TYPE_8B; break;
  1607. default: type = LLM_TYPE_UNKNOWN;
  1608. }
  1609. } break;
  1610. case LLM_ARCH_EXAONE4:
  1611. {
  1612. if (hparams.n_layer == 64) { // 32B
  1613. hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
  1614. hparams.n_swa = 4096;
  1615. hparams.set_swa_pattern(4);
  1616. }
  1617. ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
  1618. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1619. switch (hparams.n_layer) {
  1620. case 30: type = LLM_TYPE_1_2B; break;
  1621. case 64: type = LLM_TYPE_32B; break;
  1622. default: type = LLM_TYPE_UNKNOWN;
  1623. }
  1624. } break;
  1625. case LLM_ARCH_RWKV6:
  1626. case LLM_ARCH_RWKV6QWEN2:
  1627. {
  1628. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps, false);
  1629. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps, false);
  1630. ml.get_key(LLM_KV_WKV_HEAD_SIZE, hparams.wkv_head_size);
  1631. ml.get_key(LLM_KV_TIME_MIX_EXTRA_DIM, hparams.time_mix_extra_dim);
  1632. ml.get_key(LLM_KV_TIME_DECAY_EXTRA_DIM, hparams.time_decay_extra_dim);
  1633. ml.get_key(LLM_KV_RESCALE_EVERY_N_LAYERS, hparams.rescale_every_n_layers, false);
  1634. ml.get_key(LLM_KV_TOKEN_SHIFT_COUNT, hparams.token_shift_count, false);
  1635. switch (hparams.n_layer) {
  1636. case 24: type = LLM_TYPE_1_6B; break;
  1637. case 32:
  1638. switch (hparams.n_embd) {
  1639. case 2560: type = LLM_TYPE_3B; break;
  1640. case 4096: type = LLM_TYPE_7B; break;
  1641. default: type = LLM_TYPE_UNKNOWN;
  1642. } break;
  1643. case 61: type = LLM_TYPE_14B; break;
  1644. case 64: type = LLM_TYPE_32B; break;
  1645. default: type = LLM_TYPE_UNKNOWN;
  1646. }
  1647. } break;
  1648. case LLM_ARCH_RWKV7:
  1649. case LLM_ARCH_ARWKV7:
  1650. {
  1651. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps, false);
  1652. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps, false);
  1653. ml.get_key(LLM_KV_WKV_HEAD_SIZE, hparams.wkv_head_size);
  1654. ml.get_key(LLM_KV_ATTENTION_DECAY_LORA_RANK, hparams.n_lora_decay);
  1655. ml.get_key(LLM_KV_ATTENTION_ICLR_LORA_RANK, hparams.n_lora_iclr);
  1656. ml.get_key(LLM_KV_ATTENTION_VALUE_RESIDUAL_MIX_LORA_RANK, hparams.n_lora_value_res_mix);
  1657. ml.get_key(LLM_KV_ATTENTION_GATE_LORA_RANK, hparams.n_lora_gate, false);
  1658. ml.get_key(LLM_KV_TOKEN_SHIFT_COUNT, hparams.token_shift_count, false);
  1659. switch (hparams.n_layer) {
  1660. case 12:
  1661. switch (hparams.n_embd) {
  1662. case 768: type = LLM_TYPE_190M; break;
  1663. default: type = LLM_TYPE_UNKNOWN;
  1664. } break;
  1665. case 24:
  1666. switch (hparams.n_embd) {
  1667. case 1024: type = LLM_TYPE_450M; break;
  1668. case 2048: type = LLM_TYPE_1_5B; break;
  1669. default: type = LLM_TYPE_UNKNOWN;
  1670. } break;
  1671. case 28:
  1672. switch (hparams.n_embd) {
  1673. case 1536: type = LLM_TYPE_1_5B; break;
  1674. case 3584: type = LLM_TYPE_7B; break;
  1675. default: type = LLM_TYPE_UNKNOWN;
  1676. } break;
  1677. case 32:
  1678. switch (hparams.n_embd) {
  1679. case 2560: type = LLM_TYPE_2_9B; break;
  1680. case 4096: type = LLM_TYPE_7B; break;
  1681. default: type = LLM_TYPE_UNKNOWN;
  1682. } break;
  1683. case 61:
  1684. switch (hparams.n_embd) {
  1685. case 4096: type = LLM_TYPE_14B; break;
  1686. default: type = LLM_TYPE_UNKNOWN;
  1687. } break;
  1688. default: type = LLM_TYPE_UNKNOWN;
  1689. }
  1690. } break;
  1691. case LLM_ARCH_GRANITE:
  1692. case LLM_ARCH_GRANITE_MOE:
  1693. {
  1694. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1695. ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
  1696. ml.get_key(LLM_KV_RESIDUAL_SCALE, hparams.f_residual_scale);
  1697. ml.get_key(LLM_KV_EMBEDDING_SCALE, hparams.f_embedding_scale);
  1698. ml.get_key(LLM_KV_ATTENTION_SCALE, hparams.f_attention_scale);
  1699. // Granite uses rope_finetuned as a switch for rope, so default to true
  1700. bool rope_finetuned = true;
  1701. ml.get_key(LLM_KV_ROPE_SCALING_FINETUNED, rope_finetuned, false);
  1702. hparams.rope_finetuned = rope_finetuned;
  1703. switch (hparams.n_layer) {
  1704. case 32: type = LLM_TYPE_3B; break;
  1705. case 40: type = LLM_TYPE_3B; break;
  1706. // Add additional layer/vocab/etc checks here for other model sizes
  1707. default: type = LLM_TYPE_UNKNOWN;
  1708. }
  1709. // For Granite MoE Shared
  1710. ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, /* required */ false);
  1711. } break;
  1712. case LLM_ARCH_GRANITE_HYBRID:
  1713. {
  1714. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1715. ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale, /* required */ false);
  1716. ml.get_key(LLM_KV_RESIDUAL_SCALE, hparams.f_residual_scale, /* required */ false);
  1717. ml.get_key(LLM_KV_EMBEDDING_SCALE, hparams.f_embedding_scale, /* required */ false);
  1718. ml.get_key(LLM_KV_ATTENTION_SCALE, hparams.f_attention_scale, /* required */ false);
  1719. ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv);
  1720. ml.get_key(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner);
  1721. ml.get_key(LLM_KV_SSM_STATE_SIZE, hparams.ssm_d_state);
  1722. ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
  1723. ml.get_key(LLM_KV_SSM_GROUP_COUNT, hparams.ssm_n_group);
  1724. // Granite uses rope_finetuned as a switch for rope, so default to true
  1725. bool rope_finetuned = true;
  1726. ml.get_key(LLM_KV_ROPE_SCALING_FINETUNED, rope_finetuned, false);
  1727. hparams.rope_finetuned = rope_finetuned;
  1728. // A layer is recurrent IFF the n_head_kv value is set to 0
  1729. for (uint32_t i = 0; i < hparams.n_layer; ++i) {
  1730. hparams.recurrent_layer_arr[i] = hparams.n_head_kv(i) == 0;
  1731. }
  1732. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1733. switch (hparams.n_layer) {
  1734. // TODO: Add llm type label (not sure this is useful)
  1735. default: type = LLM_TYPE_UNKNOWN;
  1736. }
  1737. // For Granite MoE Shared
  1738. ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, /* required */ false);
  1739. } break;
  1740. case LLM_ARCH_QWEN3NEXT:
  1741. {
  1742. ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
  1743. ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false);
  1744. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1745. // Load linear attention (gated delta net) parameters
  1746. ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv);
  1747. ml.get_key(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner);
  1748. ml.get_key(LLM_KV_SSM_STATE_SIZE, hparams.ssm_d_state);
  1749. ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
  1750. ml.get_key(LLM_KV_SSM_GROUP_COUNT, hparams.ssm_n_group);
  1751. // Mark recurrent layers (linear attention layers)
  1752. for (uint32_t i = 0; i < hparams.n_layer; ++i) {
  1753. hparams.recurrent_layer_arr[i] = ((i + 1) % 4 != 0); // TODO: extract the magic 4 from "full_attention_interval"
  1754. }
  1755. switch (hparams.n_layer) {
  1756. case 80: type = LLM_TYPE_80B_A3B; break;
  1757. default: type = LLM_TYPE_UNKNOWN;
  1758. }
  1759. } break;
  1760. case LLM_ARCH_CHAMELEON:
  1761. {
  1762. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1763. hparams.f_norm_eps = 1e-5; // eps for qk-norm, torch default
  1764. ml.get_key(LLM_KV_SWIN_NORM, hparams.swin_norm);
  1765. switch (hparams.n_layer) {
  1766. case 32: type = LLM_TYPE_7B; break;
  1767. case 48: type = LLM_TYPE_34B; break;
  1768. default: type = LLM_TYPE_UNKNOWN;
  1769. }
  1770. } break;
  1771. case LLM_ARCH_WAVTOKENIZER_DEC:
  1772. {
  1773. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  1774. ml.get_key(LLM_KV_ATTENTION_GROUPNORM_EPS, hparams.f_norm_group_eps);
  1775. ml.get_key(LLM_KV_ATTENTION_GROUPNORM_GROUPS, hparams.n_norm_groups);
  1776. ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
  1777. } break;
  1778. case LLM_ARCH_BAILINGMOE:
  1779. {
  1780. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1781. ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
  1782. ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
  1783. ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
  1784. ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
  1785. ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
  1786. switch (hparams.n_layer) {
  1787. case 28: type = LLM_TYPE_16B; break;
  1788. case 88: type = LLM_TYPE_290B; break;
  1789. default: type = LLM_TYPE_UNKNOWN;
  1790. }
  1791. } break;
  1792. case LLM_ARCH_DOTS1:
  1793. {
  1794. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1795. ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
  1796. ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
  1797. ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
  1798. ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
  1799. ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
  1800. ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false);
  1801. switch (hparams.n_layer) {
  1802. case 62: type = LLM_TYPE_142B; break;
  1803. default: type = LLM_TYPE_UNKNOWN;
  1804. }
  1805. } break;
  1806. case LLM_ARCH_ERNIE4_5:
  1807. case LLM_ARCH_ERNIE4_5_MOE:
  1808. {
  1809. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1810. if (arch == LLM_ARCH_ERNIE4_5_MOE) {
  1811. ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
  1812. ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false);
  1813. ml.get_key(LLM_KV_INTERLEAVE_MOE_LAYER_STEP, hparams.n_moe_layer_step);
  1814. ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
  1815. }
  1816. switch (hparams.n_layer) {
  1817. case 18: type = LLM_TYPE_0_3B; break;
  1818. case 28: type = LLM_TYPE_21B_A3B; break;
  1819. case 54: type = LLM_TYPE_300B_A47B; break;
  1820. default: type = LLM_TYPE_UNKNOWN;
  1821. }
  1822. } break;
  1823. case LLM_ARCH_FALCON_H1:
  1824. {
  1825. // Common parameters
  1826. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1827. // SSM parameters
  1828. ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv);
  1829. ml.get_key(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner);
  1830. ml.get_key(LLM_KV_SSM_STATE_SIZE, hparams.ssm_d_state);
  1831. ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
  1832. ml.get_key(LLM_KV_SSM_GROUP_COUNT, hparams.ssm_n_group);
  1833. std::fill(hparams.recurrent_layer_arr.begin(), hparams.recurrent_layer_arr.end(), true);
  1834. switch (hparams.n_layer) {
  1835. case 36:
  1836. type = LLM_TYPE_0_5B; break;
  1837. case 24:
  1838. type = LLM_TYPE_1_5B; break;
  1839. case 66:
  1840. type = LLM_TYPE_1B; break;
  1841. case 32:
  1842. type = LLM_TYPE_3B; break;
  1843. case 44:
  1844. type = LLM_TYPE_7B; break;
  1845. case 72:
  1846. type = LLM_TYPE_34B; break;
  1847. default:
  1848. type = LLM_TYPE_UNKNOWN;
  1849. }
  1850. } break;
  1851. case LLM_ARCH_HUNYUAN_MOE:
  1852. {
  1853. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1854. ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
  1855. ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp);
  1856. switch (hparams.n_layer) {
  1857. case 32: type = LLM_TYPE_A13B; break;
  1858. default: type = LLM_TYPE_UNKNOWN;
  1859. }
  1860. } break;
  1861. case LLM_ARCH_HUNYUAN_DENSE:
  1862. {
  1863. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1864. switch (hparams.n_embd) {
  1865. case 1024: type = LLM_TYPE_0_5B; break;
  1866. case 2048: type = LLM_TYPE_1_8B; break;
  1867. case 3072: type = LLM_TYPE_4B; break;
  1868. case 4096: type = LLM_TYPE_7B; break;
  1869. default: type = LLM_TYPE_UNKNOWN;
  1870. }
  1871. } break;
  1872. case LLM_ARCH_SMOLLM3:
  1873. {
  1874. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1875. hparams.n_no_rope_layer_step = 4;
  1876. switch (hparams.n_layer) {
  1877. case 36: type = LLM_TYPE_3B; break;
  1878. default: type = LLM_TYPE_UNKNOWN;
  1879. }
  1880. } break;
  1881. case LLM_ARCH_OPENAI_MOE:
  1882. {
  1883. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1884. ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
  1885. ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
  1886. hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
  1887. hparams.set_swa_pattern(2);
  1888. switch (hparams.n_layer) {
  1889. case 24: type = LLM_TYPE_20B; break;
  1890. case 36: type = LLM_TYPE_120B; break;
  1891. default: type = LLM_TYPE_UNKNOWN;
  1892. }
  1893. } break;
  1894. case LLM_ARCH_LFM2:
  1895. {
  1896. ml.get_key(LLM_KV_SHORTCONV_L_CACHE, hparams.n_shortconv_l_cache);
  1897. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1898. for (uint32_t il = 0; il < hparams.n_layer; ++il) {
  1899. hparams.recurrent_layer_arr[il] = hparams.n_head_kv(il) == 0;
  1900. }
  1901. switch (hparams.n_embd) {
  1902. case 1024: type = LLM_TYPE_350M; break;
  1903. case 1536: type = LLM_TYPE_700M; break;
  1904. case 2048: type = LLM_TYPE_1_2B; break;
  1905. default: type = LLM_TYPE_UNKNOWN;
  1906. }
  1907. } break;
  1908. case LLM_ARCH_SMALLTHINKER:
  1909. {
  1910. const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
  1911. if (found_swa && hparams.n_swa > 0) {
  1912. hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
  1913. hparams.n_swa = 4096;
  1914. hparams.set_swa_pattern(4, true);
  1915. } else {
  1916. hparams.swa_type = LLAMA_SWA_TYPE_NONE;
  1917. hparams.n_no_rope_layer_step = hparams.n_layer;
  1918. }
  1919. ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
  1920. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1921. ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false);
  1922. switch (hparams.n_layer) {
  1923. case 32: type = LLM_TYPE_4B; break;
  1924. case 52: type = LLM_TYPE_20B; break;
  1925. default: type = LLM_TYPE_UNKNOWN;
  1926. }
  1927. } break;
  1928. default: throw std::runtime_error("unsupported model architecture");
  1929. }
  1930. pimpl->n_bytes = ml.n_bytes;
  1931. pimpl->desc_str = arch_name() + " " + type_name() + " " + ml.ftype_name();
  1932. if (hparams.f_max_alibi_bias > 0.0f) {
  1933. hparams.use_alibi = true;
  1934. }
  1935. hparams.rope_type = llama_model_rope_type(this);
  1936. }
  1937. void llama_model::load_vocab(llama_model_loader & ml) {
  1938. const auto kv = LLM_KV(arch);
  1939. vocab.load(ml, kv);
  1940. }
  1941. bool llama_model::load_tensors(llama_model_loader & ml) {
  1942. const auto & split_mode = params.split_mode;
  1943. const auto & n_gpu_layers = params.n_gpu_layers;
  1944. const auto & use_mlock = params.use_mlock;
  1945. const auto & tensor_split = params.tensor_split;
  1946. const int n_layer = hparams.n_layer;
  1947. const bool use_mmap_buffer = true;
  1948. LLAMA_LOG_INFO("%s: loading model tensors, this can take a while... (mmap = %s)\n", __func__, ml.use_mmap ? "true" : "false");
  1949. // build a list of buffer types for the CPU and GPU devices
  1950. pimpl->cpu_buft_list = make_cpu_buft_list(devices, params.use_extra_bufts);
  1951. for (auto * dev : devices) {
  1952. buft_list_t buft_list = make_gpu_buft_list(dev, split_mode, tensor_split);
  1953. // add CPU buffer types as a fallback
  1954. buft_list.insert(buft_list.end(), pimpl->cpu_buft_list.begin(), pimpl->cpu_buft_list.end());
  1955. pimpl->gpu_buft_list.emplace(dev, std::move(buft_list));
  1956. }
  1957. // calculate the split points
  1958. bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + n_devices(), [](float x) { return x == 0.0f; });
  1959. std::vector<float> splits(n_devices());
  1960. if (all_zero) {
  1961. // default split, by free memory
  1962. for (size_t i = 0; i < n_devices(); ++i) {
  1963. ggml_backend_dev_t dev = devices[i];
  1964. size_t total;
  1965. size_t free;
  1966. ggml_backend_dev_memory(dev, &free, &total);
  1967. splits[i] = free;
  1968. }
  1969. } else {
  1970. std::copy(tensor_split, tensor_split + n_devices(), splits.begin());
  1971. }
  1972. // sum and normalize the splits to get the split points
  1973. float split_sum = 0.0f;
  1974. for (size_t i = 0; i < n_devices(); ++i) {
  1975. split_sum += splits[i];
  1976. splits[i] = split_sum;
  1977. }
  1978. for (size_t i = 0; i < n_devices(); ++i) {
  1979. splits[i] /= split_sum;
  1980. }
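// illustrative example (made-up numbers): two devices reporting 12 GiB and 4 GiB
// free produce splits {12, 4} -> prefix sums {12, 16} -> normalized split points
// {0.75, 1.0}, i.e. the first device takes roughly the first 75% of the
// offloaded layers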
  1981. ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
  1982. if (cpu_dev == nullptr) {
  1983. throw std::runtime_error(format("%s: no CPU backend found", __func__));
  1984. }
  1985. const int i_gpu_start = std::max((int) hparams.n_layer - n_gpu_layers, (int) 0);
  1986. const int act_gpu_layers = devices.empty() ? 0 : std::min(n_gpu_layers, (int)n_layer + 1);
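// layers [0, i_gpu_start) stay on the CPU; the remaining act_gpu_layers layers
// (n_gpu_layers, capped at n_layer + 1 so the output layer can also be offloaded)
// are distributed across the GPU devices by locating each layer's relative
// position within the normalized split points (see the lambda below)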
  1987. auto get_layer_buft_list = [&](int il) -> llama_model::impl::layer_dev {
  1988. const bool is_swa = il < (int) hparams.n_layer && hparams.is_swa(il);
  1989. if (il < i_gpu_start || (il - i_gpu_start) >= act_gpu_layers) {
  1990. LLAMA_LOG_DEBUG("load_tensors: layer %3d assigned to device %s, is_swa = %d\n", il, ggml_backend_dev_name(cpu_dev), is_swa);
  1991. return {cpu_dev, &pimpl->cpu_buft_list};
  1992. }
  1993. const int layer_gpu = std::upper_bound(splits.begin(), splits.begin() + n_devices(), float(il - i_gpu_start)/act_gpu_layers) - splits.begin();
  1994. auto * dev = devices.at(layer_gpu);
  1995. LLAMA_LOG_DEBUG("load_tensors: layer %3d assigned to device %s, is_swa = %d\n", il, ggml_backend_dev_name(dev), is_swa);
  1996. return {dev, &pimpl->gpu_buft_list.at(dev)};
  1997. };
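// the upper_bound call maps a layer's relative position in [0, 1) to the first device whose
// cumulative split point exceeds it, e.g. with split points {0.25, 1.0} a layer at
// position 0.10 lands on device 0 and one at position 0.60 lands on device 1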
  1998. // assign the input layer
  1999. // there is very little benefit to offloading the input layer, so always keep it on the CPU
  2000. pimpl->dev_input = { cpu_dev, &pimpl->cpu_buft_list };
  2001. // assign the repeating layers to the devices according to the splits
  2002. pimpl->dev_layer.resize(n_layer);
  2003. for (int il = 0; il < n_layer; ++il) {
  2004. pimpl->dev_layer[il] = get_layer_buft_list(il);
  2005. }
  2006. // assign the output layer
  2007. pimpl->dev_output = get_layer_buft_list(n_layer);
  2008. // one ggml context per buffer type
  2009. int max_n_tensors = ml.n_tensors;
  2010. max_n_tensors += 1; // duplicated output tensor
  2011. max_n_tensors += n_layer*2; // duplicated rope freq tensors
  2012. const size_t ctx_size = ggml_tensor_overhead()*max_n_tensors;
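// the contexts below are created with no_alloc = true, so ctx_size only has to cover the
// ggml tensor metadata (ggml_tensor_overhead() per tensor); the weight data itself is
// allocated later in backend buffers or read via mmap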
  2013. std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
  2014. auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * {
  2015. auto it = ctx_map.find(buft);
  2016. if (it == ctx_map.end()) {
  2017. ggml_init_params params = {
  2018. /*.mem_size =*/ ctx_size,
  2019. /*.mem_buffer =*/ NULL,
  2020. /*.no_alloc =*/ true,
  2021. };
  2022. ggml_context * ctx = ggml_init(params);
  2023. if (!ctx) {
  2024. throw std::runtime_error(format("failed to create ggml context"));
  2025. }
  2026. ctx_map[buft] = ctx;
  2027. pimpl->ctxs.emplace_back(ctx);
  2028. return ctx;
  2029. }
  2030. return it->second;
  2031. };
  2032. const auto TENSOR_DUPLICATED = llama_model_loader::TENSOR_DUPLICATED;
  2033. const auto TENSOR_NOT_REQUIRED = llama_model_loader::TENSOR_NOT_REQUIRED;
  2034. const auto TENSOR_SKIP = llama_model_loader::TENSOR_SKIP;
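// flag semantics (as used by create_tensor below): 0 = required, TENSOR_NOT_REQUIRED = return
// nullptr if the tensor is missing, TENSOR_DUPLICATED = the tensor aliases one that was already
// created (e.g. tok_embd reused as output), TENSOR_SKIP = account for the tensor but do not load it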
  2035. // create tensors for the weights
  2036. {
  2037. // note: cast to int64_t since we will use these for the tensor dimensions
  2038. const int64_t n_head = hparams.n_head();
  2039. const int64_t n_head_kv = hparams.n_head_kv();
  2040. const int64_t n_embd = hparams.n_embd;
  2041. const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
  2042. const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();
  2043. const int64_t n_embd_head_k = hparams.n_embd_head_k;
  2044. const int64_t n_embd_head_v = hparams.n_embd_head_v;
  2045. const int64_t n_ff = hparams.n_ff();
  2046. const int64_t n_embd_gqa = n_embd_v_gqa;
  2047. const int64_t n_vocab = vocab.n_tokens();
  2048. const int64_t n_token_types = vocab.n_token_types();
  2049. const int64_t n_rot = hparams.n_rot;
  2050. const int64_t n_expert = hparams.n_expert;
  2051. const int64_t n_expert_used = hparams.n_expert_used;
  2052. const int64_t n_ctx_train = hparams.n_ctx_train;
  2053. if (n_expert > 0 && hparams.n_expert_used == 0) {
2054. throw std::runtime_error("model has experts (n_expert > 0) but no experts are used (n_expert_used == 0)");
  2055. }
  2056. int n_moved_tensors = 0;
  2057. ggml_tensor * first_moved_tensor = nullptr;
  2058. ggml_backend_buffer_type_t first_moved_from_buft = nullptr;
  2059. ggml_backend_buffer_type_t first_moved_to_buft = nullptr;
  2060. auto create_tensor = [&](const LLM_TN_IMPL & tn, const std::initializer_list<int64_t> & ne, int flags) -> ggml_tensor * {
  2061. ggml_tensor * t_meta = ml.get_tensor_meta(tn.str().c_str());
  2062. if (!t_meta) {
  2063. if (flags & TENSOR_NOT_REQUIRED) {
  2064. return nullptr;
  2065. }
  2066. throw std::runtime_error(format("missing tensor '%s'", tn.str().c_str()));
  2067. }
2068. // some models reuse the token embedding tensor as the output; since the two uses occur in different layers and with different ops,
2069. // the tensor has to be duplicated
2070. // so when the TENSOR_DUPLICATED flag is set for TOKEN_EMBD, assume it is being loaded as the output tensor
  2071. llm_tensor tn_tensor = tn.tensor;
  2072. if (tn.tensor == LLM_TENSOR_TOKEN_EMBD && flags & TENSOR_DUPLICATED) {
  2073. tn_tensor = LLM_TENSOR_OUTPUT;
  2074. }
  2075. llm_tensor_info info;
  2076. try {
  2077. info = llm_tensor_info_for(tn_tensor);
  2078. } catch (const std::out_of_range & e) {
  2079. throw std::runtime_error(format("missing tensor info mapping for %s", tn.str().c_str()));
  2080. }
  2081. // skip unused tensors
  2082. if (info.op == GGML_OP_NONE || flags & TENSOR_SKIP) {
  2083. const size_t nbytes = ggml_nbytes(t_meta);
  2084. LLAMA_LOG_WARN("model has unused tensor %s (size = %zu bytes) -- ignoring\n", tn.str().c_str(), nbytes);
  2085. ml.size_data -= nbytes;
  2086. ml.n_created++;
  2087. return nullptr;
  2088. }
2089. // tensors with a "bias" suffix are always used with GGML_OP_ADD or GGML_OP_ADD_ID
  2090. ggml_op op;
  2091. bool bias = tn.suffix != nullptr && strcmp(tn.suffix, "bias") == 0;
  2092. if (bias) {
  2093. if (info.op == GGML_OP_MUL_MAT_ID) {
  2094. op = GGML_OP_ADD_ID;
  2095. } else {
  2096. op = GGML_OP_ADD;
  2097. }
  2098. } else {
  2099. op = info.op;
  2100. }
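// e.g. the bias of an expert FFN (whose weight runs through GGML_OP_MUL_MAT_ID) is validated
// against GGML_OP_ADD_ID, any other bias against GGML_OP_ADD; this op is what the buffer type
// selection below checks device support for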
  2101. // sanity checks
  2102. if (info.layer == LLM_TENSOR_LAYER_INPUT || info.layer == LLM_TENSOR_LAYER_OUTPUT) {
  2103. if (tn.bid != -1) {
  2104. GGML_ABORT("input/output layer tensor %s used with a layer number", tn.str().c_str());
  2105. }
  2106. } else {
  2107. if (tn.bid == -1) {
  2108. GGML_ABORT("repeating layer tensor %s used without a layer number", tn.str().c_str());
  2109. }
  2110. }
  2111. // select the buffer type for this tensor
  2112. buft_list_t * buft_list;
  2113. switch (info.layer) {
  2114. case LLM_TENSOR_LAYER_INPUT:
  2115. buft_list = pimpl->dev_input.buft_list;
  2116. break;
  2117. case LLM_TENSOR_LAYER_OUTPUT:
  2118. buft_list = pimpl->dev_output.buft_list;
  2119. break;
  2120. case LLM_TENSOR_LAYER_REPEATING:
  2121. buft_list = pimpl->dev_layer.at(tn.bid).buft_list;
  2122. break;
  2123. default:
  2124. GGML_ABORT("invalid layer %d for tensor %s", info.layer, tn.str().c_str());
  2125. }
  2126. ggml_backend_buffer_type_t buft = nullptr;
  2127. // check overrides
  2128. if (ml.tensor_buft_overrides) {
  2129. std::string tensor_name = tn.str();
  2130. for (const auto * overrides = ml.tensor_buft_overrides; overrides->pattern != nullptr; ++overrides) {
  2131. std::regex pattern(overrides->pattern);
  2132. if (std::regex_search(tensor_name, pattern)) {
  2133. if (overrides->buft == ggml_backend_cpu_buffer_type()) {
  2134. // when overriding to a CPU buffer, consider the extra buffer types
  2135. buft = select_weight_buft(hparams, t_meta, op, pimpl->cpu_buft_list);
  2136. } else {
  2137. buft = overrides->buft;
  2138. }
  2139. LLAMA_LOG_DEBUG("tensor %s (%zu MiB %s) buffer type overridden to %s\n",
  2140. tensor_name.c_str(),
  2141. ggml_nbytes(t_meta) / 1024 / 1024, ggml_type_name(t_meta->type),
  2142. ggml_backend_buft_name(buft));
  2143. break;
  2144. }
  2145. }
  2146. }
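// override patterns are std::regex expressions matched against the full tensor name with
// regex_search, e.g. a pattern such as "ffn_.*_exps" would redirect all stacked expert
// weights; the first matching override wins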
  2147. if (!buft) {
  2148. buft = select_weight_buft(hparams, t_meta, op, *buft_list);
  2149. if (!buft) {
  2150. throw std::runtime_error(format("failed to find a compatible buffer type for tensor %s", tn.str().c_str()));
  2151. }
  2152. }
  2153. // avoid using a host buffer when using mmap
  2154. auto * buft_dev = ggml_backend_buft_get_device(buft);
  2155. if (ml.use_mmap && buft_dev && buft == ggml_backend_dev_host_buffer_type(buft_dev)) {
  2156. auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
  2157. if (!cpu_dev) {
  2158. throw std::runtime_error("no CPU backend found");
  2159. }
  2160. buft = ggml_backend_dev_buffer_type(cpu_dev);
  2161. }
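// rationale: host (pinned) buffers exist to speed up copies to the device, but with mmap the
// weights are read in place from the mapped file, so a host buffer would presumably only add
// an extra copy; fall back to the plain CPU buffer type instead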
  2162. if (buft != buft_list->front().second) {
  2163. n_moved_tensors++;
  2164. if (!first_moved_tensor) {
  2165. first_moved_tensor = t_meta;
  2166. first_moved_from_buft = buft_list->front().second;
  2167. first_moved_to_buft = buft;
  2168. }
  2169. }
  2170. ggml_context * ctx = ctx_for_buft(buft);
  2171. // if duplicated, check if the original tensor was allocated in the same buffer type context and avoid creating a new one
  2172. if (flags & TENSOR_DUPLICATED) {
  2173. ggml_tensor * t = ggml_get_tensor(ctx, tn.str().c_str());
  2174. if (t) {
  2175. return t;
  2176. }
  2177. }
  2178. return ml.create_tensor(ctx, tn, ne, flags);
  2179. };
  2180. layers.resize(n_layer);
  2181. // TODO: move to a separate function
  2182. const auto tn = LLM_TN(arch);
  2183. switch (arch) {
  2184. case LLM_ARCH_LLAMA:
  2185. case LLM_ARCH_REFACT:
  2186. case LLM_ARCH_MINICPM:
  2187. case LLM_ARCH_GRANITE:
  2188. case LLM_ARCH_GRANITE_MOE:
  2189. {
  2190. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2191. // output
  2192. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2193. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  2194. // if output is NULL, init from the input tok embed
  2195. if (output == NULL) {
  2196. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  2197. }
  2198. for (int i = 0; i < n_layer; ++i) {
  2199. auto & layer = layers[i];
  2200. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2201. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
  2202. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  2203. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
  2204. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
  2205. // optional bias tensors
  2206. layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2207. layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
  2208. layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
  2209. layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2210. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  2211. if (hparams.rope_scaling_type_train == LLAMA_ROPE_SCALING_TYPE_LONGROPE) {
  2212. layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
  2213. layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
  2214. }
  2215. else {
  2216. layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
  2217. }
  2218. if (n_expert == 0) {
  2219. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  2220. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  2221. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  2222. // optional MLP bias
  2223. layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
  2224. layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2225. layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
  2226. } else {
  2227. layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
  2228. layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, TENSOR_NOT_REQUIRED);
  2229. layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert}, 0);
  2230. layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
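// expert weights are stored stacked as 3-D tensors ({n_in, n_out, n_expert}) and consumed by
// GGML_OP_MUL_MAT_ID, which selects the expert slice per token at graph execution time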
  2231. // For Granite MoE Shared
  2232. if (hparams.n_ff_shexp > 0) {
  2233. layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, 0);
  2234. layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, 0);
  2235. layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {hparams.n_ff_shexp, n_embd}, 0);
  2236. }
  2237. }
  2238. }
  2239. } break;
  2240. case LLM_ARCH_QWEN3NEXT:
  2241. {
  2242. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
  2243. // output
  2244. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
  2245. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED);
  2246. // if output is NULL, init from the input tok embed
  2247. if (output == NULL) {
  2248. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, TENSOR_DUPLICATED);
  2249. }
  2250. const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
  2251. // Calculate dimensions from hyperparameters
  2252. const int64_t head_k_dim = hparams.ssm_d_state;
  2253. const int64_t head_v_dim = hparams.ssm_d_state;
  2254. const int64_t n_k_heads = hparams.ssm_n_group;
  2255. const int64_t n_v_heads = hparams.ssm_dt_rank;
  2256. const int64_t key_dim = head_k_dim * n_k_heads;
  2257. const int64_t value_dim = head_v_dim * n_v_heads;
  2258. const int64_t conv_dim = key_dim * 2 + value_dim;
  2259. // Calculate projection sizes
  2260. const int64_t qkvz_projection_size = key_dim * 2 + value_dim * 2;
  2261. const int64_t ba_projection_size = n_v_heads * 2;
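// assumed decomposition of the fused projections (gated delta net): qkvz packs q, k, v and the
// z gate (2*key_dim + 2*value_dim), the causal conv runs over the concatenated q/k/v streams
// (2*key_dim + value_dim) and ba packs the per-head beta/alpha scalars (2*n_v_heads)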
  2262. for (int i = 0; i < n_layer; ++i) {
  2263. auto & layer = layers[i];
  2264. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);
  2265. layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), { n_embd }, 0);
  2266. if (!hparams.is_recurrent(i)) {
  2267. // Attention layers
  2268. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd_head_k * n_head * 2 }, 0);
  2269. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_k_gqa }, 0);
  2270. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_v_gqa }, 0);
  2271. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, 0);
  2272. // Q/K normalization for attention layers
  2273. layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), { n_embd_head_k }, 0);
  2274. layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), { n_embd_head_k }, 0);
  2275. } else {
2276. // Linear attention (gated delta net) specific tensors,
2277. // created with the head/projection dimensions calculated above
  2278. layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), { n_embd, qkvz_projection_size }, 0);
  2279. layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), { hparams.ssm_d_conv, conv_dim }, 0);
  2280. layer.ssm_dt = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), { hparams.ssm_dt_rank }, 0);
  2281. layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), { hparams.ssm_dt_rank }, 0);
  2282. layer.ssm_beta_alpha = create_tensor(tn(LLM_TENSOR_SSM_BETA_ALPHA, "weight", i), { n_embd, ba_projection_size }, 0);
  2283. layer.ssm_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", i), { head_v_dim }, 0);
  2284. layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), { value_dim, n_embd }, 0);
  2285. }
  2286. layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), { n_embd, n_expert }, 0);
  2287. layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, 0);
  2288. layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff_exp, n_embd, n_expert }, 0);
  2289. layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, 0);
  2290. // Shared experts
  2291. layer.ffn_gate_inp_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP_SHEXP, "weight", i), { n_embd }, 0);
  2292. layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), { n_embd, hparams.n_ff_shexp }, 0);
  2293. layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), { n_embd, hparams.n_ff_shexp }, 0);
  2294. layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { hparams.n_ff_shexp, n_embd }, 0);
  2295. }
2296. } break;
  2298. case LLM_ARCH_LLADA:
  2299. {
  2300. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
  2301. // output
  2302. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
  2303. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED);
  2304. // if output is NULL, init from the input tok embed
  2305. if (output == NULL) {
  2306. output =
  2307. create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, TENSOR_DUPLICATED);
  2308. }
  2309. for (int i = 0; i < n_layer; ++i) {
  2310. auto & layer = layers[i];
  2311. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);
  2312. // Use separate Q, K, V projections without bias, matching LLaDALlamaBlock
  2313. layer.wq =
  2314. create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd_head_k * n_head }, 0);
  2315. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_k_gqa }, 0);
  2316. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_v_gqa }, 0);
  2317. // No bias for QKV projections as per config: include_bias=false, include_qkv_bias=false
  2318. layer.wo =
  2319. create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, 0);
  2320. layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), { n_embd }, TENSOR_NOT_REQUIRED);
  2321. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd }, 0);
  2322. layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), { n_rot / 2 },
  2323. TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
  2324. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), { n_embd, n_ff }, 0);
  2325. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd }, 0);
  2326. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, n_ff }, 0);
  2327. // optional MLP bias
  2328. layer.ffn_gate_b =
  2329. create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), { n_ff }, TENSOR_NOT_REQUIRED);
  2330. layer.ffn_down_b =
  2331. create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), { n_embd }, TENSOR_NOT_REQUIRED);
  2332. layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), { n_ff }, TENSOR_NOT_REQUIRED);
  2333. }
2334. } break;
  2336. case LLM_ARCH_LLADA_MOE:
  2337. {
  2338. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2339. // output
  2340. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2341. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  2342. GGML_ASSERT(n_expert > 0 && "n_expert must be > 0 for llada-moe");
  2343. GGML_ASSERT(n_expert_used > 0 && "n_expert_used must be > 0 for llada-moe");
  2344. for (int i = 0; i < n_layer; ++i) {
  2345. auto & layer = layers[i];
  2346. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2347. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  2348. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  2349. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  2350. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  2351. layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
  2352. layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
  2353. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  2354. layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
  2355. const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
  2356. layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
  2357. layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
  2358. layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
  2359. }
  2360. } break;
  2361. case LLM_ARCH_LLAMA4:
  2362. {
  2363. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2364. // output
  2365. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2366. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  2367. // if output is NULL, init from the input tok embed
  2368. if (output == NULL) {
  2369. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  2370. }
  2371. for (int i = 0; i < n_layer; ++i) {
  2372. bool is_moe_layer = hparams.n_moe_layer_step > 0 && (i + 1) % hparams.n_moe_layer_step == 0;
  2373. auto & layer = layers[i];
  2374. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2375. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
  2376. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  2377. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
  2378. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
  2379. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  2380. layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
  2381. if (is_moe_layer) {
  2382. int n_ff_exp = hparams.n_ff_exp;
  2383. layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
  2384. layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, 0);
  2385. layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff_exp, n_embd, n_expert}, 0);
  2386. layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, 0);
  2387. // Shared expert
  2388. const int64_t n_ff_shexp = n_ff_exp;
  2389. layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), { n_embd, n_ff_shexp}, 0);
  2390. layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp, n_embd }, 0);
  2391. layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), { n_embd, n_ff_shexp}, 0);
  2392. } else {
  2393. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  2394. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  2395. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  2396. }
  2397. }
  2398. } break;
  2399. case LLM_ARCH_DECI:
  2400. {
  2401. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2402. // output
  2403. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2404. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  2405. // if output is NULL, init from the input tok embed
  2406. if (output == NULL) {
  2407. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  2408. }
  2409. for (int i = 0; i < n_layer; ++i) {
  2410. auto & layer = layers[i];
  2411. const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(i);
  2412. const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(i);
  2413. const int64_t n_embd_gqa = hparams.n_embd_v_gqa(i);
  2414. const int64_t n_ff = hparams.n_ff(i);
  2415. const int64_t n_head = hparams.n_head(i);
  2416. const int64_t n_head_kv = hparams.n_head_kv(i);
  2417. if (n_head_kv == 0 && n_head > 0) {
  2418. // linear attention for DeciLMCausalModel
  2419. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2420. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  2421. }
  2422. else if (n_head_kv > 0) {
  2423. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2424. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
  2425. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  2426. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
  2427. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
  2428. }
  2429. // optional bias tensors
  2430. layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2431. layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
  2432. layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
  2433. layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2434. if (n_ff > 0) {
  2435. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  2436. }
  2437. if (hparams.rope_scaling_type_train == LLAMA_ROPE_SCALING_TYPE_LONGROPE) {
  2438. layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
  2439. layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
  2440. }
  2441. else {
  2442. layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
  2443. }
  2444. if (n_ff > 0) {
  2445. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  2446. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  2447. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  2448. }
  2449. // optional MLP bias
  2450. layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
  2451. layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2452. layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
  2453. }
  2454. } break;
  2455. case LLM_ARCH_MINICPM3:
  2456. {
  2457. const int64_t n_embd_head_qk_rope = hparams.n_rot;
  2458. const int64_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
  2459. const int64_t q_lora_rank = hparams.n_lora_q;
  2460. const int64_t kv_lora_rank = hparams.n_lora_kv;
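// MiniCPM3 uses a DeepSeek-style multi-head latent attention layout: Q and KV pass through
// low-rank bottlenecks (q_lora_rank / kv_lora_rank) and each head dimension is split into a
// RoPE part (n_rot) and a non-RoPE part, as reflected in the tensor shapes below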
  2461. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2462. // output
  2463. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2464. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  2465. // if output is NULL, init from the input tok embed
  2466. if (output == NULL) {
  2467. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  2468. }
  2469. for (int i = 0; i < n_layer; ++i) {
  2470. auto & layer = layers[i];
  2471. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2472. layer.attn_q_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_A_NORM, "weight", i), {q_lora_rank}, 0);
  2473. layer.attn_kv_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_NORM, "weight", i), {kv_lora_rank}, 0);
  2474. layer.wq_a = create_tensor(tn(LLM_TENSOR_ATTN_Q_A, "weight", i), {n_embd, q_lora_rank}, 0);
  2475. layer.wq_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_B, "weight", i), {q_lora_rank, n_head * n_embd_head_k}, 0);
  2476. layer.wkv_a_mqa = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + (n_embd_head_qk_rope)}, 0);
  2477. layer.wkv_b = create_tensor(tn(LLM_TENSOR_ATTN_KV_B, "weight", i), {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)}, 0);
  2478. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_head * ( n_embd_head_v), n_embd}, 0);
  2479. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  2480. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  2481. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  2482. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  2483. layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), { n_embd_head_qk_rope/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
  2484. layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), { n_embd_head_qk_rope/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
  2485. }
  2486. } break;
  2487. case LLM_ARCH_GROK:
  2488. {
  2489. if (n_expert == 0) {
  2490. throw std::runtime_error("Grok model cannot have zero experts");
  2491. }
  2492. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2493. // output
  2494. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2495. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  2496. // if output is NULL, init from the input tok embed
  2497. if (output == NULL) {
  2498. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  2499. }
2500. const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff; // grok-1 uses n_ff_exp == n_ff (not the usual n_ff / n_expert_used split)
  2501. for (int i = 0; i < n_layer; ++i) {
  2502. auto & layer = layers[i];
  2503. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2504. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  2505. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  2506. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  2507. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  2508. layer.attn_out_norm = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}, 0);
  2509. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  2510. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED);
  2511. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, TENSOR_NOT_REQUIRED);
  2512. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED);
  2513. layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
  2514. layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, TENSOR_NOT_REQUIRED);
  2515. layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
  2516. layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, 0);
  2517. layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2518. if (!layer.ffn_post_norm) {
  2519. layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
  2520. }
  2521. }
  2522. } break;
  2523. case LLM_ARCH_DBRX:
  2524. {
  2525. if (n_expert == 0) {
  2526. throw std::runtime_error("DBRX model cannot have zero experts");
  2527. }
  2528. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2529. // output
  2530. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2531. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  2532. for (int i = 0; i < n_layer; ++i) {
  2533. auto & layer = layers[i];
  2534. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2535. layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
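// fused QKV layout: the output columns pack Q (n_embd) followed by K and V (n_embd_gqa each),
// hence the n_embd + 2*n_embd_gqa width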
  2536. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  2537. layer.attn_out_norm = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}, 0);
  2538. layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
  2539. layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
  2540. layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff, n_embd, n_expert}, 0);
  2541. layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
  2542. }
  2543. } break;
  2544. case LLM_ARCH_BAICHUAN:
  2545. {
  2546. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2547. {
  2548. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2549. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  2550. }
  2551. for (int i = 0; i < n_layer; ++i) {
  2552. auto & layer = layers[i];
  2553. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2554. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  2555. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  2556. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  2557. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  2558. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  2559. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  2560. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  2561. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  2562. }
  2563. } break;
  2564. case LLM_ARCH_FALCON:
  2565. {
  2566. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2567. // output
  2568. {
  2569. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2570. output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
  2571. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  2572. if (!output) {
  2573. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); // needs to be on GPU
  2574. }
  2575. }
  2576. for (int i = 0; i < n_layer; ++i) {
  2577. auto & layer = layers[i];
  2578. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2579. layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
  2580. layer.attn_norm_2 = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2581. layer.attn_norm_2_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2582. layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
  2583. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  2584. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  2585. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  2586. }
  2587. } break;
  2588. case LLM_ARCH_STARCODER:
  2589. {
  2590. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2591. pos_embd = create_tensor(tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, n_ctx_train}, 0);
  2592. // output
  2593. {
  2594. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2595. output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
  2596. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  2597. if (!output) {
  2598. // needs to be on GPU
  2599. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  2600. }
  2601. }
  2602. for (int i = 0; i < n_layer; ++i) {
  2603. auto & layer = layers[i];
  2604. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2605. layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
  2606. layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
  2607. layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
  2608. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  2609. layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
  2610. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  2611. layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
  2612. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
  2613. layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
  2614. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  2615. layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
  2616. }
  2617. } break;
  2618. case LLM_ARCH_BERT:
  2619. case LLM_ARCH_NOMIC_BERT:
  2620. case LLM_ARCH_NOMIC_BERT_MOE:
  2621. case LLM_ARCH_JINA_BERT_V3:
  2622. {
  2623. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2624. type_embd = create_tensor(tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_token_types}, TENSOR_NOT_REQUIRED);
  2625. if (arch == LLM_ARCH_BERT) {
  2626. pos_embd = create_tensor(tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, n_ctx_train}, 0);
  2627. cls = create_tensor(tn(LLM_TENSOR_CLS, "weight"), {n_embd, n_embd}, TENSOR_NOT_REQUIRED);
  2628. cls_b = create_tensor(tn(LLM_TENSOR_CLS, "bias"), {n_embd}, TENSOR_NOT_REQUIRED);
  2629. cls_out = create_tensor(tn(LLM_TENSOR_CLS_OUT, "weight"), {n_embd, hparams.n_cls_out}, TENSOR_NOT_REQUIRED);
  2630. cls_out_b = create_tensor(tn(LLM_TENSOR_CLS_OUT, "bias"), {hparams.n_cls_out}, TENSOR_NOT_REQUIRED);
  2631. }
  2632. tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0);
  2633. tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}, 0);
  2634. for (int i = 0; i < n_layer; ++i) {
  2635. auto & layer = layers[i];
  2636. layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
  2637. layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
  2638. if (!layer.wqkv) {
  2639. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  2640. layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, 0);
  2641. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  2642. layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, 0);
  2643. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  2644. layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, 0);
  2645. }
  2646. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  2647. layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2648. layer.attn_out_norm = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}, 0);
  2649. layer.attn_out_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "bias", i), {n_embd}, 0);
  2650. if (hparams.moe_every_n_layers > 0 && i % hparams.moe_every_n_layers == 1) {
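// e.g. with moe_every_n_layers == 2 the odd-numbered layers (1, 3, 5, ...) take this MoE branch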
  2651. layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff, n_expert}, 0);
  2652. layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert}, 0);
  2653. layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
  2654. } else {
  2655. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  2656. layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
  2657. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
  2658. layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2659. if (arch == LLM_ARCH_NOMIC_BERT) {
  2660. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  2661. }
  2662. }
  2663. layer.layer_out_norm = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd}, 0);
  2664. layer.layer_out_norm_b = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "bias", i), {n_embd}, 0);
  2665. }
  2666. } break;
  2667. case LLM_ARCH_NEO_BERT:
  2668. {
  2669. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2670. cls = create_tensor(tn(LLM_TENSOR_CLS, "weight"), {n_embd, n_embd}, TENSOR_NOT_REQUIRED);
  2671. cls_b = create_tensor(tn(LLM_TENSOR_CLS, "bias"), {n_embd}, TENSOR_NOT_REQUIRED);
  2672. cls_out = create_tensor(tn(LLM_TENSOR_CLS_OUT, "weight"), {n_embd, hparams.n_cls_out}, TENSOR_NOT_REQUIRED);
  2673. cls_out_b = create_tensor(tn(LLM_TENSOR_CLS_OUT, "bias"), {hparams.n_cls_out}, TENSOR_NOT_REQUIRED);
  2674. output_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2675. for (int i = 0; i < n_layer; ++i) {
  2676. auto & layer = layers[i];
  2677. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2678. layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
  2679. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  2680. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  2681. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff*2}, 0);
  2682. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
  2683. }
  2684. } break;
  2685. case LLM_ARCH_JINA_BERT_V2:
  2686. {
  2687. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); // word_embeddings
  2688. type_embd = create_tensor(tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_token_types}, 0); // token_type_embeddings
  2689. tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0); // LayerNorm
2690. tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}, 0); // LayerNorm bias
  2691. cls = create_tensor(tn(LLM_TENSOR_CLS, "weight"), {n_embd, 1}, TENSOR_NOT_REQUIRED);
  2692. cls_b = create_tensor(tn(LLM_TENSOR_CLS, "bias"), {1}, TENSOR_NOT_REQUIRED);
  2693. for (int i = 0; i < n_layer; ++i) {
  2694. auto & layer = layers[i]; // JinaBertLayer
  2695. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  2696. layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, 0);
  2697. layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2698. layer.attn_q_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2699. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  2700. layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, 0);
  2701. layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2702. layer.attn_k_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2703. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  2704. layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, 0);
2705. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0); // output_dense
2706. layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0); // output_dense
2707. layer.attn_out_norm = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}, 0); // output_norm
  2708. layer.attn_out_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "bias", i), {n_embd}, 0);
  2709. layer.attn_norm_2 = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2710. layer.attn_norm_2_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2711. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED);
  2712. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, layer.ffn_gate ? n_ff : n_ff * 2}, 0);
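// with no separate gate tensor the up projection is twice as wide, presumably holding the gate
// and up halves fused in a single matrix that is split when the graph is built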
  2713. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
  2714. layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
  2715. layer.layer_out_norm = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd}, 0);
  2716. layer.layer_out_norm_b = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "bias", i), {n_embd}, 0);
  2717. }
  2718. } break;
  2719. case LLM_ARCH_BLOOM:
  2720. {
  2721. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2722. tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0);
  2723. tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}, 0);
  2724. // output
  2725. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2726. output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
  2727. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  2728. // if output is NULL, init from the input tok embed
  2729. if (output == NULL) {
  2730. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  2731. }
  2732. for (int i = 0; i < n_layer; ++i) {
  2733. auto & layer = layers[i];
  2734. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2735. layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
  2736. layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
  2737. layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
  2738. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  2739. layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
  2740. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  2741. layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
  2742. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
  2743. layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
  2744. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  2745. layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
  2746. }
  2747. } break;
  2748. case LLM_ARCH_MPT:
  2749. {
  2750. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2751. pos_embd = create_tensor(tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, n_ctx_train}, TENSOR_NOT_REQUIRED);
  2752. // output
  2753. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2754. output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, TENSOR_NOT_REQUIRED);
  2755. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  2756. if (!output) {
  2757. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); // needs to be on GPU
  2758. }
  2759. for (int i = 0; i < n_layer; ++i) {
  2760. auto & layer = layers[i];
  2761. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2762. layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2763. layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
  2764. layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
  2765. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  2766. layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2767. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  2768. layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2769. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
  2770. layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2771. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  2772. layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
  2773. layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2774. layer.attn_q_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2775. layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2776. layer.attn_k_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2777. // AWQ ScaleActivation layer
  2778. layer.ffn_act = create_tensor(tn(LLM_TENSOR_FFN_ACT, "scales", i), {n_ff}, TENSOR_NOT_REQUIRED);
  2779. }
  2780. } break;
  2781. case LLM_ARCH_STABLELM:
  2782. {
  2783. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2784. // output
  2785. output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
  2786. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2787. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  2788. for (int i = 0; i < n_layer; ++i) {
  2789. auto & layer = layers[i];
  2790. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2791. layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
  2792. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  2793. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  2794. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  2795. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  2796. // optional bias tensors, present in Stable LM 2 1.6B
  2797. layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2798. layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
  2799. layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
  2800. // optional q and k layernorms, present in StableLM 2 12B
  2801. layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k, n_head}, TENSOR_NOT_REQUIRED);
  2802. layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k, n_head_kv}, TENSOR_NOT_REQUIRED);
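// note the 2-D shape: these q/k norms carry a separate scale per head, unlike the single
// per-head-dim vectors used by other architectures in this file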
  2803. // optional FFN norm, not present in StableLM 2 12B which uses parallel residual
  2804. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2805. layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2806. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  2807. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  2808. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  2809. }
  2810. } break;
  2811. case LLM_ARCH_QWEN:
  2812. {
  2813. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2814. // output
  2815. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2816. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  2817. for (int i = 0; i < n_layer; ++i) {
  2818. auto & layer = layers[i];
  2819. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2820. layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd*3}, 0);
  2821. layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd*3}, 0);
  2822. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  2823. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  2824. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff/2}, 0);
  2825. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff/2, n_embd}, 0);
  2826. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff/2}, 0);
  2827. }
  2828. } break;
  2829. case LLM_ARCH_QWEN2:
  2830. case LLM_ARCH_QWEN2VL:
  2831. case LLM_ARCH_DREAM:
  2832. {
  2833. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2834. // output
  2835. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2836. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  2837. output_b = create_tensor(tn(LLM_TENSOR_OUTPUT, "bias"), {n_vocab}, TENSOR_NOT_REQUIRED);
  2838. // if output is NULL, init from the input tok embed
  2839. if (output == NULL) {
  2840. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  2841. }
  2842. for (int i = 0; i < n_layer; ++i) {
  2843. auto & layer = layers[i];
  2844. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2845. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  2846. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  2847. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  2848. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
2849. // QKV bias tensors (required for these architectures)
  2850. layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, 0);
  2851. layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, 0);
  2852. layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, 0);
  2853. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  2854. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  2855. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  2856. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  2857. }
  2858. } break;
case LLM_ARCH_QWEN2MOE:
    {
        tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
        // output
        output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
        output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
        for (int i = 0; i < n_layer; ++i) {
            auto & layer = layers[i];
            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
            layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
            layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
            // optional bias tensors
            layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
            layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
            layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
            layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
            layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
            if (n_expert == 0) {
                throw std::runtime_error("n_expert must be > 0 for QWEN2MOE");
            }
            if (n_expert_used == 0) {
                throw std::runtime_error("n_expert_used must be > 0 for QWEN2MOE");
            }
            // MoE branch
            const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
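            // NOTE (editorial): expert weights are packed into one 3D tensor per projection with
            // n_expert as the last dimension. When the GGUF does not record an explicit per-expert FFN
            // size, n_ff_exp falls back to n_ff / n_expert_used; e.g. with hypothetical values
            // n_ff = 5632 and n_expert_used = 4 this gives n_ff_exp = 1408.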
            layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
            layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
            layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
            // Shared expert branch
            const int64_t n_ff_shexp = hparams.n_ff_shexp ? hparams.n_ff_shexp : n_ff;
            layer.ffn_gate_inp_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP_SHEXP, "weight", i), {n_embd}, 0);
            layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), { n_embd, n_ff_shexp}, 0);
            layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp, n_embd}, 0);
            layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), { n_embd, n_ff_shexp}, 0);
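            // NOTE (editorial): the shared-expert branch is a dense FFN evaluated alongside the routed
            // experts; ffn_gate_inp_shexp is only {n_embd}, i.e. it yields a single per-token gating
            // scalar for the shared expert rather than a distribution over experts.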
        }
    } break;
case LLM_ARCH_QWEN3:
    {
        tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
        // output
        output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
        output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
        // if output is NULL, init from the input tok embed
        if (output == NULL) {
            output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
        }
        for (int i = 0; i < n_layer; ++i) {
            auto & layer = layers[i];
            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
            layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
            layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
            layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
            layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
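            // NOTE (editorial): attn_q_norm / attn_k_norm are {n_embd_head_k}, i.e. a norm weight of
            // head-dimension size applied to each attention head's Q/K vectors (QK-norm), rather than
            // a norm over the full projection width.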
            layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
            layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
        }
    } break;
case LLM_ARCH_QWEN3MOE:
    {
        tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
        // output
        output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
        output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
        // if output is NULL, init from the input tok embed
        if (output == NULL) {
            output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
        }
        for (int i = 0; i < n_layer; ++i) {
            auto & layer = layers[i];
            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
            layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
            layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
            layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
            layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
            layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
            layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
            if (n_expert == 0) {
                throw std::runtime_error("n_expert must be > 0 for QWEN3MOE");
            }
            if (n_expert_used == 0) {
                throw std::runtime_error("n_expert_used must be > 0 for QWEN3MOE");
            }
            // MoE branch
            const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
            layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
            layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
            layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
        }
    } break;
case LLM_ARCH_PHI2:
    {
        tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
        // output
        output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
        output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
        output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
        output_b = create_tensor(tn(LLM_TENSOR_OUTPUT, "bias"), {n_vocab}, 0);
        for (int i = 0; i < n_layer; ++i) {
            auto & layer = layers[i];
            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
            layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
            layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
            layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
            if (layer.wqkv == nullptr) {
                layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
                layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, 0);
                layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
                layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, 0);
                layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
                layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, 0);
            }
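            // NOTE (editorial): the fused QKV projection packs Q, K and V along its output dimension
            // (n_embd + 2*n_embd_gqa). When a conversion ships separate q/k/v tensors instead, wqkv
            // stays null and the split projections above are loaded as required tensors.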
            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
            layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
            layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
            layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
            layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
        }
    } break;
case LLM_ARCH_PHI3:
    {
        tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
        // output
        output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
        output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
        // if output is NULL, init from the input tok embed
        if (output == NULL) {
            output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
        }
        for (int i = 0; i < n_layer; ++i) {
            auto & layer = layers[i];
            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);
            layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), { n_embd, n_embd + 2 * n_embd_gqa }, TENSOR_NOT_REQUIRED);
            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd, n_embd }, 0);
            layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd }, 0);
            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd }, 0);
            layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, 2 * n_ff }, 0);
            layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), { n_rot/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
            layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), { n_rot/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
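            // NOTE (editorial): the long/short rope-factor tensors are shared across layers, so layers
            // beyond the first request them with TENSOR_DUPLICATED on top of TENSOR_NOT_REQUIRED --
            // presumably because they map to a single tensor in the GGUF rather than one per layer.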
        }
    } break;
  3008. case LLM_ARCH_PHIMOE:
  3009. {
  3010. const int64_t n_embd_head = n_embd / n_head;
  3011. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
  3012. // output
  3013. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
  3014. output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
  3015. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab }, 0);
  3016. output_b = create_tensor(tn(LLM_TENSOR_OUTPUT, "bias"), { n_vocab }, 0);
  3017. for (int i = 0; i < n_layer; ++i) {
  3018. auto & layer = layers[i];
  3019. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);
  3020. layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), { n_embd }, 0);
  3021. layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), { n_embd, n_embd + 2 * n_embd_gqa }, TENSOR_NOT_REQUIRED);
  3022. if (layer.wqkv == nullptr) {
  3023. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  3024. layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, 0);
  3025. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  3026. layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, 0);
  3027. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  3028. layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, 0);
  3029. }
  3030. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd, n_embd }, 0);
  3031. layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), { n_embd }, 0);
  3032. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd }, 0);
  3033. layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), { n_embd }, 0);
  3034. layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
  3035. layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
  3036. layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff, n_embd, n_expert}, 0);
  3037. layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
  3038. layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), { n_embd_head/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
  3039. layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), { n_embd_head/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
  3040. }
  3041. } break;
  3042. case LLM_ARCH_PLAMO:
  3043. {
  3044. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  3045. // output
  3046. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  3047. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  3048. for (int i = 0; i < n_layer; ++i) {
  3049. auto & layer = layers[i];
  3050. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  3051. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  3052. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  3053. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  3054. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  3055. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  3056. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  3057. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  3058. }
  3059. } break;
  3060. case LLM_ARCH_PLAMO2:
  3061. {
  3062. const uint32_t d_conv = hparams.ssm_d_conv;
  3063. const uint32_t d_state = hparams.ssm_d_state;
  3064. const uint32_t num_heads = hparams.ssm_dt_rank;
  3065. const uint32_t intermediate_size = hparams.ssm_d_inner;
  3066. const uint32_t head_dim = intermediate_size / num_heads;
  3067. const uint32_t qk_dim = head_dim;
  3068. const uint32_t v_dim = head_dim;
  3069. const int64_t num_attention_heads = hparams.n_head();
  3070. const int64_t q_num_heads = num_attention_heads;
  3071. const int64_t dt_dim = std::max(64, int(hparams.n_embd / 16));
  3072. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  3073. // output
  3074. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  3075. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  3076. // if output is NULL, init from the input tok embed
  3077. if (output == NULL) {
  3078. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  3079. }
  3080. for (int i = 0; i < n_layer; ++i) {
  3081. auto & layer = layers[i];
  3082. bool is_mamba_layer = hparams.is_recurrent(i);
  3083. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  3084. if (is_mamba_layer) {
  3085. layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), {n_embd, 2 * intermediate_size}, 0);
  3086. layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {d_conv, intermediate_size}, 0);
  3087. layer.ssm_x = create_tensor(tn(LLM_TENSOR_SSM_X, "weight", i), {intermediate_size, dt_dim + 2*d_state}, 0);
  3088. layer.ssm_dt = create_tensor(tn(LLM_TENSOR_SSM_DT, "weight", i), {dt_dim, num_heads}, 0);
  3089. layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {num_heads}, 0);
  3090. layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {num_heads}, 0);
  3091. layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {num_heads}, 0);
  3092. layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {intermediate_size, n_embd}, 0);
  3093. layer.ssm_dt_norm = create_tensor(tn(LLM_TENSOR_SSM_DT_NORM, i), {dt_dim}, 0);
  3094. layer.ssm_b_norm = create_tensor(tn(LLM_TENSOR_SSM_B_NORM, i), {d_state}, 0);
  3095. layer.ssm_c_norm = create_tensor(tn(LLM_TENSOR_SSM_C_NORM, i), {d_state}, 0);
  3096. } else {
  3097. const int64_t num_key_value_heads = hparams.n_head_kv(i);
  3098. const int64_t k_num_heads = num_key_value_heads;
  3099. const int64_t v_num_heads = num_key_value_heads;
  3100. const int64_t q_proj_dim = q_num_heads * qk_dim;
  3101. const int64_t k_proj_dim = k_num_heads * qk_dim;
  3102. const int64_t v_proj_dim = v_num_heads * v_dim;
  3103. layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, q_proj_dim + k_proj_dim + v_proj_dim}, 0);
  3104. layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {head_dim, num_attention_heads}, 0);
  3105. layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {head_dim, k_num_heads}, 0);
  3106. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {q_num_heads * v_dim, n_embd}, 0);
  3107. }
  3108. // All layers have post-attention norm, FFN norm, and FFN tensors
  3109. layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, i), {n_embd}, 0);
  3110. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  3111. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
  3112. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff * 2}, 0);
  3113. layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, i), {n_embd}, 0);
  3114. }
  3115. } break;
  3116. case LLM_ARCH_GPT2:
  3117. {
  3118. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  3119. pos_embd = create_tensor(tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, n_ctx_train}, 0);
  3120. // output
  3121. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  3122. output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
  3123. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  3124. // if output is NULL, init from the input tok embed
  3125. if (output == NULL) {
  3126. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  3127. }
  3128. for (int i = 0; i < n_layer; ++i) {
  3129. auto & layer = layers[i];
  3130. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  3131. layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
  3132. layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
  3133. layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
  3134. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  3135. layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
  3136. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  3137. layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
  3138. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
  3139. layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
  3140. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  3141. layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
  3142. }
  3143. } break;
  3144. case LLM_ARCH_CODESHELL:
  3145. {
  3146. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  3147. // if tok embd is NULL, init from output
  3148. if (tok_embd == NULL) {
  3149. tok_embd = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  3150. }
  3151. // output
  3152. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  3153. output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
  3154. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  3155. for (int i = 0; i < n_layer; ++i) {
  3156. auto & layer = layers[i];
  3157. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  3158. layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
  3159. layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
  3160. layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
  3161. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  3162. layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
  3163. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  3164. layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
  3165. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
  3166. layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
  3167. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  3168. layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
  3169. }
  3170. } break;
  3171. case LLM_ARCH_ORION:
  3172. {
  3173. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  3174. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  3175. output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
  3176. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  3177. for (int i = 0; i < n_layer; ++i) {
  3178. auto & layer = layers[i];
  3179. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  3180. layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
  3181. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  3182. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  3183. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  3184. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  3185. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  3186. layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
  3187. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  3188. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  3189. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  3190. }
  3191. } break;
  3192. case LLM_ARCH_INTERNLM2:
  3193. {
  3194. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  3195. // output
  3196. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  3197. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  3198. for (int i = 0; i < n_layer; ++i) {
  3199. auto & layer = layers[i];
  3200. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  3201. // layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
  3202. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  3203. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  3204. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  3205. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  3206. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  3207. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  3208. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  3209. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  3210. }
  3211. } break;
  3212. case LLM_ARCH_GEMMA:
  3213. {
  3214. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  3215. // output
  3216. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  3217. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); // same as tok_embd, duplicated to allow offloading
  3218. for (int i = 0; i < n_layer; ++i) {
  3219. auto & layer = layers[i];
  3220. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  3221. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
  3222. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  3223. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
  3224. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
  3225. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  3226. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  3227. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  3228. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  3229. }
  3230. } break;
  3231. case LLM_ARCH_GEMMA2:
  3232. {
  3233. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  3234. // output
  3235. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  3236. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); // same as tok_embd, duplicated to allow offloading
  3237. for (int i = 0; i < n_layer; ++i) {
  3238. auto & layer = layers[i];
  3239. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  3240. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
  3241. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  3242. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
  3243. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
  3244. layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
  3245. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  3246. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  3247. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  3248. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  3249. layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
  3250. }
  3251. } break;
  3252. case LLM_ARCH_GEMMA3:
  3253. case LLM_ARCH_GEMMA_EMBEDDING:
  3254. {
  3255. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  3256. // output
  3257. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  3258. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  3259. // if output is NULL, init from the input tok embed
  3260. if (output == NULL) {
  3261. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  3262. }
  3263. for (int i = 0; i < n_layer; ++i) {
  3264. auto & layer = layers[i];
  3265. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  3266. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
  3267. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  3268. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
  3269. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
  3270. layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
  3271. layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
  3272. layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
  3273. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  3274. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  3275. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  3276. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  3277. layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
  3278. }
  3279. } break;
  3280. case LLM_ARCH_GEMMA3N:
  3281. {
  3282. const int64_t n_altup = hparams.n_altup;
  3283. const int64_t laurel_rank = hparams.laurel_rank;
  3284. const int64_t n_embd_altup = hparams.n_embd_altup;
  3285. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  3286. // if output is NULL, init from the input tok embed
  3287. if (output == NULL) {
  3288. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  3289. }
  3290. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  3291. tok_embd_per_layer = create_tensor(tn(LLM_TENSOR_PER_LAYER_TOKEN_EMBD, "weight"), {n_embd_altup * n_layer, n_vocab}, 0);
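// NOTE (editorial): the per-layer token embeddings for all n_layer blocks appear to be flattened into a
// single 2D tensor here (row width n_embd_altup * n_layer); per_layer_model_proj below has the matching
// {n_embd, n_embd_altup * n_layer} shape for projecting the hidden state into that same space.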
  3292. altup_proj = create_tensor(tn(LLM_TENSOR_ALTUP_PROJ, "weight"), {n_embd, n_embd, n_altup - 1}, 0);
  3293. altup_unembd_proj = create_tensor(tn(LLM_TENSOR_ALTUP_UNEMBD_PROJ, "weight"), {n_embd, n_embd, n_altup - 1}, 0);
  3294. per_layer_model_proj = create_tensor(tn(LLM_TENSOR_PER_LAYER_MODEL_PROJ, "weight"), {n_embd, n_embd_altup * n_layer}, 0);
  3295. per_layer_proj_norm = create_tensor(tn(LLM_TENSOR_PER_LAYER_PROJ_NORM, "weight"), {n_embd_altup}, 0);
  3296. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  3297. for (int i = 0; i < n_layer; ++i) {
  3298. auto & layer = layers[i];
  3299. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  3300. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
  3301. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  3302. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
  3303. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
  3304. layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
  3305. layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
  3306. layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
  3307. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  3308. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  3309. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  3310. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  3311. layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
  3312. // altup & laurel
  3313. layer.per_layer_inp_gate = create_tensor(tn(LLM_TENSOR_PER_LAYER_INP_GATE, "weight", i), {n_embd, n_embd_altup}, 0);
  3314. layer.per_layer_proj = create_tensor(tn(LLM_TENSOR_PER_LAYER_PROJ, "weight", i), {n_embd_altup, n_embd}, 0);
  3315. layer.per_layer_post_norm = create_tensor(tn(LLM_TENSOR_PER_LAYER_POST_NORM, "weight", i), {n_embd}, 0);
  3316. layer.altup_correct_coef = create_tensor(tn(LLM_TENSOR_ALTUP_CORRECT_COEF, "weight", i), {n_altup, n_altup}, 0);
  3317. layer.altup_correct_scale = create_tensor(tn(LLM_TENSOR_ALTUP_CORRECT_SCALE, "weight", i), {n_embd}, 0);
  3318. layer.altup_predict_coef = create_tensor(tn(LLM_TENSOR_ALTUP_PREDICT_COEF, "weight", i), {n_altup, n_altup * n_altup}, 0);
  3319. layer.altup_router = create_tensor(tn(LLM_TENSOR_ALTUP_ROUTER, "weight", i), {n_embd, n_altup}, 0);
  3320. layer.altup_router_norm = create_tensor(tn(LLM_TENSOR_ALTUP_ROUTER_NORM, "weight", i), {n_embd}, 0);
  3321. layer.laurel_l = create_tensor(tn(LLM_TENSOR_LAUREL_L, "weight", i), {n_embd, laurel_rank}, 0);
  3322. layer.laurel_r = create_tensor(tn(LLM_TENSOR_LAUREL_R, "weight", i), {laurel_rank, n_embd}, 0);
  3323. layer.laurel_post_norm = create_tensor(tn(LLM_TENSOR_LAUREL_POST_NORM, "weight", i), {n_embd}, 0);
  3324. }
  3325. } break;
  3326. case LLM_ARCH_STARCODER2:
  3327. {
  3328. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  3329. // output
  3330. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  3331. output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
  3332. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  3333. // if output is NULL, init from the input tok embed
  3334. if (output == NULL) {
  3335. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  3336. }
  3337. for (int i = 0; i < n_layer; ++i) {
  3338. auto & layer = layers[i];
  3339. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  3340. layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
  3341. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  3342. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  3343. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  3344. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  3345. // optional bias tensors
  3346. layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, 0);
  3347. layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, 0);
  3348. layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, 0);
  3349. layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
  3350. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  3351. layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
  3352. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  3353. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  3354. // optional bias tensors
  3355. layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
  3356. layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP , "bias", i), { n_ff}, 0);
  3357. }
  3358. } break;
case LLM_ARCH_MAMBA:
    {
        const int64_t d_conv = hparams.ssm_d_conv;
        const int64_t d_inner = hparams.ssm_d_inner;
        const int64_t d_state = hparams.ssm_d_state;
        const int64_t dt_rank = hparams.ssm_dt_rank;
        // only an expansion factor of 2 is supported for now
        if (2 * n_embd != d_inner) {
            throw std::runtime_error("only an expansion factor of 2 is supported for now");
        }
        tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
        // output
        output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
        output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
        // if output is NULL, init from the input tok embed, duplicated to allow offloading
        if (output == NULL) {
            output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
        }
        for (int i = 0; i < n_layer; ++i) {
            auto & layer = layers[i];
            // norm
            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
            layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), {n_embd, 2*d_inner}, 0);
            layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {d_conv, d_inner}, 0);
            layer.ssm_conv1d_b = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "bias", i), {d_inner}, 0);
            layer.ssm_x = create_tensor(tn(LLM_TENSOR_SSM_X, "weight", i), {d_inner, dt_rank + 2*d_state}, 0);
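            // NOTE (editorial): ssm_x maps the inner activation to the input-dependent SSM parameters --
            // dt (dt_rank columns) plus B and C (d_state columns each) -- hence the output width of
            // dt_rank + 2*d_state.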
            layer.ssm_dt = create_tensor(tn(LLM_TENSOR_SSM_DT, "weight", i), {dt_rank, d_inner}, 0);
            layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {d_inner}, 0);
            // no "weight" suffix for these
            layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {d_state, d_inner}, 0);
            layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {d_inner}, 0);
            // out_proj
            layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {d_inner, n_embd}, 0);
        }
    } break;
case LLM_ARCH_MAMBA2:
    {
        const int64_t d_conv = hparams.ssm_d_conv;
        const int64_t d_inner = hparams.ssm_d_inner;
        const int64_t d_state = hparams.ssm_d_state;
        const int64_t n_head = hparams.ssm_dt_rank;
        const int64_t n_group = hparams.ssm_n_group;
        const int64_t d_in_proj = 2*d_inner + 2*n_group*d_state + n_head;
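        // NOTE (editorial): the fused mamba2 input projection yields z and x (2*d_inner), the grouped
        // B and C inputs (2*n_group*d_state) and one dt value per head (n_head) from a single matmul,
        // which is why d_in_proj = 2*d_inner + 2*n_group*d_state + n_head.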
        // only an expansion factor of 2 is supported for now
        GGML_ASSERT(2 * n_embd == d_inner);
        tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
        // output
        {
            output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
            output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
            // if output is NULL, init from the input tok embed, duplicated to allow offloading
            if (output == NULL) {
                output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
            }
        }
        for (int i = 0; i < n_layer; ++i) {
            auto & layer = layers[i];
            // norm
            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
            layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), {n_embd, d_in_proj}, 0);
            layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {d_conv, d_inner + 2*n_group*d_state}, 0);
            layer.ssm_conv1d_b = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "bias", i), {d_inner + 2*n_group*d_state}, 0);
            layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {n_head}, 0);
            // no "weight" suffix for these
            layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {1, n_head}, 0);
            layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {1, n_head}, 0);
            layer.ssm_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", i), {d_inner / n_group, n_group}, 0);
            // out_proj
            layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {d_inner, n_embd}, 0);
        }
    } break;
  3430. case LLM_ARCH_JAMBA:
  3431. {
  3432. const int64_t d_conv = hparams.ssm_d_conv;
  3433. const int64_t d_inner = hparams.ssm_d_inner;
  3434. const int64_t d_state = hparams.ssm_d_state;
  3435. const int64_t dt_rank = hparams.ssm_dt_rank;
  3436. // only an expansion factor of 2 is supported for now
  3437. GGML_ASSERT(2 * n_embd == d_inner);
  3438. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  3439. // output
  3440. {
  3441. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  3442. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  3443. // if output is NULL, init from the input tok embed, duplicated to allow offloading
  3444. if (output == NULL) {
  3445. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  3446. }
  3447. }
  3448. for (int i = 0; i < n_layer; ++i) {
  3449. const int64_t n_head_kv = hparams.n_head_kv(i);
  3450. const int64_t n_embd_gqa = hparams.n_embd_v_gqa(i);
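// NOTE (editorial): Jamba interleaves attention and Mamba blocks, so the head counts are read per
// layer here; a layer reporting n_head_kv(i) == 0 is loaded as a Mamba layer below, otherwise the
// regular attention projections are created.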
  3451. auto & layer = layers[i];
  3452. // norm
  3453. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  3454. if (n_head_kv == 0) {
  3455. // Mamba layer
  3456. layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), {n_embd, 2*d_inner}, 0);
  3457. layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {d_conv, d_inner}, 0);
  3458. layer.ssm_conv1d_b = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "bias", i), {d_inner}, 0);
  3459. layer.ssm_x = create_tensor(tn(LLM_TENSOR_SSM_X, "weight", i), {d_inner, dt_rank + 2*d_state}, 0);
  3460. layer.ssm_dt_norm = create_tensor(tn(LLM_TENSOR_SSM_DT_NORM, "weight", i), {dt_rank}, 0);
  3461. layer.ssm_dt = create_tensor(tn(LLM_TENSOR_SSM_DT, "weight", i), {dt_rank, d_inner}, 0);
  3462. layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {d_inner}, 0);
  3463. layer.ssm_b_norm = create_tensor(tn(LLM_TENSOR_SSM_B_NORM, "weight", i), {d_state}, 0);
  3464. layer.ssm_c_norm = create_tensor(tn(LLM_TENSOR_SSM_C_NORM, "weight", i), {d_state}, 0);
  3465. // no "weight" suffix for these
  3466. layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {d_state, d_inner}, 0);
  3467. layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {d_inner}, 0);
  3468. // out_proj
  3469. layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {d_inner, n_embd}, 0);
  3470. } else {
  3471. // Attention layers
  3472. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  3473. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  3474. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  3475. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  3476. }
  3477. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  3478. layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, TENSOR_NOT_REQUIRED);
  3479. if (layer.ffn_gate_inp) {
  3480. // MoE
  3481. layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
  3482. layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff, n_embd, n_expert}, 0);
  3483. layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
  3484. } else {
  3485. // FFN (no MoE)
  3486. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  3487. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
  3488. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  3489. }
  3490. }
  3491. } break;
  3492. case LLM_ARCH_GRANITE_HYBRID:
  3493. {
  3494. // mamba2 Mixer SSM params
  3495. // NOTE: int64_t for tensor dimensions
  3496. const int64_t d_conv = hparams.ssm_d_conv;
  3497. const int64_t d_inner = hparams.ssm_d_inner;
  3498. const int64_t d_state = hparams.ssm_d_state;
  3499. const int64_t n_ssm_head = hparams.ssm_dt_rank;
  3500. const int64_t n_group = hparams.ssm_n_group;
  3501. const int64_t d_in_proj = 2*d_inner + 2*n_group*d_state + n_ssm_head;
  3502. // only an expansion factor of 2 is supported for now
  3503. GGML_ASSERT(2 * n_embd == d_inner);
  3504. // embeddings
  3505. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  3506. // output
  3507. {
  3508. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  3509. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  3510. // if output is NULL, init from the input tok embed, duplicated to allow offloading
  3511. if (output == NULL) {
  3512. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  3513. }
  3514. }
  3515. for (int i = 0; i < n_layer; ++i) {
  3516. auto & layer = layers[i];
  3517. // norm
  3518. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
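// NOTE (editorial): granite-hybrid picks the tensor set per layer: hparams.is_recurrent(i) marks
// mamba2-style SSM layers, while the remaining layers load attention tensors with per-layer GQA
// sizes (n_head(i), n_embd_k_gqa(i), n_embd_v_gqa(i)) and optional biases.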
  3519. if (hparams.is_recurrent(i)) {
  3520. // ssm layers
  3521. layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), {n_embd, d_in_proj}, 0);
  3522. layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {d_conv, d_inner + 2*n_group*d_state}, 0);
  3523. layer.ssm_conv1d_b = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "bias", i), {d_inner + 2*n_group*d_state}, TENSOR_NOT_REQUIRED);
  3524. layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {n_ssm_head}, 0);
  3525. // no "weight" suffix for these
  3526. layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {1, n_ssm_head}, 0);
  3527. layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {1, n_ssm_head}, 0);
  3528. layer.ssm_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", i), {d_inner / n_group, n_group}, 0);
  3529. // out_proj
  3530. layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {d_inner, n_embd}, 0);
  3531. } else {
  3532. // attention layers (with optional bias)
  3533. const int64_t n_head_i = hparams.n_head(i);
  3534. const int64_t n_embd_k_gqa_i = hparams.n_embd_k_gqa(i);
  3535. const int64_t n_embd_v_gqa_i = hparams.n_embd_v_gqa(i);
  3536. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head_i}, 0);
  3537. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa_i}, 0);
  3538. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa_i}, 0);
  3539. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head_i, n_embd}, 0);
  3540. layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  3541. layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_k_gqa_i}, TENSOR_NOT_REQUIRED);
  3542. layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_v_gqa_i}, TENSOR_NOT_REQUIRED);
  3543. layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  3544. }
  3545. // feed forward (w/ optional biases)
  3546. if (n_expert > 0) {
  3547. // MoE FFN
  3548. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  3549. layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
  3550. layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
  3551. layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, TENSOR_NOT_REQUIRED);
  3552. layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert}, 0);
  3553. layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
  3554. // For Granite MoE Shared
  3555. if (hparams.n_ff_shexp > 0) {
  3556. layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, 0);
  3557. layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, 0);
  3558. layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {hparams.n_ff_shexp, n_embd}, 0);
  3559. }
  3560. } else {
  3561. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  3562. layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
  3563. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  3564. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  3565. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  3566. layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
  3567. layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  3568. layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
  3569. }
  3570. }
  3571. } break;
  3572. case LLM_ARCH_XVERSE:
  3573. {
  3574. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  3575. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  3576. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  3577. for (int i = 0; i < n_layer; ++i) {
  3578. auto & layer = layers[i];
  3579. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  3580. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  3581. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  3582. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  3583. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  3584. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  3585. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  3586. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  3587. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  3588. }
  3589. } break;
  3590. case LLM_ARCH_COMMAND_R:
  3591. {
  3592. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  3593. // output
  3594. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  3595. // init output from the input tok embed
  3596. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  3597. for (int i = 0; i < n_layer; ++i) {
  3598. auto & layer = layers[i];
  3599. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  3600. if (n_layer >= 64){
  3601. layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k, n_head}, 0);
  3602. layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k, n_head_kv}, 0);
  3603. }
  3604. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  3605. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  3606. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  3607. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  3608. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  3609. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  3610. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  3611. }
  3612. } break;
  3613. case LLM_ARCH_COHERE2:
  3614. {
  3615. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
  3616. // output
  3617. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
  3618. // init output from the input tok embed
  3619. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab },
  3620. TENSOR_DUPLICATED);
  3621. for (int i = 0; i < n_layer; ++i) {
  3622. auto & layer = layers[i];
  3623. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);
  3624. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd }, 0);
  3625. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_gqa }, 0);
  3626. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_gqa }, 0);
  3627. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd, n_embd }, 0);
  3628. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), { n_embd, n_ff }, 0);
  3629. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd }, 0);
  3630. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, n_ff }, 0);
  3631. }
  3632. }
  3633. break;
  3634. case LLM_ARCH_OLMO: // adapted from LLM_ARCH_LLAMA with norm params removed
  3635. {
  3636. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  3637. // output
  3638. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  3639. // if output is NULL, init from the input tok embed
  3640. if (output == NULL) {
  3641. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  3642. }
  3643. for (int i = 0; i < n_layer; ++i) {
  3644. auto & layer = layers[i];
  3645. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  3646. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  3647. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  3648. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  3649. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  3650. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  3651. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  3652. }
  3653. } break;
case LLM_ARCH_OLMO2:
    {
        const int64_t n_embd_head = n_embd / n_head;
        tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
        // output
        output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
        output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
        for (int i = 0; i < n_layer; ++i) {
            auto & layer = layers[i];
            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
            layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
            layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
            layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, 0);
            layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_head_kv * n_embd_head}, 0);
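            // NOTE (editorial): unlike the per-head {n_embd_head_k} QK norms used elsewhere in this
            // file, OLMo2 registers norms over the full projection width -- {n_embd} for Q and
            // {n_head_kv * n_embd_head} for K -- i.e. the normalization spans all heads at once.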
            layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
            layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
            layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
        }
    } break;
  3676. case LLM_ARCH_SEED_OSS:
  3677. {
  3678. const uint32_t head_dim = hparams.n_embd_head_k;
  3679. const int64_t n_qo_dim = n_head * head_dim;
  3680. const int64_t n_kv_dim = n_head_kv * head_dim;
  3681. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  3682. // output
  3683. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  3684. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  3685. // if output is NULL, init from the input tok embed
  3686. if (output == NULL) {
  3687. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  3688. }
  3689. for (int i = 0; i < n_layer; ++i) {
  3690. auto & layer = layers[i];
  3691. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_qo_dim}, 0);
  3692. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_kv_dim}, 0);
  3693. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_kv_dim}, 0);
  3694. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_qo_dim, n_embd}, 0);
  3695. layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_qo_dim}, TENSOR_NOT_REQUIRED);
  3696. layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_kv_dim}, TENSOR_NOT_REQUIRED);
  3697. layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_kv_dim}, TENSOR_NOT_REQUIRED);
  3698. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  3699. layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
  3700. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  3701. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  3702. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  3703. }
  3704. } break;
  3705. case LLM_ARCH_OLMOE:
  3706. {
  3707. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  3708. // output
  3709. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  3710. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  3711. for (int i = 0; i < n_layer; ++i) {
  3712. auto & layer = layers[i];
  3713. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  3714. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  3715. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  3716. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  3717. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  3718. layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, 0);
  3719. layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, 0);
  3720. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  3721. layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
  3722. if (n_expert == 0) {
  3723. throw std::runtime_error("n_expert must be > 0");
  3724. }
  3725. if (n_expert_used == 0) {
  3726. throw std::runtime_error("n_expert_used must be > 0");
  3727. }
  3728. // MoE branch
  3729. layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
  3730. layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff, n_embd, n_expert}, 0);
  3731. layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
  3732. }
  3733. } break;
  3734. case LLM_ARCH_OPENELM:
  3735. {
  3736. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  3737. // output
  3738. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  3739. // init output from the input tok embed
  3740. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  3741. for (int i = 0; i < n_layer; ++i) {
  3742. const int64_t n_head = hparams.n_head(i);
  3743. const int64_t n_head_qkv = 2*hparams.n_head_kv(i) + n_head;
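// OpenELM uses per-layer head counts; the fused QKV projection packs n_head query heads plus n_head_kv key and n_head_kv value heads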
  3744. const int64_t n_ff = hparams.n_ff(i);
  3745. auto & layer = layers[i];
  3746. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  3747. layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_head_qkv*n_embd_head_k}, 0);
  3748. layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
  3749. layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
  3750. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_head*n_embd_head_k, n_embd}, 0);
  3751. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  3752. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  3753. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
  3754. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  3755. }
  3756. } break;
  3757. case LLM_ARCH_GPTNEOX:
  3758. {
  3759. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  3760. // output
  3761. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  3762. output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
  3763. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  3764. for (int i = 0; i < n_layer; ++i) {
  3765. auto & layer = layers[i];
  3766. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  3767. layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
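// fused QKV projection: Q is n_embd wide, K and V are n_embd_gqa each, hence n_embd + 2*n_embd_gqa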
  3768. layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
  3769. layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
  3770. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  3771. layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
  3772. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  3773. layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
  3774. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
  3775. layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
  3776. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  3777. layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
  3778. }
  3779. } break;
  3780. case LLM_ARCH_ARCTIC:
  3781. {
  3782. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  3783. // output
  3784. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  3785. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  3786. // if output is NULL, init from the input tok embed
  3787. if (output == NULL) {
  3788. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  3789. }
  3790. for (int i = 0; i < n_layer; ++i) {
  3791. auto & layer = layers[i];
  3792. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  3793. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  3794. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  3795. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  3796. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  3797. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  3798. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_embd}, 0);
  3799. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_embd, n_embd}, 0);
  3800. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_embd}, 0);
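// Arctic combines the small dense FFN above (n_embd-wide projections) with the routed MoE branch below (dense-MoE hybrid)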
  3801. layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
  3802. layer.ffn_norm_exps = create_tensor(tn(LLM_TENSOR_FFN_NORM_EXPS, "weight", i), {n_embd}, 0);
layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
  3804. layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert}, 0);
  3805. layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
  3806. }
  3807. } break;
  3808. case LLM_ARCH_DEEPSEEK:
  3809. {
  3810. const int64_t n_ff_exp = hparams.n_ff_exp;
  3811. const int64_t n_expert_shared = hparams.n_expert_shared;
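// n_ff_exp is the per-expert FFN width; n_expert_shared counts the always-active shared experts alongside the routed ones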
  3812. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  3813. // output
  3814. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  3815. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  3816. for (int i = 0; i < n_layer; ++i) {
  3817. auto & layer = layers[i];
  3818. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  3819. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  3820. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  3821. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  3822. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  3823. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  3824. if (i < (int) hparams.n_layer_dense_lead) {
  3825. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  3826. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  3827. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  3828. } else {
  3829. layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
  3830. if (n_expert == 0) {
  3831. throw std::runtime_error("n_expert must be > 0");
  3832. }
  3833. if (n_expert_used == 0) {
  3834. throw std::runtime_error("n_expert_used must be > 0");
  3835. }
  3836. // MoE branch
  3837. layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
  3838. layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
  3839. layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
  3840. // Shared expert branch
  3841. layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
  3842. layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_exp * n_expert_shared, n_embd}, 0);
  3843. layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
  3844. }
  3845. }
  3846. } break;
  3847. case LLM_ARCH_DEEPSEEK2:
  3848. {
  3849. const bool is_lite = (hparams.n_layer == 27);
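// the Lite variant does not use the Q LoRA decomposition: wq is a single dense projection (see the !is_lite branches below)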
  3850. const bool is_mla = (hparams.n_embd_head_k_mla != 0 && hparams.n_embd_head_v_mla != 0);
// note: these are the actual head sizes obtained when treating the weights as MHA, or after "decompression" using wv_b for MLA
  3852. const int64_t n_embd_head_k_mla = is_mla ? hparams.n_embd_head_k_mla : hparams.n_embd_head_k;
  3853. const int64_t n_embd_head_v_mla = is_mla ? hparams.n_embd_head_v_mla : hparams.n_embd_head_v;
  3854. const int64_t n_embd_head_qk_rope = hparams.n_rot;
  3855. const int64_t n_embd_head_qk_nope = n_embd_head_k_mla - n_embd_head_qk_rope;
  3856. const int64_t q_lora_rank = hparams.n_lora_q;
  3857. const int64_t kv_lora_rank = hparams.n_lora_kv;
  3858. const int64_t n_ff_exp = hparams.n_ff_exp;
  3859. const int64_t n_expert_shared = hparams.n_expert_shared;
  3860. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  3861. // output
  3862. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  3863. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  3864. for (int i = 0; i < n_layer; ++i) {
  3865. auto & layer = layers[i];
  3866. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  3867. if (!is_lite) {
  3868. layer.attn_q_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_A_NORM, "weight", i), {q_lora_rank}, 0);
  3869. }
  3870. layer.attn_kv_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_NORM, "weight", i), {kv_lora_rank}, 0);
  3871. if (!is_lite) {
  3872. layer.wq_a = create_tensor(tn(LLM_TENSOR_ATTN_Q_A, "weight", i), {n_embd, q_lora_rank}, 0);
  3873. layer.wq_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_B, "weight", i), {q_lora_rank, n_head * n_embd_head_k_mla}, 0);
  3874. } else {
  3875. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_head * n_embd_head_k_mla}, 0);
  3876. }
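// wkv_a_mqa projects the input to the compressed KV latent (kv_lora_rank) concatenated with the shared RoPE part (n_embd_head_qk_rope)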
  3877. layer.wkv_a_mqa = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + n_embd_head_qk_rope}, 0);
// note: only legacy GGUF files contain the unsplit wkv_b tensor
  3879. if (is_mla) {
  3880. layer.wk_b = create_tensor(tn(LLM_TENSOR_ATTN_K_B, "weight", i), {n_embd_head_qk_nope, kv_lora_rank, n_head}, 0);
  3881. layer.wv_b = create_tensor(tn(LLM_TENSOR_ATTN_V_B, "weight", i), {kv_lora_rank, n_embd_head_v_mla, n_head}, 0);
  3882. } else {
  3883. layer.wkv_b = create_tensor(tn(LLM_TENSOR_ATTN_KV_B, "weight", i), {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v_mla)}, 0);
  3884. }
  3885. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_head * n_embd_head_v_mla, n_embd}, 0);
  3886. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  3887. if (i < (int) hparams.n_layer_dense_lead) {
  3888. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  3889. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  3890. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  3891. } else {
  3892. layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
  3893. layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, TENSOR_NOT_REQUIRED);
  3894. if (n_expert == 0) {
  3895. throw std::runtime_error("n_expert must be > 0");
  3896. }
  3897. if (n_expert_used == 0) {
  3898. throw std::runtime_error("n_expert_used must be > 0");
  3899. }
  3900. // MoE branch
  3901. layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
  3902. layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
  3903. layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
  3904. // Shared expert branch
  3905. layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
  3906. layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_exp * n_expert_shared, n_embd}, 0);
  3907. layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
  3908. }
  3909. }
  3910. } break;
  3911. case LLM_ARCH_PLM:
  3912. {
  3913. const int64_t n_embd_head_qk_rope = hparams.n_rot;
  3914. const int64_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
  3915. const int64_t kv_lora_rank = hparams.n_lora_kv;
  3916. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  3917. // output
  3918. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  3919. // output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  3920. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  3921. for (int i = 0; i < n_layer; ++i) {
  3922. auto & layer = layers[i];
  3923. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  3924. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
  3925. layer.wkv_a_mqa = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + (n_embd_head_qk_rope)}, 0);
  3926. layer.attn_kv_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_NORM, "weight", i), {kv_lora_rank}, 0);
  3927. layer.wkv_b = create_tensor(tn(LLM_TENSOR_ATTN_KV_B, "weight", i), {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)}, 0);
  3928. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_head * ( n_embd_head_v), n_embd}, 0);
  3929. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  3930. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  3931. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  3932. }
  3933. } break;
  3934. case LLM_ARCH_BITNET:
  3935. {
  3936. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  3937. // output
  3938. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  3939. for (int i = 0; i < n_layer; ++i) {
  3940. auto & layer = layers[i];
  3941. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  3942. layer.attn_sub_norm = create_tensor(tn(LLM_TENSOR_ATTN_SUB_NORM, "weight", i), {n_embd}, 0);
  3943. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
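// BitNet weights may carry a single-element scale tensor per projection (optional in some conversions)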
  3944. layer.wq_scale = create_tensor(tn(LLM_TENSOR_ATTN_Q, "scale", i), {1}, TENSOR_NOT_REQUIRED);
  3945. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  3946. layer.wk_scale = create_tensor(tn(LLM_TENSOR_ATTN_K, "scale", i), {1}, TENSOR_NOT_REQUIRED);
  3947. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  3948. layer.wv_scale = create_tensor(tn(LLM_TENSOR_ATTN_V, "scale", i), {1}, TENSOR_NOT_REQUIRED);
  3949. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  3950. layer.wo_scale = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "scale", i), {1}, TENSOR_NOT_REQUIRED);
  3951. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  3952. layer.ffn_sub_norm = create_tensor(tn(LLM_TENSOR_FFN_SUB_NORM, "weight", i), {n_ff}, 0);
  3953. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  3954. layer.ffn_gate_scale = create_tensor(tn(LLM_TENSOR_FFN_GATE, "scale", i), {1}, TENSOR_NOT_REQUIRED);
  3955. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
  3956. layer.ffn_down_scale = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "scale", i), {1}, TENSOR_NOT_REQUIRED);
  3957. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  3958. layer.ffn_up_scale = create_tensor(tn(LLM_TENSOR_FFN_UP, "scale", i), {1}, TENSOR_NOT_REQUIRED);
  3959. }
  3960. } break;
  3961. case LLM_ARCH_T5:
  3962. {
  3963. const auto n_rel_attn_bkts = hparams.n_rel_attn_bkts;
  3964. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  3965. // output
  3966. output_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_OUTPUT_NORM, "weight"), {n_embd}, 0);
  3967. output_norm = create_tensor(tn(LLM_TENSOR_DEC_OUTPUT_NORM, "weight"), {n_embd}, 0);
  3968. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  3969. // if output is NULL, init from the input tok embed
  3970. if (output == NULL) {
  3971. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  3972. }
// n_layer:     number of encoder layers
// dec_n_layer: number of decoder layers
  3975. const int dec_n_layer = hparams.dec_n_layer;
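// encoder and decoder layers share the same layers vector; grow it if the decoder is deeper than the encoder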
  3976. if (dec_n_layer > n_layer) {
  3977. layers.resize(dec_n_layer);
  3978. }
  3979. // load encoder layers
  3980. for (int i = 0; i < n_layer; ++i) {
  3981. auto & layer = layers[i];
  3982. layer.attn_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_NORM, "weight", i), {n_embd}, 0);
  3983. layer.attn_rel_b_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_REL_B, "weight", i), {n_head, n_rel_attn_bkts}, TENSOR_NOT_REQUIRED);
  3984. layer.wq_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_Q, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  3985. layer.wk_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  3986. layer.wv_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
  3987. layer.wo_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_OUT, "weight", i), {n_embd_v_gqa, n_embd}, 0);
  3988. layer.ffn_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_NORM, "weight", i), {n_embd}, 0);
  3989. layer.ffn_gate_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_GATE, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED);
  3990. layer.ffn_down_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  3991. layer.ffn_up_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  3992. }
  3993. // load decoder layers
  3994. for (int i = 0; i < dec_n_layer; ++i) {
  3995. auto & layer = layers[i];
  3996. layer.attn_norm = create_tensor(tn(LLM_TENSOR_DEC_ATTN_NORM, "weight", i), {n_embd}, 0);
  3997. layer.attn_rel_b = create_tensor(tn(LLM_TENSOR_DEC_ATTN_REL_B, "weight", i), {n_head, n_rel_attn_bkts}, TENSOR_NOT_REQUIRED);
  3998. layer.wq = create_tensor(tn(LLM_TENSOR_DEC_ATTN_Q, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  3999. layer.wk = create_tensor(tn(LLM_TENSOR_DEC_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  4000. layer.wv = create_tensor(tn(LLM_TENSOR_DEC_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
  4001. layer.wo = create_tensor(tn(LLM_TENSOR_DEC_ATTN_OUT, "weight", i), {n_embd_v_gqa, n_embd}, 0);
  4002. layer.attn_norm_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_NORM, "weight", i), {n_embd}, 0);
// this tensor appears to be unused in the HF transformers implementation
  4004. layer.attn_rel_b_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_REL_B, "weight", i), {n_head, n_rel_attn_bkts}, TENSOR_NOT_REQUIRED);
  4005. layer.wq_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_Q, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  4006. layer.wk_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  4007. layer.wv_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
  4008. layer.wo_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_OUT, "weight", i), {n_embd_v_gqa, n_embd}, 0);
  4009. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_DEC_FFN_NORM, "weight", i), {n_embd}, 0);
  4010. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_DEC_FFN_GATE, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED);
  4011. layer.ffn_down = create_tensor(tn(LLM_TENSOR_DEC_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  4012. layer.ffn_up = create_tensor(tn(LLM_TENSOR_DEC_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  4013. }
  4014. } break;
  4015. case LLM_ARCH_T5ENCODER:
  4016. {
  4017. const auto n_rel_attn_bkts = hparams.n_rel_attn_bkts;
  4018. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  4019. // output
  4020. output_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_OUTPUT_NORM, "weight"), {n_embd}, 0);
  4021. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  4022. // if output is NULL, init from the input tok embed
  4023. if (output == NULL) {
  4024. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  4025. }
  4026. for (int i = 0; i < n_layer; ++i) {
  4027. auto & layer = layers[i];
  4028. layer.attn_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_NORM, "weight", i), {n_embd}, 0);
  4029. layer.attn_rel_b_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_REL_B, "weight", i), {n_head, n_rel_attn_bkts}, TENSOR_NOT_REQUIRED);
  4030. layer.wq_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_Q, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  4031. layer.wk_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  4032. layer.wv_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
  4033. layer.wo_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_OUT, "weight", i), {n_embd_v_gqa, n_embd}, 0);
  4034. layer.ffn_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_NORM, "weight", i), {n_embd}, 0);
  4035. layer.ffn_gate_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_GATE, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED);
  4036. layer.ffn_down_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  4037. layer.ffn_up_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  4038. }
  4039. } break;
  4040. case LLM_ARCH_JAIS:
  4041. {
  4042. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  4043. // output
  4044. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  4045. output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
  4046. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  4047. for (int i = 0; i < n_layer; ++i) {
  4048. auto & layer = layers[i];
  4049. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  4050. layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
  4051. layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
  4052. layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
  4053. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  4054. layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
  4055. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  4056. layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
  4057. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
  4058. layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
  4059. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  4060. layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, 0);
  4061. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  4062. layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
  4063. }
  4064. } break;
  4065. case LLM_ARCH_CHATGLM:
  4066. {
  4067. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  4068. // output
  4069. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  4070. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  4071. // if output is NULL, init from the input tok embed
  4072. if (output == NULL) {
  4073. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  4074. }
  4075. for (int i = 0; i < n_layer; ++i) {
  4076. auto & layer = layers[i];
  4077. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  4078. layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
  4079. layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
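// some ChatGLM conversions store separate Q/K/V projections instead of the fused QKV tensor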
  4080. if (layer.wqkv == nullptr) {
  4081. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
  4082. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  4083. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
  4084. layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  4085. layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
  4086. layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
  4087. }
  4088. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  4089. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  4090. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff * 2}, 0);
  4091. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
  4092. }
  4093. } break;
  4094. case LLM_ARCH_GLM4:
  4095. {
  4096. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  4097. // output
  4098. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  4099. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  4100. // if output is NULL, init from the input tok embed
  4101. if (output == NULL) {
  4102. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  4103. }
  4104. for (int i = 0; i < n_layer; ++i) {
  4105. auto & layer = layers[i];
  4106. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  4107. layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
  4108. layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
  4109. if (layer.wqkv == nullptr) {
  4110. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
  4111. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  4112. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
  4113. layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  4114. layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
  4115. layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
  4116. }
  4117. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  4118. layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
  4119. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  4120. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  4121. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff * 2}, 0);
  4122. layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
  4123. }
  4124. } break;
  4125. case LLM_ARCH_GLM4_MOE:
  4126. {
  4127. const int64_t n_expert = hparams.n_expert;
  4128. const int64_t n_expert_used = hparams.n_expert_used;
  4129. const int64_t n_expert_shared = hparams.n_expert_shared;
  4130. GGML_ASSERT(hparams.n_expert > 0 && "n_expert must be > 0 for GLM4_MOE MoE layers");
  4131. GGML_ASSERT(hparams.n_expert_used > 0 && "n_expert_used must be > 0 for GLM4_MOE MoE layers");
  4132. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
  4133. // output
  4134. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
  4135. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED);
  4136. // if output is NULL, init from the input tok embed
  4137. if (output == NULL) {
  4138. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, TENSOR_DUPLICATED);
  4139. }
// load ALL tensors, including the NextN layers, so the total tensor count matches,
// but only process up to the last non-NextN layer in the forward pass
  4142. for (int i = 0; i < n_layer; ++i) {
  4143. int flags = 0;
  4144. if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
  4145. // skip all tensors in the NextN layers
  4146. flags |= TENSOR_SKIP;
  4147. }
  4148. auto & layer = layers[i];
  4149. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, flags);
  4150. // GLM-style attention with bias terms
  4151. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd_head_k * n_head }, flags);
  4152. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_k_gqa }, flags);
  4153. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_v_gqa }, flags);
  4154. layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), { n_embd_head_k * n_head }, flags);
  4155. layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), { n_embd_k_gqa }, flags);
  4156. layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), { n_embd_v_gqa }, flags);
  4157. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, flags);
  4158. // K/Q norm tensors (optional for GLM-4.5 355B variant)
  4159. layer.attn_q_norm = create_tensor(
  4160. tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), { n_embd_head_k }, TENSOR_NOT_REQUIRED | flags);
  4161. layer.attn_k_norm = create_tensor(
  4162. tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), { n_embd_head_k }, TENSOR_NOT_REQUIRED | flags);
  4163. layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), { n_embd }, flags);
  4164. // Check if this layer uses MoE or dense FFN based on n_layer_dense_lead
  4165. // GLM 4.5 uses hybrid architecture: layer 0 is dense, layers 1+ are MoE
  4166. const bool use_moe = (static_cast<uint32_t>(i) >= hparams.n_layer_dense_lead);
  4167. if (use_moe) {
  4168. // MoE layers
  4169. layer.ffn_gate_inp =
  4170. create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), { n_embd, n_expert }, flags);
  4171. layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), { n_expert }, flags);
  4172. // MoE branch
  4173. const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
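// if n_ff_exp is not given, assume the dense FFN width is split evenly across the active experts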
  4174. layer.ffn_gate_exps = create_tensor(
  4175. tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, flags);
  4176. layer.ffn_down_exps = create_tensor(
  4177. tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff_exp, n_embd, n_expert }, flags);
  4178. layer.ffn_up_exps = create_tensor(
  4179. tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, flags);
  4180. // Shared expert
  4181. if (n_expert_shared > 0) {
  4182. const int64_t n_ff_shexp = n_ff_exp * n_expert_shared;
  4183. layer.ffn_gate_shexp = create_tensor(
  4184. tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), { n_embd, n_ff_shexp }, flags);
  4185. layer.ffn_down_shexp = create_tensor(
  4186. tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_shexp, n_embd }, flags);
  4187. layer.ffn_up_shexp = create_tensor(
  4188. tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), { n_embd, n_ff_shexp }, flags);
  4189. }
  4190. } else {
  4191. // Dense layers (first k layers) - GLM uses separate gate/up projections
  4192. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), { n_embd, n_ff }, flags);
  4193. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd }, flags);
  4194. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, n_ff }, flags);
  4195. }
  4196. // NextN/MTP tensors (preserved but unused) - conditionally load for last nextn_predict_layers
  4197. if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
  4198. layer.nextn.eh_proj = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "weight", i), { 2 * n_embd, n_embd }, flags);
  4199. layer.nextn.embed_tokens = create_tensor(tn(LLM_TENSOR_NEXTN_EMBED_TOKENS, "weight", i), { n_embd, n_vocab }, flags);
  4200. layer.nextn.enorm = create_tensor(tn(LLM_TENSOR_NEXTN_ENORM, "weight", i), { n_embd }, flags);
  4201. layer.nextn.hnorm = create_tensor(tn(LLM_TENSOR_NEXTN_HNORM, "weight", i), { n_embd }, flags);
  4202. layer.nextn.shared_head_head = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "weight", i), { n_embd, n_vocab }, flags);
  4203. layer.nextn.shared_head_norm = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "weight", i), { n_embd }, flags);
  4204. }
  4205. }
  4206. }
  4207. break;
  4208. case LLM_ARCH_NEMOTRON:
  4209. {
  4210. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  4211. // output
  4212. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  4213. output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
  4214. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  4215. for (int i = 0; i < n_layer; ++i) {
  4216. auto & layer = layers[i];
  4217. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  4218. layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
  4219. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  4220. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  4221. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  4222. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  4223. // optional bias tensors
  4224. layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  4225. layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
  4226. layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
  4227. layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  4228. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  4229. layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
  4230. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  4231. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  4232. // optional MLP bias
  4233. layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  4234. layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
  4235. }
  4236. } break;
  4237. case LLM_ARCH_NEMOTRON_H:
  4238. {
  4239. // mamba2 Mixer SSM params
  4240. // NOTE: int64_t for tensor dimensions
  4241. const int64_t d_conv = hparams.ssm_d_conv;
  4242. const int64_t d_inner = hparams.ssm_d_inner;
  4243. const int64_t d_state = hparams.ssm_d_state;
  4244. const int64_t n_ssm_head = hparams.ssm_dt_rank;
  4245. const int64_t n_group = hparams.ssm_n_group;
  4246. const int64_t d_in_proj = 2*d_inner + 2*n_group*d_state + n_ssm_head;
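// d_in_proj packs z and x (d_inner each), B and C (n_group*d_state each), and dt (n_ssm_head)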
  4247. // embeddings
  4248. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  4249. // output
  4250. {
  4251. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  4252. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  4253. // if output is NULL, init from the input tok embed, duplicated to allow offloading
  4254. if (output == NULL) {
  4255. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  4256. }
  4257. }
  4258. for (int i = 0; i < n_layer; ++i) {
  4259. auto & layer = layers[i];
  4260. // all blocks use the attn norm
  4261. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  4262. if (hparams.is_recurrent(i)) {
  4263. // ssm layers
  4264. layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), {n_embd, d_in_proj}, 0);
  4265. layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {d_conv, d_inner + 2*n_group*d_state}, 0);
  4266. layer.ssm_conv1d_b = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "bias", i), {d_inner + 2*n_group*d_state}, TENSOR_NOT_REQUIRED);
  4267. layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {n_ssm_head}, 0);
  4268. // no "weight" suffix for these
  4269. layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {1, n_ssm_head}, 0);
  4270. layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {1, n_ssm_head}, 0);
  4271. layer.ssm_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", i), {d_inner / n_group, n_group}, 0);
  4272. // out_proj
  4273. layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {d_inner, n_embd}, 0);
  4274. } else if (hparams.n_ff(i) == 0) {
  4275. // attention layers (with optional bias)
  4276. const int64_t n_head_i = hparams.n_head(i);
  4277. const int64_t n_embd_k_gqa_i = hparams.n_embd_k_gqa(i);
  4278. const int64_t n_embd_v_gqa_i = hparams.n_embd_v_gqa(i);
  4279. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head_i}, 0);
  4280. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa_i}, 0);
  4281. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa_i}, 0);
  4282. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head_i, n_embd}, 0);
  4283. layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  4284. layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_k_gqa_i}, TENSOR_NOT_REQUIRED);
  4285. layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_v_gqa_i}, TENSOR_NOT_REQUIRED);
  4286. layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  4287. } else {
  4288. // mlp layers
  4289. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { hparams.n_ff(i), n_embd}, 0);
  4290. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, hparams.n_ff(i)}, 0);
  4291. layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  4292. layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {hparams.n_ff(i)}, TENSOR_NOT_REQUIRED);
  4293. }
  4294. }
  4295. } break;
  4296. case LLM_ARCH_EXAONE:
  4297. {
  4298. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  4299. // output
  4300. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  4301. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  4302. // if output is NULL, init from the input tok embed
  4303. if (output == NULL) {
  4304. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  4305. }
  4306. for (int i = 0; i < n_layer; ++i) {
  4307. auto & layer = layers[i];
  4308. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  4309. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
  4310. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  4311. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
  4312. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
  4313. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  4314. layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
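// rope_freqs is shared across layers, so layers after the first mark it as duplicated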
  4315. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  4316. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  4317. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  4318. }
  4319. } break;
  4320. case LLM_ARCH_EXAONE4:
  4321. {
  4322. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  4323. // output
  4324. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  4325. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  4326. // if output is NULL, init from the input tok embed
  4327. if (output == NULL) {
  4328. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  4329. }
  4330. for (int i = 0; i < n_layer; ++i) {
  4331. auto & layer = layers[i];
  4332. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
  4333. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  4334. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
  4335. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  4336. layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
  4337. layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
  4338. layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
  4339. layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
  4340. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  4341. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  4342. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  4343. layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
  4344. }
  4345. } break;
  4346. case LLM_ARCH_RWKV6:
  4347. {
  4348. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  4349. // Block 0, LN0
  4350. tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0);
  4351. tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}, 0);
  4352. // output
  4353. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  4354. output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
  4355. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  4356. const int time_mix_extra_dim = hparams.time_mix_extra_dim;
  4357. const int time_decay_extra_dim = hparams.time_decay_extra_dim;
  4358. const int head_size = hparams.wkv_head_size;
  4359. const int attn_hidden_size = n_embd;
  4360. const int ffn_size = hparams.n_ff_arr[0];
  4361. for (int i = 0; i < n_layer; ++i) {
  4362. auto & layer = layers[i];
  4363. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  4364. layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
  4365. layer.attn_norm_2 = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, 0);
  4366. layer.attn_norm_2_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, 0);
  4367. layer.time_mix_w1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W1, "weight", i), {n_embd, time_mix_extra_dim * 5}, 0);
  4368. layer.time_mix_w2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W2, "weight", i), {time_mix_extra_dim, n_embd, 5}, 0);
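// the trailing 5 covers the five token-shift lerp channels (w, k, v, r, g) loaded below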
  4369. layer.time_mix_lerp_x = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_X, "weight", i), {n_embd, 1, 1}, 0);
  4370. layer.time_mix_lerp_w = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_W, "weight", i), {n_embd, 1, 1}, TENSOR_NOT_REQUIRED);
  4371. layer.time_mix_lerp_k = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_K, "weight", i), {n_embd, 1, 1}, TENSOR_NOT_REQUIRED);
  4372. layer.time_mix_lerp_v = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_V, "weight", i), {n_embd, 1, 1}, TENSOR_NOT_REQUIRED);
  4373. layer.time_mix_lerp_r = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_R, "weight", i), {n_embd, 1, 1}, TENSOR_NOT_REQUIRED);
  4374. layer.time_mix_lerp_g = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_G, "weight", i), {n_embd, 1, 1}, TENSOR_NOT_REQUIRED);
  4375. layer.time_mix_lerp_fused = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_FUSED, "weight", i), {n_embd, 1, 1, 5}, TENSOR_NOT_REQUIRED);
  4376. GGML_ASSERT(!(layer.time_mix_lerp_fused == NULL && layer.time_mix_lerp_w == NULL));
  4377. layer.time_mix_first = create_tensor(tn(LLM_TENSOR_TIME_MIX_FIRST, "weight", i), {head_size, n_embd / head_size}, 0);
  4378. layer.time_mix_decay = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY, "weight", i), {n_embd}, 0);
  4379. layer.time_mix_decay_w1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY_W1, "weight", i), {n_embd, time_decay_extra_dim}, 0);
  4380. layer.time_mix_decay_w2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY_W2, "weight", i), {time_decay_extra_dim, attn_hidden_size}, 0);
  4381. layer.time_mix_key = create_tensor(tn(LLM_TENSOR_TIME_MIX_KEY, "weight", i), {attn_hidden_size, n_embd}, 0);
  4382. layer.time_mix_value = create_tensor(tn(LLM_TENSOR_TIME_MIX_VALUE, "weight", i), {attn_hidden_size, n_embd}, 0);
  4383. layer.time_mix_receptance = create_tensor(tn(LLM_TENSOR_TIME_MIX_RECEPTANCE, "weight", i), {attn_hidden_size, n_embd}, 0);
  4384. layer.time_mix_gate = create_tensor(tn(LLM_TENSOR_TIME_MIX_GATE, "weight", i), {attn_hidden_size, n_embd}, 0);
  4385. layer.time_mix_ln = create_tensor(tn(LLM_TENSOR_TIME_MIX_LN, "weight", i), {n_embd}, 0);
  4386. layer.time_mix_ln_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_LN, "bias", i), {n_embd}, 0);
  4387. layer.time_mix_output = create_tensor(tn(LLM_TENSOR_TIME_MIX_OUTPUT, "weight", i), {n_embd, attn_hidden_size}, 0);
  4388. layer.channel_mix_lerp_k = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_LERP_K, "weight", i), {n_embd, 1, 1}, 0);
  4389. layer.channel_mix_lerp_r = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_LERP_R, "weight", i), {n_embd, 1, 1}, 0);
  4390. layer.channel_mix_key = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_KEY, "weight", i), {n_embd, ffn_size}, 0);
  4391. layer.channel_mix_value = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_VALUE, "weight", i), {ffn_size, n_embd}, 0);
  4392. layer.channel_mix_receptance = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_RECEPTANCE, "weight", i), {n_embd, n_embd}, 0);
  4393. }
  4394. } break;
  4395. case LLM_ARCH_RWKV6QWEN2:
  4396. {
  4397. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  4398. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  4399. output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, TENSOR_NOT_REQUIRED);
  4400. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  4401. const int time_mix_extra_dim = hparams.time_mix_extra_dim;
  4402. const int time_decay_extra_dim = hparams.time_decay_extra_dim;
  4403. const int head_size = hparams.wkv_head_size;
  4404. const int attn_hidden_size = n_embd;
  4405. const int n_head_kv = hparams.n_head_kv();
  4406. int attn_key_value_size;
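// K/V projections shrink to n_head_kv * head_size when fewer KV heads are used (GQA); otherwise they span the full hidden size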
  4407. if (n_head_kv == 0 || attn_hidden_size / head_size == n_head_kv) {
  4408. attn_key_value_size = attn_hidden_size;
  4409. } else {
  4410. attn_key_value_size = n_head_kv * head_size;
  4411. }
  4412. for (int i = 0; i < n_layer; ++i) {
  4413. auto & layer = layers[i];
  4414. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  4415. layer.time_mix_w1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W1, "weight", i), {n_embd, time_mix_extra_dim * 5}, 0);
  4416. layer.time_mix_w2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W2, "weight", i), {time_mix_extra_dim, n_embd, 5}, 0);
  4417. layer.time_mix_lerp_x = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_X, "weight", i), {n_embd, 1, 1}, 0);
  4418. layer.time_mix_lerp_fused = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_FUSED, "weight", i), {n_embd, 1, 1, 5}, 0);
  4419. layer.time_mix_first = create_tensor(tn(LLM_TENSOR_TIME_MIX_FIRST, "weight", i), {head_size, n_embd / head_size}, TENSOR_NOT_REQUIRED);
  4420. layer.time_mix_decay = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY, "weight", i), {n_embd}, 0);
  4421. layer.time_mix_decay_w1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY_W1, "weight", i), {n_embd, time_decay_extra_dim}, 0);
  4422. layer.time_mix_decay_w2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY_W2, "weight", i), {time_decay_extra_dim, attn_hidden_size}, 0);
  4423. layer.time_mix_key = create_tensor(tn(LLM_TENSOR_TIME_MIX_KEY, "weight", i), {n_embd, attn_key_value_size}, 0);
  4424. layer.time_mix_value = create_tensor(tn(LLM_TENSOR_TIME_MIX_VALUE, "weight", i), {n_embd, attn_key_value_size}, 0);
  4425. layer.time_mix_receptance = create_tensor(tn(LLM_TENSOR_TIME_MIX_RECEPTANCE, "weight", i), {attn_hidden_size, n_embd}, 0);
  4426. layer.time_mix_gate = create_tensor(tn(LLM_TENSOR_TIME_MIX_GATE, "weight", i), {attn_hidden_size, n_embd}, 0);
  4427. // optional bias tensors
  4428. layer.time_mix_key_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_KEY, "bias", i), {attn_key_value_size}, TENSOR_NOT_REQUIRED);
  4429. layer.time_mix_value_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_VALUE, "bias", i), {attn_key_value_size}, TENSOR_NOT_REQUIRED);
  4430. layer.time_mix_receptance_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_RECEPTANCE, "bias", i), {attn_hidden_size}, TENSOR_NOT_REQUIRED);
  4431. layer.time_mix_output = create_tensor(tn(LLM_TENSOR_TIME_MIX_OUTPUT, "weight", i), {n_embd, attn_hidden_size}, 0);
  4432. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  4433. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  4434. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  4435. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  4436. }
  4437. } break;
  4438. case LLM_ARCH_RWKV7:
  4439. {
  4440. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  4441. // Block 0, LN0
  4442. tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0);
  4443. tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}, 0);
  4444. // output
  4445. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  4446. output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
  4447. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  4448. const int n_lora_decay = hparams.n_lora_decay;
  4449. const int n_lora_iclr = hparams.n_lora_iclr;
  4450. const int n_lora_value_res_mix = hparams.n_lora_value_res_mix;
  4451. const int n_lora_gate = hparams.n_lora_gate;
  4452. const int attn_hidden_size = n_embd;
  4453. const int ffn_size = hparams.n_ff_arr[0];
  4454. for (int i = 0; i < n_layer; ++i) {
  4455. auto & layer = layers[i];
  4456. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  4457. layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
  4458. layer.attn_norm_2 = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, 0);
  4459. layer.attn_norm_2_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, 0);
  4460. layer.time_mix_w0 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W0, "weight", i), {n_embd}, 0);
  4461. layer.time_mix_w1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W1, "weight", i), {n_embd, n_lora_decay}, 0);
  4462. layer.time_mix_w2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W2, "weight", i), {n_lora_decay, n_embd}, 0);
  4463. layer.time_mix_a0 = create_tensor(tn(LLM_TENSOR_TIME_MIX_A0, "weight", i), {n_embd}, 0);
  4464. layer.time_mix_a1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_A1, "weight", i), {n_embd, n_lora_iclr}, 0);
  4465. layer.time_mix_a2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_A2, "weight", i), {n_lora_iclr, n_embd}, 0);
  4466. if (i == 0) {
  4467. // actually not used
  4468. layer.time_mix_v0 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V0, "weight", i), {n_embd}, 0);
  4469. layer.time_mix_v1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V1, "weight", i), {n_embd, n_lora_iclr}, 0);
  4470. layer.time_mix_v2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V2, "weight", i), {n_lora_iclr, n_embd}, 0);
  4471. } else {
  4472. layer.time_mix_v0 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V0, "weight", i), {n_embd}, 0);
  4473. layer.time_mix_v1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V1, "weight", i), {n_embd, n_lora_value_res_mix}, 0);
  4474. layer.time_mix_v2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V2, "weight", i), {n_lora_value_res_mix, n_embd}, 0);
  4475. }
  4476. layer.time_mix_g1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_G1, "weight", i), {n_embd, n_lora_gate}, 0);
  4477. layer.time_mix_g2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_G2, "weight", i), {n_lora_gate, n_embd}, 0);
  4478. layer.time_mix_lerp_fused = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_FUSED, "weight", i), {n_embd, 1, 1, 6}, 0);
  4479. layer.time_mix_k_k = create_tensor(tn(LLM_TENSOR_TIME_MIX_K_K, "weight", i), {attn_hidden_size}, 0);
  4480. layer.time_mix_k_a = create_tensor(tn(LLM_TENSOR_TIME_MIX_K_A, "weight", i), {attn_hidden_size}, 0);
  4481. layer.time_mix_r_k = create_tensor(tn(LLM_TENSOR_TIME_MIX_R_K, "weight", i), {attn_hidden_size}, 0);
  4482. layer.time_mix_key = create_tensor(tn(LLM_TENSOR_TIME_MIX_KEY, "weight", i), {attn_hidden_size, n_embd}, 0);
  4483. layer.time_mix_value = create_tensor(tn(LLM_TENSOR_TIME_MIX_VALUE, "weight", i), {attn_hidden_size, n_embd}, 0);
  4484. layer.time_mix_receptance = create_tensor(tn(LLM_TENSOR_TIME_MIX_RECEPTANCE, "weight", i), {attn_hidden_size, n_embd}, 0);
  4485. layer.time_mix_ln = create_tensor(tn(LLM_TENSOR_TIME_MIX_LN, "weight", i), {n_embd}, 0);
  4486. layer.time_mix_ln_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_LN, "bias", i), {n_embd}, 0);
  4487. layer.time_mix_output = create_tensor(tn(LLM_TENSOR_TIME_MIX_OUTPUT, "weight", i), {n_embd, attn_hidden_size}, 0);
  4488. layer.channel_mix_lerp_k = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_LERP_K, "weight", i), {n_embd, 1, 1}, 0);
  4489. layer.channel_mix_key = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_KEY, "weight", i), {n_embd, ffn_size}, 0);
  4490. layer.channel_mix_value = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_VALUE, "weight", i), {ffn_size, n_embd}, 0);
  4491. }
  4492. } break;
  4493. case LLM_ARCH_ARWKV7:
  4494. {
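// ARWKV7 reuses the RWKV7 time-mix tensor layout but pairs it with a gated feed-forward
// (ffn_gate/ffn_down/ffn_up) instead of the RWKV channel-mix; its gate (g1/g2) and
// group-norm (time_mix_ln) tensors are optional.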
  4495. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  4496. // output
  4497. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  4498. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  4499. const int n_lora_decay = hparams.n_lora_decay;
  4500. const int n_lora_iclr = hparams.n_lora_iclr;
  4501. const int n_lora_value_res_mix = hparams.n_lora_value_res_mix;
  4502. const int n_lora_gate = hparams.n_lora_gate;
  4503. const int attn_hidden_size = n_embd;
  4504. for (int i = 0; i < n_layer; ++i) {
  4505. auto & layer = layers[i];
  4506. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  4507. layer.time_mix_w0 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W0, "weight", i), {n_embd}, 0);
  4508. layer.time_mix_w1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W1, "weight", i), {n_embd, n_lora_decay}, 0);
  4509. layer.time_mix_w2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W2, "weight", i), {n_lora_decay, n_embd}, 0);
  4510. layer.time_mix_a0 = create_tensor(tn(LLM_TENSOR_TIME_MIX_A0, "weight", i), {n_embd}, 0);
  4511. layer.time_mix_a1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_A1, "weight", i), {n_embd, n_lora_iclr}, 0);
  4512. layer.time_mix_a2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_A2, "weight", i), {n_lora_iclr, n_embd}, 0);
  4513. if (i == 0) {
  4514. // actually not used
  4515. layer.time_mix_v0 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V0, "weight", i), {n_embd}, 0);
  4516. layer.time_mix_v1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V1, "weight", i), {n_embd, n_lora_iclr}, 0);
  4517. layer.time_mix_v2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V2, "weight", i), {n_lora_iclr, n_embd}, 0);
  4518. } else {
  4519. layer.time_mix_v0 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V0, "weight", i), {n_embd}, 0);
  4520. layer.time_mix_v1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V1, "weight", i), {n_embd, n_lora_value_res_mix}, 0);
  4521. layer.time_mix_v2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V2, "weight", i), {n_lora_value_res_mix, n_embd}, 0);
  4522. }
  4523. layer.time_mix_g1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_G1, "weight", i), {n_embd, n_lora_gate}, TENSOR_NOT_REQUIRED);
  4524. layer.time_mix_g2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_G2, "weight", i), {n_lora_gate, n_embd}, TENSOR_NOT_REQUIRED);
  4525. try {
  4526. layer.time_mix_lerp_fused = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_FUSED, "weight", i), {n_embd, 1, 1, 6}, 0);
  4527. } catch(std::runtime_error & e) {
4528. // ARWKV models without gate tensors store only 5 fused lerp components instead of 6
  4529. layer.time_mix_lerp_fused = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_FUSED, "weight", i), {n_embd, 1, 1, 5}, 0);
  4530. }
  4531. layer.time_mix_k_k = create_tensor(tn(LLM_TENSOR_TIME_MIX_K_K, "weight", i), {attn_hidden_size}, 0);
  4532. layer.time_mix_k_a = create_tensor(tn(LLM_TENSOR_TIME_MIX_K_A, "weight", i), {attn_hidden_size}, 0);
  4533. layer.time_mix_r_k = create_tensor(tn(LLM_TENSOR_TIME_MIX_R_K, "weight", i), {attn_hidden_size}, 0);
  4534. layer.time_mix_key = create_tensor(tn(LLM_TENSOR_TIME_MIX_KEY, "weight", i), {attn_hidden_size, n_embd}, 0);
  4535. layer.time_mix_value = create_tensor(tn(LLM_TENSOR_TIME_MIX_VALUE, "weight", i), {attn_hidden_size, n_embd}, 0);
  4536. layer.time_mix_receptance = create_tensor(tn(LLM_TENSOR_TIME_MIX_RECEPTANCE, "weight", i), {attn_hidden_size, n_embd}, 0);
  4537. layer.time_mix_ln = create_tensor(tn(LLM_TENSOR_TIME_MIX_LN, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
  4538. layer.time_mix_ln_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_LN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  4539. layer.time_mix_output = create_tensor(tn(LLM_TENSOR_TIME_MIX_OUTPUT, "weight", i), {n_embd, attn_hidden_size}, 0);
  4540. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  4541. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  4542. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  4543. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  4544. }
  4545. } break;
  4546. case LLM_ARCH_CHAMELEON:
  4547. {
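// Chameleon uses per-head q/k normalization: the norm weights (and optional biases) are
// stored as {n_embd_head_k, n_head} / {n_embd_head_k, n_head_kv}, so each head has its own scale.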
  4548. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  4549. // output
  4550. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  4551. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  4552. // if output is NULL, init from the input tok embed
  4553. if (output == NULL) {
  4554. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  4555. }
  4556. for (int i = 0; i < n_layer; ++i) {
  4557. auto & layer = layers[i];
  4558. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  4559. layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k, n_head}, 0);
  4560. layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k, n_head_kv}, 0);
  4561. layer.attn_q_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {n_embd_head_k, n_head}, TENSOR_NOT_REQUIRED);
  4562. layer.attn_k_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {n_embd_head_k, n_head_kv}, TENSOR_NOT_REQUIRED);
  4563. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  4564. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  4565. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  4566. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  4567. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  4568. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  4569. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  4570. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  4571. }
  4572. } break;
  4573. case LLM_ARCH_WAVTOKENIZER_DEC:
  4574. {
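// WavTokenizer decoder: token embeddings feed a 1d convolution, then a posnet stack
// (resnet / attention / norm blocks selected by layer index below), then a ConvNeXt
// stack, and finally a linear output head.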
  4575. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {hparams.n_embd_features, n_vocab}, 0);
  4576. conv1d = create_tensor(tn(LLM_TENSOR_CONV1D, "weight"), {7, hparams.n_embd_features, hparams.posnet.n_embd}, 0);
  4577. conv1d_b = create_tensor(tn(LLM_TENSOR_CONV1D, "bias"), {1, hparams.posnet.n_embd}, 0);
  4578. // posnet
  4579. {
  4580. const int64_t n_embd = hparams.posnet.n_embd;
  4581. for (uint32_t i = 0; i < hparams.posnet.n_layer; ++i) {
  4582. auto & layer = layers[i].posnet;
  4583. // posnet:
  4584. //
  4585. // - resnet
  4586. // - resnet
  4587. // - attn
  4588. // - resnet
  4589. // - resnet
  4590. // - norm
  4591. //
  4592. switch (i) {
  4593. case 0:
  4594. case 1:
  4595. case 3:
  4596. case 4:
  4597. {
  4598. layer.norm1 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "weight", i), {1, n_embd}, 0);
  4599. layer.norm1_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "bias", i), {1, n_embd}, 0);
  4600. layer.conv1 = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "weight", i), {3, n_embd, n_embd}, 0);
  4601. layer.conv1_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "bias", i), {1, n_embd}, 0);
  4602. layer.norm2 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "weight", i), {1, n_embd}, 0);
  4603. layer.norm2_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "bias", i), {1, n_embd}, 0);
  4604. layer.conv2 = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "weight", i), {3, n_embd, n_embd}, 0);
  4605. layer.conv2_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "bias", i), {1, n_embd}, 0);
  4606. } break;
  4607. case 2:
  4608. {
  4609. layer.attn_norm = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "weight", i), {1, n_embd}, 0);
  4610. layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "bias", i), {1, n_embd}, 0);
  4611. layer.attn_q = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_Q, "weight", i), {1, n_embd, n_embd}, 0);
  4612. layer.attn_q_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_Q, "bias", i), {1, n_embd}, 0);
  4613. layer.attn_k = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_K, "weight", i), {1, n_embd, n_embd}, 0);
  4614. layer.attn_k_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_K, "bias", i), {1, n_embd}, 0);
  4615. layer.attn_v = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_V, "weight", i), {1, n_embd, n_embd}, 0);
  4616. layer.attn_v_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_V, "bias", i), {1, n_embd}, 0);
  4617. layer.attn_o = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_OUT, "weight", i), {1, n_embd, n_embd}, 0);
  4618. layer.attn_o_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_OUT, "bias", i), {1, n_embd}, 0);
  4619. } break;
  4620. case 5:
  4621. {
4622. layer.norm = create_tensor(tn(LLM_TENSOR_POS_NET_NORM, "weight", i), {1, n_embd}, 0);
4623. layer.norm_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM, "bias", i), {1, n_embd}, 0);
  4624. } break;
  4625. default: GGML_ABORT("unknown posnet layer");
  4626. };
  4627. }
  4628. }
  4629. GGML_ASSERT(hparams.posnet.n_embd == hparams.convnext.n_embd);
  4630. tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {hparams.posnet.n_embd}, 0);
  4631. tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {hparams.posnet.n_embd}, 0);
  4632. // convnext
  4633. {
  4634. const int64_t n_embd = hparams.convnext.n_embd;
  4635. for (uint32_t i = 0; i < hparams.convnext.n_layer; ++i) {
  4636. auto & layer = layers[i].convnext;
  4637. layer.dw = create_tensor(tn(LLM_TENSOR_CONVNEXT_DW, "weight", i), {7, 1, n_embd}, 0);
  4638. layer.dw_b = create_tensor(tn(LLM_TENSOR_CONVNEXT_DW, "bias", i), {1, n_embd}, 0);
  4639. layer.norm = create_tensor(tn(LLM_TENSOR_CONVNEXT_NORM, "weight", i), {n_embd}, 0);
  4640. layer.norm_b = create_tensor(tn(LLM_TENSOR_CONVNEXT_NORM, "bias", i), {n_embd}, 0);
  4641. layer.pw1 = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW1, "weight", i), {n_embd, n_ff}, 0);
  4642. layer.pw1_b = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW1, "bias", i), {n_ff}, 0);
  4643. layer.pw2 = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW2, "weight", i), {n_ff, n_embd}, 0);
  4644. layer.pw2_b = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW2, "bias", i), {n_embd}, 0);
  4645. layer.gamma = create_tensor(tn(LLM_TENSOR_CONVNEXT_GAMMA, "weight", i), {n_embd}, 0);
  4646. }
  4647. // output
  4648. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  4649. output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
  4650. }
  4651. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {hparams.convnext.n_embd, n_embd}, 0);
  4652. output_b = create_tensor(tn(LLM_TENSOR_OUTPUT, "bias"), {n_embd}, 0);
  4653. } break;
  4654. case LLM_ARCH_BAILINGMOE:
  4655. {
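// BailingMoE: every layer is MoE; routed experts use n_ff_exp as the expert FFN width and
// the shared-expert projections are sized n_ff_exp * n_expert_shared.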
  4656. const int64_t n_ff_exp = hparams.n_ff_exp;
  4657. const int64_t n_expert_shared = hparams.n_expert_shared;
  4658. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  4659. // output
  4660. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  4661. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  4662. for (int i = 0; i < n_layer; ++i) {
  4663. auto & layer = layers[i];
  4664. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  4665. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_head * n_rot}, 0);
  4666. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_head_kv * n_rot}, 0);
  4667. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_head_kv * n_rot}, 0);
  4668. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_head * n_rot, n_embd}, 0);
  4669. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  4670. layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
  4671. if (n_expert == 0) {
  4672. throw std::runtime_error("n_expert must be > 0");
  4673. }
  4674. if (n_expert_used == 0) {
  4675. throw std::runtime_error("n_expert_used must be > 0");
  4676. }
  4677. layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
  4678. layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
  4679. layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
  4680. layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
  4681. layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_exp * n_expert_shared, n_embd}, 0);
  4682. layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
  4683. }
  4684. } break;
  4685. case LLM_ARCH_DOTS1:
  4686. {
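// dots1: the first n_layer_dense_lead layers use a dense FFN; the remaining layers load
// routed experts, a shared-expert branch and an optional expert-probability bias.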
  4687. const int64_t n_ff_exp = hparams.n_ff_exp;
  4688. const int64_t n_expert_shared = hparams.n_expert_shared;
  4689. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  4690. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  4691. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  4692. for (int i = 0; i < n_layer; ++i) {
  4693. auto & layer = layers[i];
  4694. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  4695. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
  4696. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
  4697. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
  4698. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
  4699. layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
  4700. layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
  4701. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  4702. if (i < (int) hparams.n_layer_dense_lead) {
  4703. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  4704. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  4705. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  4706. } else {
  4707. layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
  4708. layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, TENSOR_NOT_REQUIRED);
  4709. if (n_expert == 0) {
  4710. throw std::runtime_error("n_expert must be > 0");
  4711. }
  4712. if (n_expert_used == 0) {
  4713. throw std::runtime_error("n_expert_used must be > 0");
  4714. }
  4715. // MoE branch
  4716. layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
  4717. layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
  4718. layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
  4719. // Shared expert branch
  4720. layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
  4721. layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_exp * n_expert_shared, n_embd}, 0);
  4722. layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
  4723. }
  4724. }
  4725. } break;
  4726. case LLM_ARCH_ARCEE:
  4727. {
  4728. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  4729. // output
  4730. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  4731. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  4732. // if output is NULL, init from the input tok embed
  4733. if (output == NULL) {
  4734. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  4735. }
  4736. for (int i = 0; i < n_layer; ++i) {
  4737. auto & layer = layers[i];
  4738. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  4739. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
  4740. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  4741. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
  4742. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
  4743. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  4744. layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
  4745. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  4746. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  4747. }
  4748. } break;
  4749. case LLM_ARCH_ERNIE4_5:
  4750. case LLM_ARCH_ERNIE4_5_MOE:
  4751. {
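// ERNIE 4.5: the dense and MoE variants share this branch; layers past n_layer_dense_lead
// (MoE variant only) load routed experts plus an optional shared expert, all other layers
// load a standard gated FFN.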
  4752. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  4753. // output
  4754. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  4755. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  4756. // if output is NULL, init from the input tok embed
  4757. if (output == NULL) {
  4758. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  4759. }
  4760. for (int i = 0; i < n_layer; ++i) {
  4761. auto & layer = layers[i];
  4762. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  4763. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
  4764. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  4765. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  4766. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
  4767. // optional bias tensors
  4768. layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  4769. layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
  4770. layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
  4771. layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  4772. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  4773. if (arch == LLM_ARCH_ERNIE4_5_MOE && static_cast<uint32_t>(i) >= hparams.n_layer_dense_lead) { // MoE layers
  4774. int n_ff_exp = hparams.n_ff_exp;
  4775. layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
  4776. layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, TENSOR_NOT_REQUIRED);
  4777. layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, TENSOR_NOT_REQUIRED);
  4778. layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff_exp, n_embd, n_expert}, 0);
  4779. layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, 0);
  4780. // Shared expert (if present)
  4781. if (hparams.n_ff_shexp > 0) {
  4782. layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), { n_embd, hparams.n_ff_shexp}, 0);
  4783. layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {hparams.n_ff_shexp, n_embd }, 0);
  4784. layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), { n_embd, hparams.n_ff_shexp}, 0);
  4785. }
  4786. } else { // Dense layers
  4787. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  4788. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  4789. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  4790. }
  4791. }
  4792. } break;
  4793. case LLM_ARCH_FALCON_H1:
  4794. {
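// Falcon-H1 is a hybrid architecture: every layer carries both a Mamba-2 style SSM mixer
// (ssm_in/conv1d/dt/A/D/norm/out) and a regular attention block with a gated FFN.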
  4795. // Common
  4796. const int64_t hidden_size = hparams.n_embd; // hidden_size
  4797. // mamba2 Mixer SSM params
  4798. const int64_t ssm_conv_kernel_size = hparams.ssm_d_conv; // ssm_conv_kernel_size
  4799. const int64_t ssm_n_groups = hparams.ssm_n_group; // ssm_n_groups
  4800. const int64_t ssm_state_size = hparams.ssm_d_state; // ssm_state_size
  4801. const int64_t ssm_intermediate_size = hparams.ssm_d_inner; // TODO expand
  4802. const int64_t ssm_num_heads = hparams.ssm_dt_rank; // ssm_num_heads
  4803. const int64_t ssm_conv_dim = ssm_intermediate_size + 2 * ssm_n_groups * ssm_state_size;
  4804. const int64_t ssm_projection_size = ssm_intermediate_size + ssm_conv_dim + ssm_num_heads;
  4805. // attn params
4806. const int64_t attn_num_attention_head = hparams.n_head(0);
  4807. const int64_t attn_num_key_value_head = hparams.n_head_kv(0);
  4808. // ffn params
  4809. const int64_t ffn_intermediate_size = hparams.n_ff(0);
  4810. // embeddings
  4811. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {hidden_size, n_vocab}, 0);
  4812. // output
  4813. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {hidden_size, n_vocab}, TENSOR_NOT_REQUIRED);
  4814. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {hidden_size}, 0);
  4815. // if output is NULL, init from the input tok embed
  4816. if (output == NULL) {
  4817. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {hidden_size, n_vocab}, TENSOR_DUPLICATED);
  4818. }
  4819. for (int i = 0; i < n_layer; ++i) {
  4820. auto & layer = layers[i];
  4821. /*SSM LAYERS*/
  4822. // ssm in
  4823. layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), {hidden_size, ssm_projection_size}, 0);
  4824. // ssm 1d conv
  4825. layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {ssm_conv_kernel_size, ssm_conv_dim}, 0);
  4826. layer.ssm_conv1d_b = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "bias", i), {ssm_conv_dim}, TENSOR_NOT_REQUIRED);
  4827. // ssm_dt
  4828. layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {ssm_num_heads}, 0);
  4829. // no "weight" suffix for these
  4830. layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {1, ssm_num_heads}, 0);
  4831. layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {1, ssm_num_heads}, 0);
  4832. // ssm_norm
  4833. layer.ssm_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", i), {ssm_intermediate_size / ssm_n_groups, ssm_n_groups}, TENSOR_NOT_REQUIRED);
  4834. // out_proj
  4835. layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {ssm_intermediate_size, hidden_size}, 0);
  4836. /*ATTENTION LAYERS*/
  4837. // attention layers (with optional bias)
  4838. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {hidden_size, n_embd_head_k * attn_num_attention_head}, 0);
  4839. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {hidden_size, attn_num_key_value_head * n_embd_head_k}, 0);
  4840. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {hidden_size, attn_num_key_value_head * n_embd_head_v}, 0);
  4841. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * attn_num_attention_head, hidden_size}, 0);
  4842. layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {hidden_size}, TENSOR_NOT_REQUIRED);
  4843. layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {attn_num_key_value_head * n_embd_head_k}, TENSOR_NOT_REQUIRED);
  4844. layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {attn_num_key_value_head * n_embd_head_v}, TENSOR_NOT_REQUIRED);
  4845. layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {hidden_size}, TENSOR_NOT_REQUIRED);
  4846. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {hidden_size}, 0);
  4847. // feed forward (w/ optional biases)
4848. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {hidden_size}, 0);
  4849. layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
  4850. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {hidden_size, ffn_intermediate_size}, 0);
  4851. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { ffn_intermediate_size, hidden_size}, 0);
  4852. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {hidden_size, ffn_intermediate_size}, 0);
  4853. layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {ffn_intermediate_size}, TENSOR_NOT_REQUIRED);
  4854. layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {hidden_size}, TENSOR_NOT_REQUIRED);
  4855. layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {ffn_intermediate_size}, TENSOR_NOT_REQUIRED);
  4856. }
  4857. } break;
  4858. case LLM_ARCH_HUNYUAN_MOE:
  4859. {
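// Hunyuan MoE: per-head q/k norms, routed experts sized by n_ff and a shared expert sized
// by n_ff_shexp on every layer.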
  4860. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  4861. // output
  4862. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  4863. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  4864. // if output is NULL, init from the input tok embed
  4865. if (output == NULL) {
  4866. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  4867. }
  4868. for (int i = 0; i < n_layer; ++i) {
  4869. auto & layer = layers[i];
  4870. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  4871. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
  4872. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  4873. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
  4874. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
  4875. layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
  4876. layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
  4877. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  4878. layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
  4879. layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
  4880. layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert}, 0);
  4881. layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
  4882. layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, 0);
  4883. layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, 0);
  4884. layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {hparams.n_ff_shexp, n_embd}, 0);
  4885. }
  4886. } break;
  4887. case LLM_ARCH_HUNYUAN_DENSE:
  4888. {
  4889. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  4890. // output
  4891. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  4892. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  4893. // if output is NULL, init from the input tok embed
  4894. if (output == NULL) {
  4895. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  4896. }
  4897. for (int i = 0; i < n_layer; ++i) {
  4898. auto & layer = layers[i];
  4899. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  4900. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
  4901. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  4902. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
  4903. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
  4904. layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
  4905. layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
  4906. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  4907. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  4908. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  4909. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  4910. }
  4911. } break;
  4912. case LLM_ARCH_SMOLLM3:
  4913. {
  4914. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  4915. // output
  4916. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  4917. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  4918. // if output is NULL, init from the input tok embed
  4919. if (output == NULL) {
  4920. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  4921. }
  4922. for (int i = 0; i < n_layer; ++i) {
  4923. auto & layer = layers[i];
  4924. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  4925. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
  4926. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  4927. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
  4928. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
  4929. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  4930. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  4931. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  4932. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  4933. }
  4934. } break;
  4935. case LLM_ARCH_OPENAI_MOE:
  4936. {
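// OPENAI_MOE: attention carries learned sink logits (attn_sinks) and q/k/v/o biases;
// every layer's FFN is a routed-expert block with bias tensors for both the router and
// the expert projections.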
  4937. const int64_t n_ff_exp = hparams.n_ff_exp;
  4938. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  4939. // output
  4940. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  4941. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  4942. for (int i = 0; i < n_layer; ++i) {
  4943. auto & layer = layers[i];
  4944. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  4945. layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
  4946. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_head * n_rot}, 0);
  4947. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_head_kv * n_rot}, 0);
  4948. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_head_kv * n_rot}, 0);
  4949. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_head * n_rot, n_embd}, 0);
  4950. layer.attn_sinks = create_tensor(tn(LLM_TENSOR_ATTN_SINKS, "weight", i), {n_head}, 0);
  4951. layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), { n_embd, n_expert}, 0);
  4952. layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
  4953. layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
  4954. layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
  4955. // bias
  4956. layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_head * n_rot}, 0);
  4957. layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_head_kv * n_rot}, 0);
  4958. layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_head_kv * n_rot}, 0);
  4959. layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
  4960. layer.ffn_gate_inp_b = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "bias", i), {n_expert}, 0);
  4961. layer.ffn_gate_exps_b = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "bias", i), {n_ff_exp, n_expert}, 0);
  4962. layer.ffn_down_exps_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "bias", i), { n_embd, n_expert}, 0);
  4963. layer.ffn_up_exps_b = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "bias", i), {n_ff_exp, n_expert}, 0);
  4964. }
  4965. } break;
  4966. case LLM_ARCH_LFM2:
  4967. {
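// LFM2 mixes layer types: recurrent layers use a short-convolution block
// (shortconv.conv/in_proj/out_proj), the remaining layers use attention with per-head
// q/k norms; the FFN tensors are shared by both layer types.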
  4968. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  4969. tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0);
  4970. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  4971. if (output == NULL) {
  4972. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  4973. }
  4974. for (int i = 0; i < n_layer; ++i) {
  4975. auto & layer = layers[i];
  4976. // ffn is same for transformer and conv layers
  4977. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  4978. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  4979. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  4980. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  4981. // for operator_norm
  4982. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  4983. if (!hparams.is_recurrent(i)) {
  4984. layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
  4985. layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
  4986. GGML_ASSERT(n_embd_v_gqa == n_embd_k_gqa);
  4987. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  4988. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, hparams.n_embd_k_gqa(i)}, 0);
  4989. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, hparams.n_embd_v_gqa(i)}, 0);
  4990. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  4991. } else {
  4992. layer.shortconv.conv = create_tensor(tn(LLM_TENSOR_SHORTCONV_CONV, "weight", i), {hparams.n_shortconv_l_cache, n_embd}, 0);
  4993. layer.shortconv.in_proj = create_tensor(tn(LLM_TENSOR_SHORTCONV_INPROJ, "weight", i), {n_embd, 3 * n_embd}, 0);
  4994. layer.shortconv.out_proj = create_tensor(tn(LLM_TENSOR_SHORTCONV_OUTPROJ, "weight", i), {n_embd, n_embd}, 0);
  4995. }
  4996. }
  4997. } break;
  4998. case LLM_ARCH_SMALLTHINKER:
  4999. {
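// SmallThinker: all layers are MoE; routed experts are sized by n_ff_exp and the expert
// counts are validated up front with the asserts below.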
  5000. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
  5001. // output
  5002. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
  5003. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  5004. // if output is NULL, init from the input tok embed
  5005. if (output == NULL) {
  5006. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  5007. }
  5008. for (int i = 0; i < n_layer; ++i) {
  5009. auto & layer = layers[i];
  5010. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);
  5011. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd_head_k * n_head }, 0);
  5012. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_gqa }, 0);
  5013. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_gqa }, 0);
  5014. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, 0);
  5015. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd }, 0);
  5016. GGML_ASSERT(n_expert > 0 && "n_expert must be > 0 for SMALLTHINKER");
  5017. GGML_ASSERT(n_expert_used > 0 && "n_expert_used must be > 0 for SMALLTHINKER");
  5018. // MoE branch
  5019. const int64_t n_ff_exp = hparams.n_ff_exp;
  5020. layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), { n_embd, n_expert }, 0);
  5021. layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, 0);
  5022. layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff_exp, n_embd, n_expert }, 0);
  5023. layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, 0);
  5024. }
  5025. } break;
  5026. default:
  5027. throw std::runtime_error("unknown architecture");
  5028. }
  5029. if (n_moved_tensors > 0) {
  5030. LLAMA_LOG_DEBUG("%s: tensor '%s' (%s) (and %d others) cannot be used with preferred buffer type %s, using %s instead\n",
  5031. __func__, first_moved_tensor->name, ggml_type_name(first_moved_tensor->type), n_moved_tensors - 1,
  5032. ggml_backend_buft_name(first_moved_from_buft), ggml_backend_buft_name(first_moved_to_buft));
  5033. }
  5034. }
  5035. ml.done_getting_tensors();
  5036. ml.init_mappings(true, use_mlock ? &pimpl->mlock_mmaps : nullptr);
  5037. pimpl->mappings.reserve(ml.mappings.size());
  5038. // create the backend buffers
  5039. std::vector<std::pair<ggml_context *, llama_buf_map>> ctx_bufs;
  5040. ctx_bufs.reserve(ctx_map.size());
  5041. // Ensure we have enough capacity for the maximum backend buffer we will potentially create
  5042. const size_t n_max_backend_buffer = ctx_map.size() * ml.files.size();
  5043. pimpl->bufs.reserve(n_max_backend_buffer);
  5044. for (auto & it : ctx_map) {
  5045. ggml_backend_buffer_type_t buft = it.first;
  5046. ggml_context * ctx = it.second;
  5047. // skip contexts without tensors
  5048. if (ggml_get_first_tensor(ctx) == nullptr) {
  5049. continue;
  5050. }
  5051. llama_buf_map buf_map;
  5052. buf_map.reserve(n_max_backend_buffer);
  5053. // check if it is possible to use buffer_from_host_ptr with this buffer type
  5054. ggml_backend_dev_t dev = ggml_backend_buft_get_device(buft);
  5055. if (!dev) {
  5056. // FIXME: workaround for CPU backend buft having a NULL device
  5057. dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
  5058. if (!dev) {
  5059. throw std::runtime_error(format("%s: no CPU backend found", __func__));
  5060. }
  5061. }
  5062. ggml_backend_dev_props props;
  5063. ggml_backend_dev_get_props(dev, &props);
  5064. bool buffer_from_host_ptr_supported = props.caps.buffer_from_host_ptr;
  5065. bool is_default_buft = buft == ggml_backend_dev_buffer_type(dev);
  5066. if (ml.use_mmap && use_mmap_buffer && buffer_from_host_ptr_supported && is_default_buft) {
  5067. for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
  5068. // only the mmap region containing the tensors in the model is mapped to the backend buffer
  5069. // this is important for metal with apple silicon: if the entire model could be mapped to a metal buffer, then we could just use metal for all layers
  5070. // this allows using partial offloading when the model size exceeds the metal buffer size, but not the RAM size
  5071. void * addr = nullptr;
  5072. size_t first, last; // NOLINT
  5073. ml.get_mapping_range(&first, &last, &addr, idx, ctx);
  5074. if (first >= last) {
  5075. continue;
  5076. }
  5077. const size_t max_size = ggml_get_max_tensor_size(ctx);
  5078. ggml_backend_buffer_t buf = ggml_backend_dev_buffer_from_host_ptr(dev, (char *) addr + first, last - first, max_size);
  5079. if (buf == nullptr) {
  5080. throw std::runtime_error(format("unable to allocate %s buffer", ggml_backend_buft_name(buft)));
  5081. }
  5082. pimpl->bufs.emplace_back(buf);
  5083. buf_map.emplace(idx, buf);
  5084. }
  5085. }
  5086. else {
  5087. ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
  5088. if (buf == nullptr) {
  5089. throw std::runtime_error(format("unable to allocate %s buffer", ggml_backend_buft_name(buft)));
  5090. }
  5091. pimpl->bufs.emplace_back(buf);
  5092. if (use_mlock && ggml_backend_buffer_is_host(buf)) {
  5093. pimpl->mlock_bufs.emplace_back(new llama_mlock);
  5094. auto & mlock_buf = pimpl->mlock_bufs.back();
  5095. mlock_buf->init (ggml_backend_buffer_get_base(buf));
  5096. mlock_buf->grow_to(ggml_backend_buffer_get_size(buf));
  5097. }
  5098. for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
  5099. buf_map.emplace(idx, buf);
  5100. }
  5101. }
  5102. if (pimpl->bufs.empty()) {
  5103. throw std::runtime_error("failed to allocate buffer");
  5104. }
  5105. for (auto & buf : buf_map) {
  5106. // indicate that this buffer contains weights
  5107. // this is used by ggml_backend_sched to improve op scheduling: ops that use a weight are preferably scheduled to the backend that contains the weight
  5108. ggml_backend_buffer_set_usage(buf.second, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
  5109. }
  5110. ctx_bufs.emplace_back(ctx, buf_map);
  5111. }
  5112. if (llama_supports_gpu_offload()) {
  5113. const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
  5114. LLAMA_LOG_INFO("%s: offloading %d repeating layers to GPU\n", __func__, n_gpu);
  5115. if (n_gpu_layers > (int) hparams.n_layer) {
  5116. LLAMA_LOG_INFO("%s: offloading output layer to GPU\n", __func__);
  5117. }
  5118. const int max_backend_supported_layers = hparams.n_layer + 1;
  5119. const int max_offloadable_layers = hparams.n_layer + 1;
  5120. LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n", __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers);
  5121. }
  5122. // print memory requirements per buffer type
  5123. for (auto & buf : pimpl->bufs) {
  5124. LLAMA_LOG_INFO("%s: %12s model buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf.get()), ggml_backend_buffer_get_size(buf.get()) / 1024.0 / 1024.0);
  5125. }
  5126. // populate tensors_by_name
  5127. for (auto & ctx : pimpl->ctxs) {
  5128. for (auto * cur = ggml_get_first_tensor(ctx.get()); cur != NULL; cur = ggml_get_next_tensor(ctx.get(), cur)) {
  5129. tensors_by_name.emplace_back(ggml_get_name(cur), cur);
  5130. }
  5131. }
  5132. // load tensor data
  5133. for (auto & it : ctx_bufs) {
  5134. ggml_context * ctx = it.first;
  5135. auto & bufs = it.second;
  5136. if (!ml.load_all_data(ctx, bufs, use_mlock ? &pimpl->mlock_mmaps : NULL, params.progress_callback, params.progress_callback_user_data)) {
  5137. return false;
  5138. }
  5139. }
  5140. if (use_mmap_buffer) {
  5141. for (auto & mapping : ml.mappings) {
  5142. pimpl->mappings.emplace_back(std::move(mapping));
  5143. }
  5144. }
  5145. return true;
  5146. }
  5147. std::string llama_model::arch_name() const {
  5148. return llm_arch_name(arch);
  5149. }
  5150. std::string llama_model::type_name() const {
  5151. return llm_type_name(type);
  5152. }
  5153. std::string llama_model::desc() const {
  5154. return pimpl->desc_str;
  5155. }
  5156. size_t llama_model::size() const {
  5157. return pimpl->n_bytes;
  5158. }
  5159. size_t llama_model::n_tensors() const {
  5160. return tensors_by_name.size();
  5161. }
  5162. size_t llama_model::n_devices() const {
  5163. return devices.size();
  5164. }
  5165. uint64_t llama_model::n_elements() const {
  5166. return pimpl->n_elements;
  5167. }
  5168. void llama_model::print_info() const {
  5169. const std::string rope_scaling_type = llama_rope_scaling_type_name(hparams.rope_scaling_type_train);
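// helper: format a per-layer hparam as a single value when it is uniform across layers,
// or as a "[v0, v1, ...]" list when it varies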
  5170. auto print_f = [](const std::function<uint32_t(uint32_t)> & f, uint32_t n) {
  5171. bool is_var = false;
  5172. std::vector<uint32_t> v;
  5173. for (uint32_t i = 0; i < n; ++i) {
  5174. v.push_back(f(i));
  5175. if (v[i] != v[0]) {
  5176. is_var = true;
  5177. }
  5178. }
  5179. std::stringstream ss;
  5180. if (is_var) {
  5181. ss << "[";
  5182. for (uint32_t i = 0; i < n; ++i) {
  5183. ss << v[i];
  5184. if (i < n - 1) {
  5185. ss << ", ";
  5186. }
  5187. }
  5188. ss << "]";
  5189. } else {
  5190. ss << v[0];
  5191. }
  5192. return ss.str();
  5193. };
  5194. // hparams
  5195. LLAMA_LOG_INFO("%s: arch = %s\n", __func__, arch_name().c_str());
  5196. LLAMA_LOG_INFO("%s: vocab_only = %d\n", __func__, hparams.vocab_only);
  5197. if (!hparams.vocab_only) {
  5198. LLAMA_LOG_INFO("%s: n_ctx_train = %u\n", __func__, hparams.n_ctx_train);
  5199. LLAMA_LOG_INFO("%s: n_embd = %u\n", __func__, hparams.n_embd);
  5200. LLAMA_LOG_INFO("%s: n_layer = %u\n", __func__, hparams.n_layer);
  5201. LLAMA_LOG_INFO("%s: n_head = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_head(il); }, hparams.n_layer).c_str());
  5202. LLAMA_LOG_INFO("%s: n_head_kv = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_head_kv(il); }, hparams.n_layer).c_str());
  5203. LLAMA_LOG_INFO("%s: n_rot = %u\n", __func__, hparams.n_rot);
  5204. LLAMA_LOG_INFO("%s: n_swa = %u\n", __func__, hparams.n_swa);
  5205. LLAMA_LOG_INFO("%s: is_swa_any = %u\n", __func__, hparams.is_swa_any());
  5206. LLAMA_LOG_INFO("%s: n_embd_head_k = %u\n", __func__, hparams.n_embd_head_k);
  5207. LLAMA_LOG_INFO("%s: n_embd_head_v = %u\n", __func__, hparams.n_embd_head_v);
  5208. LLAMA_LOG_INFO("%s: n_gqa = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_gqa(il); }, hparams.n_layer).c_str());
  5209. LLAMA_LOG_INFO("%s: n_embd_k_gqa = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_embd_k_gqa(il); }, hparams.n_layer).c_str());
  5210. LLAMA_LOG_INFO("%s: n_embd_v_gqa = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_embd_v_gqa(il); }, hparams.n_layer).c_str());
  5211. LLAMA_LOG_INFO("%s: f_norm_eps = %.1e\n", __func__, hparams.f_norm_eps);
  5212. LLAMA_LOG_INFO("%s: f_norm_rms_eps = %.1e\n", __func__, hparams.f_norm_rms_eps);
  5213. LLAMA_LOG_INFO("%s: f_clamp_kqv = %.1e\n", __func__, hparams.f_clamp_kqv);
  5214. LLAMA_LOG_INFO("%s: f_max_alibi_bias = %.1e\n", __func__, hparams.f_max_alibi_bias);
  5215. LLAMA_LOG_INFO("%s: f_logit_scale = %.1e\n", __func__, hparams.f_logit_scale);
  5216. LLAMA_LOG_INFO("%s: f_attn_scale = %.1e\n", __func__, hparams.f_attention_scale);
  5217. LLAMA_LOG_INFO("%s: n_ff = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_ff(il); }, hparams.n_layer).c_str());
  5218. LLAMA_LOG_INFO("%s: n_expert = %u\n", __func__, hparams.n_expert);
  5219. LLAMA_LOG_INFO("%s: n_expert_used = %u\n", __func__, hparams.n_expert_used);
  5220. LLAMA_LOG_INFO("%s: causal attn = %d\n", __func__, hparams.causal_attn);
  5221. LLAMA_LOG_INFO("%s: pooling type = %d\n", __func__, hparams.pooling_type);
  5222. LLAMA_LOG_INFO("%s: rope type = %d\n", __func__, hparams.rope_type);
  5223. LLAMA_LOG_INFO("%s: rope scaling = %s\n", __func__, rope_scaling_type.c_str());
  5224. LLAMA_LOG_INFO("%s: freq_base_train = %.1f\n", __func__, hparams.rope_freq_base_train);
  5225. LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train);
  5226. LLAMA_LOG_INFO("%s: n_ctx_orig_yarn = %u\n", __func__, hparams.n_ctx_orig_yarn);
  5227. LLAMA_LOG_INFO("%s: rope_finetuned = %s\n", __func__, hparams.rope_finetuned ? "yes" : "unknown");
  5228. if (!classifier_labels.empty()) {
  5229. LLAMA_LOG_INFO("%s: n_cls_out = %u\n", __func__, hparams.n_cls_out);
  5230. size_t i = 0;
  5231. for (auto label : classifier_labels) {
  5232. LLAMA_LOG_INFO("%s: cls_label[%2zu] = %s\n", __func__, i++, label.c_str());
  5233. }
  5234. }
  5235. }
  5236. if (arch == LLM_ARCH_MAMBA ||
  5237. arch == LLM_ARCH_MAMBA2 ||
  5238. arch == LLM_ARCH_JAMBA ||
  5239. arch == LLM_ARCH_FALCON_H1 ||
  5240. arch == LLM_ARCH_PLAMO2 ||
  5241. arch == LLM_ARCH_GRANITE_HYBRID ||
  5242. arch == LLM_ARCH_NEMOTRON_H ||
  5243. arch == LLM_ARCH_QWEN3NEXT) {
  5244. LLAMA_LOG_INFO("%s: ssm_d_conv = %u\n", __func__, hparams.ssm_d_conv);
  5245. LLAMA_LOG_INFO("%s: ssm_d_inner = %u\n", __func__, hparams.ssm_d_inner);
  5246. LLAMA_LOG_INFO("%s: ssm_d_state = %u\n", __func__, hparams.ssm_d_state);
  5247. LLAMA_LOG_INFO("%s: ssm_dt_rank = %u\n", __func__, hparams.ssm_dt_rank);
  5248. LLAMA_LOG_INFO("%s: ssm_n_group = %u\n", __func__, hparams.ssm_n_group);
  5249. LLAMA_LOG_INFO("%s: ssm_dt_b_c_rms = %d\n", __func__, hparams.ssm_dt_b_c_rms);
  5250. }
  5251. LLAMA_LOG_INFO("%s: model type = %s\n", __func__, type_name().c_str());
  5252. if (pimpl->n_elements >= 1e12) {
  5253. LLAMA_LOG_INFO("%s: model params = %.2f T\n", __func__, pimpl->n_elements*1e-12);
  5254. } else if (pimpl->n_elements >= 1e9) {
  5255. LLAMA_LOG_INFO("%s: model params = %.2f B\n", __func__, pimpl->n_elements*1e-9);
  5256. } else if (pimpl->n_elements >= 1e6) {
  5257. LLAMA_LOG_INFO("%s: model params = %.2f M\n", __func__, pimpl->n_elements*1e-6);
  5258. } else {
  5259. LLAMA_LOG_INFO("%s: model params = %.2f K\n", __func__, pimpl->n_elements*1e-3);
  5260. }
  5261. // general kv
  5262. LLAMA_LOG_INFO("%s: general.name = %s\n", __func__, name.c_str());
  5263. if (arch == LLM_ARCH_DEEPSEEK) {
  5264. LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
  5265. LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
  5266. LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
  5267. LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
  5268. }
  5269. if (arch == LLM_ARCH_DEEPSEEK2) {
  5270. LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
  5271. LLAMA_LOG_INFO("%s: n_lora_q = %d\n", __func__, hparams.n_lora_q);
  5272. LLAMA_LOG_INFO("%s: n_lora_kv = %d\n", __func__, hparams.n_lora_kv);
  5273. LLAMA_LOG_INFO("%s: n_embd_head_k_mla = %d\n", __func__, hparams.n_embd_head_k_mla);
  5274. LLAMA_LOG_INFO("%s: n_embd_head_v_mla = %d\n", __func__, hparams.n_embd_head_v_mla);
  5275. LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
  5276. LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
  5277. LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
  5278. LLAMA_LOG_INFO("%s: expert_weights_norm = %d\n", __func__, hparams.expert_weights_norm);
  5279. LLAMA_LOG_INFO("%s: expert_gating_func = %s\n", __func__, llama_expert_gating_func_name((llama_expert_gating_func_type) hparams.expert_gating_func));
  5280. LLAMA_LOG_INFO("%s: rope_yarn_log_mul = %.4f\n", __func__, hparams.rope_yarn_log_mul);
  5281. }
  5282. if (arch == LLM_ARCH_QWEN2MOE) {
  5283. LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
  5284. LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
  5285. }
  5286. if (arch == LLM_ARCH_QWEN3MOE || arch == LLM_ARCH_OPENAI_MOE) {
  5287. LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
  5288. }
  5289. if (arch == LLM_ARCH_MINICPM ||
  5290. arch == LLM_ARCH_GRANITE ||
  5291. arch == LLM_ARCH_GRANITE_MOE ||
  5292. arch == LLM_ARCH_GRANITE_HYBRID) {
  5293. LLAMA_LOG_INFO("%s: f_embedding_scale = %f\n", __func__, hparams.f_embedding_scale);
  5294. LLAMA_LOG_INFO("%s: f_residual_scale = %f\n", __func__, hparams.f_residual_scale);
  5295. LLAMA_LOG_INFO("%s: f_attention_scale = %f\n", __func__, hparams.f_attention_scale);
  5296. LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
  5297. }
  5298. if (arch == LLM_ARCH_BAILINGMOE) {
  5299. LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
  5300. LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
  5301. LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
  5302. LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
  5303. LLAMA_LOG_INFO("%s: expert_weights_norm = %d\n", __func__, hparams.expert_weights_norm);
  5304. }
  5305. if (arch == LLM_ARCH_SMALLTHINKER) {
  5306. LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
  5307. LLAMA_LOG_INFO("%s: expert_gating_func = %s\n", __func__, llama_expert_gating_func_name((llama_expert_gating_func_type) hparams.expert_gating_func));
  5308. }
  5309. vocab.print_info();
  5310. }
  5311. ggml_backend_dev_t llama_model::dev_layer(int il) const {
  5312. return pimpl->dev_layer.at(il).dev;
  5313. }
  5314. ggml_backend_dev_t llama_model::dev_output() const {
  5315. return pimpl->dev_output.dev;
  5316. }
  5317. template<typename F>
  5318. static bool buft_supported(ggml_backend_buffer_type_t buft, ggml_backend_dev_t dev, F & fn) {
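// builds a throw-away op via fn() in a no-alloc context, points its sources at a
// zero-sized buffer of the candidate type, and asks the device whether it supports the op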
  5319. ggml_init_params params = {
  5320. /*.mem_size =*/ ggml_tensor_overhead()*8,
  5321. /*.mem_buffer =*/ NULL,
  5322. /*.no_alloc =*/ true,
  5323. };
  5324. ggml_context_ptr ctx { ggml_init(params) };
  5325. if (!ctx) {
  5326. throw std::runtime_error(format("failed to create ggml context"));
  5327. }
  5328. ggml_backend_buffer_ptr buf { ggml_backend_buft_alloc_buffer(buft, 0) };
  5329. ggml_tensor * op_tensor = fn(ctx.get());
  5330. for (int i = 0; i < GGML_MAX_SRC; i++) {
  5331. if (op_tensor->src[i] != nullptr) {
  5332. assert(op_tensor->src[i]->buffer == nullptr);
  5333. op_tensor->src[i]->buffer = buf.get();
  5334. }
  5335. }
  5336. bool op_supported = ggml_backend_dev_supports_op(dev, op_tensor);
  5337. return op_supported;
  5338. }
template<typename F>
static ggml_backend_buffer_type_t select_buft(const buft_list_t & buft_list, const F & fn) {
    for (const auto & cur : buft_list) {
        ggml_backend_dev_t cur_dev = cur.first;
        ggml_backend_buffer_type_t cur_buft = cur.second;
        if (buft_supported(cur_buft, cur_dev, fn)) {
            return cur_buft;
        }
    }

    throw std::runtime_error(format("no suitable buffer type found"));
}
ggml_backend_buffer_type_t llama_model::select_buft(int il) const {
    return ::select_buft(
            *pimpl->dev_layer.at(il).buft_list,
            [&](ggml_context * ctx) {
                ggml_tensor * cur = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hparams.n_embd);
                ggml_tensor * layer_dir = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hparams.n_embd);
                return ggml_add(ctx, cur, layer_dir);
            });
}
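// Illustrative sketch (not part of the upstream code): ::select_buft() walks the
// (device, buffer type) list in priority order and returns the first buffer type
// whose device can run the probe op built by `fn`. A hypothetical caller that
// needs a buffer type able to hold the product of two n_embd vectors could pass
// a different probe lambda, e.g.:
//
//   ggml_backend_buffer_type_t buft = ::select_buft(buft_list, [&](ggml_context * ctx) {
//       ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hparams.n_embd);
//       ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hparams.n_embd);
//       return ggml_mul(ctx, a, b);
//   });
//
// The member overload above uses a ggml_add probe, which is what the per-layer
// buffer selection needs.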
bool llama_model::has_tensor_overrides() const {
    return pimpl->has_tensor_overrides;
}

const ggml_tensor * llama_model::get_tensor(const char * name) const {
    auto it = std::find_if(tensors_by_name.begin(), tensors_by_name.end(),
            [name](const std::pair<std::string, ggml_tensor *> & it) {
                return it.first == name;
            });
    if (it == tensors_by_name.end()) {
        return nullptr;
    }

    return it->second;
}
float llama_model::get_rope_freq_base (const llama_cparams & cparams, int il) const {
    return hparams.is_swa(il) ? hparams.rope_freq_base_train_swa : cparams.rope_freq_base;
}

float llama_model::get_rope_freq_scale(const llama_cparams & cparams, int il) const {
    return hparams.is_swa(il) ? hparams.rope_freq_scale_train_swa : cparams.rope_freq_scale;
}

ggml_tensor * llama_model::get_rope_factors(const llama_cparams & cparams, int il) const {
    const uint32_t n_ctx_per_seq = cparams.n_ctx / cparams.n_seq_max;

    // choose long/short freq factors based on the context size
    if (layers[il].rope_freqs != nullptr) {
        return layers[il].rope_freqs;
    }

    if (n_ctx_per_seq > hparams.n_ctx_orig_yarn) {
        return layers[il].rope_long;
    }

    return layers[il].rope_short;
}
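// Worked example (illustrative only): for a model trained with
// n_ctx_orig_yarn = 4096 that ships separate long/short factor tensors, a run
// with cparams.n_ctx = 16384 and cparams.n_seq_max = 1 gives
// n_ctx_per_seq = 16384 > 4096, so rope_long is returned; with n_ctx = 4096 the
// short factors are used instead. Models that provide a single rope_freqs
// tensor always return it, regardless of the context size.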
llama_memory_i * llama_model::create_memory(const llama_memory_params & params, llama_cparams & cparams) const {
    llama_memory_i * res;

    switch (arch) {
        // Models that need specific instantiation should be handled in the
        // switch statement
        case LLM_ARCH_BERT:
        case LLM_ARCH_JINA_BERT_V2:
        case LLM_ARCH_JINA_BERT_V3:
        case LLM_ARCH_NOMIC_BERT:
        case LLM_ARCH_NOMIC_BERT_MOE:
        case LLM_ARCH_NEO_BERT:
        case LLM_ARCH_WAVTOKENIZER_DEC:
        //case LLM_ARCH_GEMMA_EMBEDDING: // TODO: disabled until the cacheless SWA logic is fixed [TAG_NO_CACHE_ISWA]
        case LLM_ARCH_DREAM:
        case LLM_ARCH_LLADA:
        case LLM_ARCH_LLADA_MOE:
            {
                res = nullptr;
            } break;
        // Models that need standard caching should rely on recurrent/hybrid
        // checks
        default:
            {
                if (llm_arch_is_recurrent(arch)) {
                    res = new llama_memory_recurrent(
                            *this,
                            GGML_TYPE_F32,
                            GGML_TYPE_F32,
                            cparams.offload_kqv,
                            std::max((uint32_t) 1, cparams.n_seq_max),
                            cparams.n_seq_max,
                            nullptr);
                } else if (llm_arch_is_hybrid(arch)) {
                    // The main difference between hybrid architectures is the
                    // layer filters, so pick the right one here
                    llama_memory_hybrid::layer_filter_cb filter_attn = nullptr;
                    llama_memory_hybrid::layer_filter_cb filter_recr = nullptr;

                    if (arch == LLM_ARCH_FALCON_H1) {
                        filter_attn = [&](int32_t) { return true; };
                        filter_recr = [&](int32_t) { return true; };
                    } else if (arch == LLM_ARCH_NEMOTRON_H) {
                        filter_attn = [&](int32_t il) {
                            return !hparams.is_recurrent(il) && hparams.n_ff(il) == 0;
                        };
                        filter_recr = [&](int32_t il) {
                            return hparams.is_recurrent(il) && hparams.n_ff(il) == 0;
                        };
                    }

                    const auto padding = llama_kv_cache::get_padding(cparams);

                    cparams.n_ctx = GGML_PAD(cparams.n_ctx, padding);

                    res = new llama_memory_hybrid(
                            /* model             */ *this,
                            /* attn_type_k       */ params.type_k,
                            /* attn_type_v       */ params.type_v,
                            /* attn_v_trans      */ !cparams.flash_attn,
                            /* attn_kv_size      */ cparams.n_ctx,
                            /* attn_n_pad        */ padding,
                            /* attn_n_swa        */ hparams.n_swa,
                            /* attn_swa_type     */ hparams.swa_type,
                            /* recurrent_type_k  */ GGML_TYPE_F32,
                            /* recurrent_type_v  */ GGML_TYPE_F32,
                            /* recurrent_kv_size */ std::max((uint32_t) 1, cparams.n_seq_max),
                            /* n_seq_max         */ cparams.n_seq_max,
                            /* offload           */ cparams.offload_kqv,
                            /* unified           */ cparams.kv_unified,
                            /* filter_attn       */ std::move(filter_attn),
                            /* filter_recr       */ std::move(filter_recr));
                } else {
                    const auto padding = llama_kv_cache::get_padding(cparams);

                    uint32_t n_ctx_per_stream = cparams.n_ctx;

                    if (!cparams.kv_unified) {
                        n_ctx_per_stream = (cparams.n_ctx + cparams.n_seq_max - 1)/cparams.n_seq_max;
                        n_ctx_per_stream = GGML_PAD(n_ctx_per_stream, padding);

                        cparams.n_ctx = n_ctx_per_stream*cparams.n_seq_max;
                    } else {
                        n_ctx_per_stream = GGML_PAD(n_ctx_per_stream, padding);

                        cparams.n_ctx = n_ctx_per_stream;
                    }

                    LLAMA_LOG_DEBUG("%s: n_ctx = %u (padded)\n", __func__, cparams.n_ctx);

                    llama_memory_i::layer_reuse_cb reuse = nullptr;

                    if (arch == LLM_ARCH_GEMMA3N) {
                        reuse = [&](int32_t il) {
                            if (il >= (int32_t) hparams.n_layer_kv_from_start) {
                                return (int32_t) hparams.n_layer_kv_from_start - (hparams.is_swa(il) ? 2 : 1);
                            }

                            return -1;
                        };
                    }

                    if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
                        GGML_ASSERT(hparams.is_swa_any());

                        res = new llama_kv_cache_iswa(
                                *this,
                                params.type_k,
                                params.type_v,
                                !cparams.flash_attn,
                                cparams.offload_kqv,
                                params.swa_full,
                                cparams.kv_unified,
                                n_ctx_per_stream,
                                cparams.n_seq_max,
                                cparams.n_ubatch,
                                padding,
                                nullptr,
                                reuse);
                    } else {
                        GGML_ASSERT(!hparams.is_swa_any());

                        res = new llama_kv_cache(
                                *this,
                                params.type_k,
                                params.type_v,
                                !cparams.flash_attn,
                                cparams.offload_kqv,
                                cparams.kv_unified,
                                n_ctx_per_stream,
                                cparams.n_seq_max,
                                padding,
                                hparams.n_swa,
                                hparams.swa_type,
                                nullptr,
                                nullptr);
                    }
                }
            }
    }

    return res;
}
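// Summary of the decision above (descriptive comment, not upstream documentation):
//   - cacheless architectures (BERT-style encoders, diffusion LMs, WavTokenizer) -> no memory (nullptr)
//   - recurrent architectures                                                    -> llama_memory_recurrent
//   - hybrid attention/recurrent architectures (Falcon-H1, Nemotron-H, ...)      -> llama_memory_hybrid with per-layer filters
//   - everything else                                                            -> llama_kv_cache_iswa when sliding-window attention is used,
//                                                                                    plain llama_kv_cache otherwise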
// construct the compute graph by instantiating the llm_build_* graph builder
// that corresponds to the model architecture
ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
    std::unique_ptr<llm_graph_context> llm;

    switch (arch) {
        case LLM_ARCH_LLAMA:
            {
                llm = std::make_unique<llm_build_llama>(*this, params);
            } break;
        case LLM_ARCH_LLAMA4:
            {
                if (hparams.swa_type == LLAMA_SWA_TYPE_NONE) {
                    llm = std::make_unique<llm_build_llama>(*this, params);
                } else {
                    llm = std::make_unique<llm_build_llama_iswa>(*this, params);
                }
            } break;
        case LLM_ARCH_DECI:
            {
                llm = std::make_unique<llm_build_deci>(*this, params);
            } break;
        case LLM_ARCH_BAICHUAN:
            {
                llm = std::make_unique<llm_build_baichuan>(*this, params);
            } break;
        case LLM_ARCH_FALCON:
            {
                llm = std::make_unique<llm_build_falcon>(*this, params);
            } break;
        case LLM_ARCH_GROK:
            {
                llm = std::make_unique<llm_build_grok>(*this, params);
            } break;
        case LLM_ARCH_STARCODER:
            {
                llm = std::make_unique<llm_build_starcoder>(*this, params);
            } break;
        case LLM_ARCH_REFACT:
            {
                llm = std::make_unique<llm_build_refact>(*this, params);
            } break;
        case LLM_ARCH_BERT:
        case LLM_ARCH_JINA_BERT_V2:
        case LLM_ARCH_JINA_BERT_V3:
        case LLM_ARCH_NOMIC_BERT:
        case LLM_ARCH_NOMIC_BERT_MOE:
            {
                llm = std::make_unique<llm_build_bert>(*this, params);
            } break;
        case LLM_ARCH_NEO_BERT:
            {
                llm = std::make_unique<llm_build_neo_bert>(*this, params);
            } break;
        case LLM_ARCH_BLOOM:
            {
                llm = std::make_unique<llm_build_bloom>(*this, params);
            } break;
        case LLM_ARCH_MPT:
            {
                llm = std::make_unique<llm_build_mpt>(*this, params);
            } break;
        case LLM_ARCH_STABLELM:
            {
                llm = std::make_unique<llm_build_stablelm>(*this, params);
            } break;
        case LLM_ARCH_QWEN:
            {
                llm = std::make_unique<llm_build_qwen>(*this, params);
            } break;
        case LLM_ARCH_QWEN2:
            {
                llm = std::make_unique<llm_build_qwen2>(*this, params);
            } break;
        case LLM_ARCH_DREAM:
            {
                llm = std::make_unique<llm_build_dream>(*this, params);
            } break;
        case LLM_ARCH_LLADA:
            {
                llm = std::make_unique<llm_build_llada>(*this, params);
            } break;
        case LLM_ARCH_LLADA_MOE:
            {
                llm = std::make_unique<llm_build_llada_moe>(*this, params);
            } break;
        case LLM_ARCH_QWEN2VL:
            {
                llm = std::make_unique<llm_build_qwen2vl>(*this, params);
            } break;
        case LLM_ARCH_QWEN2MOE:
            {
                llm = std::make_unique<llm_build_qwen2moe>(*this, params);
            } break;
        case LLM_ARCH_QWEN3:
            {
                llm = std::make_unique<llm_build_qwen3>(*this, params);
            } break;
        case LLM_ARCH_QWEN3MOE:
            {
                llm = std::make_unique<llm_build_qwen3moe>(*this, params);
            } break;
        case LLM_ARCH_PHI2:
            {
                llm = std::make_unique<llm_build_phi2>(*this, params);
            } break;
        case LLM_ARCH_PHI3:
        case LLM_ARCH_PHIMOE:
            {
                if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
                    llm = std::make_unique<llm_build_phi3<true>> (*this, params);
                } else {
                    llm = std::make_unique<llm_build_phi3<false>>(*this, params);
                }
            } break;
        case LLM_ARCH_PLAMO:
            {
                llm = std::make_unique<llm_build_plamo>(*this, params);
            } break;
        case LLM_ARCH_PLAMO2:
            {
                llm = std::make_unique<llm_build_plamo2>(*this, params);
            } break;
        case LLM_ARCH_GPT2:
            {
                llm = std::make_unique<llm_build_gpt2>(*this, params);
            } break;
        case LLM_ARCH_CODESHELL:
            {
                llm = std::make_unique<llm_build_codeshell>(*this, params);
            } break;
        case LLM_ARCH_ORION:
            {
                llm = std::make_unique<llm_build_orion>(*this, params);
            } break;
        case LLM_ARCH_INTERNLM2:
            {
                llm = std::make_unique<llm_build_internlm2>(*this, params);
            } break;
        case LLM_ARCH_MINICPM3:
            {
                llm = std::make_unique<llm_build_minicpm3>(*this, params);
            } break;
        case LLM_ARCH_GEMMA:
            {
                llm = std::make_unique<llm_build_gemma>(*this, params);
            } break;
        case LLM_ARCH_GEMMA2:
            {
                llm = std::make_unique<llm_build_gemma2_iswa>(*this, params);
            } break;
        case LLM_ARCH_GEMMA3:
            {
                llm = std::make_unique<llm_build_gemma3_iswa>(*this, params);
            } break;
        case LLM_ARCH_GEMMA3N:
            {
                llm = std::make_unique<llm_build_gemma3n_iswa>(*this, params);
            } break;
        case LLM_ARCH_GEMMA_EMBEDDING:
            {
                llm = std::make_unique<llm_build_gemma_embedding_iswa>(*this, params);
            } break;
        case LLM_ARCH_STARCODER2:
            {
                llm = std::make_unique<llm_build_starcoder2>(*this, params);
            } break;
        case LLM_ARCH_MAMBA:
        case LLM_ARCH_MAMBA2:
            {
                llm = std::make_unique<llm_build_mamba>(*this, params);
            } break;
        case LLM_ARCH_JAMBA:
            {
                llm = std::make_unique<llm_build_jamba>(*this, params);
            } break;
        case LLM_ARCH_XVERSE:
            {
                llm = std::make_unique<llm_build_xverse>(*this, params);
            } break;
        case LLM_ARCH_COMMAND_R:
            {
                llm = std::make_unique<llm_build_command_r>(*this, params);
            } break;
        case LLM_ARCH_COHERE2:
            {
                llm = std::make_unique<llm_build_cohere2_iswa>(*this, params);
            } break;
        case LLM_ARCH_DBRX:
            {
                llm = std::make_unique<llm_build_dbrx>(*this, params);
            } break;
        case LLM_ARCH_OLMO:
            {
                llm = std::make_unique<llm_build_olmo>(*this, params);
            } break;
        case LLM_ARCH_OLMO2:
            {
                if (hparams.swa_type == LLAMA_SWA_TYPE_STANDARD) {
                    llm = std::make_unique<llm_build_olmo2<true>>(*this, params);
                } else {
                    llm = std::make_unique<llm_build_olmo2<false>>(*this, params);
                }
            } break;
        case LLM_ARCH_OLMOE:
            {
                llm = std::make_unique<llm_build_olmoe>(*this, params);
            } break;
        case LLM_ARCH_OPENELM:
            {
                llm = std::make_unique<llm_build_openelm>(*this, params);
            } break;
        case LLM_ARCH_GPTNEOX:
            {
                llm = std::make_unique<llm_build_gptneox>(*this, params);
            } break;
        case LLM_ARCH_ARCTIC:
            {
                llm = std::make_unique<llm_build_arctic>(*this, params);
            } break;
        case LLM_ARCH_DEEPSEEK:
            {
                llm = std::make_unique<llm_build_deepseek>(*this, params);
            } break;
        case LLM_ARCH_DEEPSEEK2:
            {
                llm = std::make_unique<llm_build_deepseek2>(*this, params);
            } break;
        case LLM_ARCH_CHATGLM:
            {
                llm = std::make_unique<llm_build_chatglm>(*this, params);
            } break;
        case LLM_ARCH_GLM4:
            {
                llm = std::make_unique<llm_build_glm4>(*this, params);
            } break;
        case LLM_ARCH_GLM4_MOE:
            {
                llm = std::make_unique<llm_build_glm4_moe>(*this, params);
            } break;
        case LLM_ARCH_BITNET:
            {
                llm = std::make_unique<llm_build_bitnet>(*this, params);
            } break;
        case LLM_ARCH_T5:
            {
                switch (params.gtype) {
                    case LLM_GRAPH_TYPE_ENCODER:
                        llm = std::make_unique<llm_build_t5_enc>(*this, params);
                        break;
                    case LLM_GRAPH_TYPE_DEFAULT:
                    case LLM_GRAPH_TYPE_DECODER:
                        llm = std::make_unique<llm_build_t5_dec>(*this, params);
                        break;
                    default:
                        GGML_ABORT("invalid graph type");
                };
            } break;
        case LLM_ARCH_T5ENCODER:
            {
                llm = std::make_unique<llm_build_t5_enc>(*this, params);
            } break;
        case LLM_ARCH_JAIS:
            {
                llm = std::make_unique<llm_build_jais>(*this, params);
            } break;
        case LLM_ARCH_NEMOTRON:
            {
                llm = std::make_unique<llm_build_nemotron>(*this, params);
            } break;
        case LLM_ARCH_NEMOTRON_H:
            {
                llm = std::make_unique<llm_build_nemotron_h>(*this, params);
            } break;
        case LLM_ARCH_EXAONE:
            {
                llm = std::make_unique<llm_build_exaone>(*this, params);
            } break;
        case LLM_ARCH_EXAONE4:
            {
                if (hparams.swa_type == LLAMA_SWA_TYPE_STANDARD) {
                    llm = std::make_unique<llm_build_exaone4<true>>(*this, params);
                } else {
                    llm = std::make_unique<llm_build_exaone4<false>>(*this, params);
                }
            } break;
        case LLM_ARCH_RWKV6:
            {
                llm = std::make_unique<llm_build_rwkv6>(*this, params);
            } break;
        case LLM_ARCH_RWKV6QWEN2:
            {
                llm = std::make_unique<llm_build_rwkv6qwen2>(*this, params);
            } break;
        case LLM_ARCH_RWKV7:
            {
                llm = std::make_unique<llm_build_rwkv7>(*this, params);
            } break;
        case LLM_ARCH_ARWKV7:
            {
                llm = std::make_unique<llm_build_arwkv7>(*this, params);
            } break;
        case LLM_ARCH_GRANITE:
        case LLM_ARCH_GRANITE_MOE:
        case LLM_ARCH_MINICPM:
            {
                llm = std::make_unique<llm_build_granite>(*this, params);
            } break;
        case LLM_ARCH_GRANITE_HYBRID:
            {
                llm = std::make_unique<llm_build_granite_hybrid>(*this, params);
            } break;
        case LLM_ARCH_CHAMELEON:
            {
                llm = std::make_unique<llm_build_chameleon>(*this, params);
            } break;
        case LLM_ARCH_WAVTOKENIZER_DEC:
            {
                llm = std::make_unique<llm_build_wavtokenizer_dec>(*this, params);
            } break;
        case LLM_ARCH_PLM:
            {
                llm = std::make_unique<llm_build_plm>(*this, params);
            } break;
        case LLM_ARCH_BAILINGMOE:
            {
                llm = std::make_unique<llm_build_bailingmoe>(*this, params);
            } break;
        case LLM_ARCH_SEED_OSS:
            {
                llm = std::make_unique<llm_build_seed_oss>(*this, params);
            } break;
        case LLM_ARCH_DOTS1:
            {
                llm = std::make_unique<llm_build_dots1>(*this, params);
            } break;
        case LLM_ARCH_ARCEE:
            {
                llm = std::make_unique<llm_build_arcee>(*this, params);
            } break;
        case LLM_ARCH_ERNIE4_5:
            {
                llm = std::make_unique<llm_build_ernie4_5>(*this, params);
            } break;
        case LLM_ARCH_ERNIE4_5_MOE:
            {
                llm = std::make_unique<llm_build_ernie4_5_moe>(*this, params);
            } break;
        case LLM_ARCH_HUNYUAN_MOE:
            {
                llm = std::make_unique<llm_build_hunyuan_moe>(*this, params);
            } break;
        case LLM_ARCH_HUNYUAN_DENSE:
            {
                llm = std::make_unique<llm_build_hunyuan_dense>(*this, params);
            } break;
        case LLM_ARCH_SMOLLM3:
            {
                llm = std::make_unique<llm_build_smollm3>(*this, params);
            } break;
        case LLM_ARCH_OPENAI_MOE:
            {
                llm = std::make_unique<llm_build_openai_moe_iswa>(*this, params);
            } break;
        case LLM_ARCH_FALCON_H1:
            {
                llm = std::make_unique<llm_build_falcon_h1>(*this, params);
            } break;
        case LLM_ARCH_LFM2:
            {
                llm = std::make_unique<llm_build_lfm2>(*this, params);
            } break;
        case LLM_ARCH_SMALLTHINKER:
            {
                if (hparams.swa_type == LLAMA_SWA_TYPE_STANDARD) {
                    llm = std::make_unique<llm_build_smallthinker<true>> (*this, params);
                } else {
                    llm = std::make_unique<llm_build_smallthinker<false>>(*this, params);
                }
            } break;
        case LLM_ARCH_QWEN3NEXT:
            {
                llm = std::make_unique<llm_build_qwen3next>(*this, params);
            } break;
        default:
            GGML_ABORT("fatal error");
    }

    // add on pooling layer
    llm->build_pooling(cls, cls_b, cls_out, cls_out_b);

    return llm->res->get_gf();
}
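// Sketch of how a new architecture would be wired in (hypothetical example; the
// names below do not exist in the codebase): after defining a graph builder
// `llm_build_mymodel` deriving from llm_graph_context, the switch above gains
//
//   case LLM_ARCH_MYMODEL:
//       {
//           llm = std::make_unique<llm_build_mymodel>(*this, params);
//       } break;
//
// together with matching LLM_ARCH_MYMODEL entries in the architecture tables
// and in llama_model_rope_type() below.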
//
// interface implementation
//
llama_model_params llama_model_default_params() {
    llama_model_params result = {
        /*.devices                     =*/ nullptr,
        /*.tensor_buft_overrides       =*/ nullptr,
        /*.n_gpu_layers                =*/ 999,
        /*.split_mode                  =*/ LLAMA_SPLIT_MODE_LAYER,
        /*.main_gpu                    =*/ 0,
        /*.tensor_split                =*/ nullptr,
        /*.progress_callback           =*/ nullptr,
        /*.progress_callback_user_data =*/ nullptr,
        /*.kv_overrides                =*/ nullptr,
        /*.vocab_only                  =*/ false,
        /*.use_mmap                    =*/ true,
        /*.use_mlock                   =*/ false,
        /*.check_tensors               =*/ false,
        /*.use_extra_bufts             =*/ true,
    };

    return result;
}
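// Minimal usage sketch of the public API (assumes the declarations in llama.h;
// the model path is a placeholder):
//
//   llama_model_params mparams = llama_model_default_params();
//   mparams.n_gpu_layers = 0;     // e.g. force CPU-only loading
//   mparams.use_mmap     = false;
//
//   llama_model * model = llama_model_load_from_file("model.gguf", mparams);
//   if (model != nullptr) {
//       // ... use the model ...
//       llama_model_free(model);
//   }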
const llama_vocab * llama_model_get_vocab(const llama_model * model) {
    return &model->vocab;
}

void llama_free_model(llama_model * model) {
    llama_model_free(model);
}

void llama_model_free(llama_model * model) {
    delete model;
}

int32_t llama_model_n_ctx_train(const llama_model * model) {
    return model->hparams.n_ctx_train;
}

int32_t llama_model_n_embd(const llama_model * model) {
    return model->hparams.n_embd;
}

int32_t llama_model_n_layer(const llama_model * model) {
    return model->hparams.n_layer;
}

int32_t llama_model_n_head(const llama_model * model) {
    return model->hparams.n_head();
}

int32_t llama_model_n_head_kv(const llama_model * model) {
    return model->hparams.n_head_kv();
}

int32_t llama_model_n_swa(const llama_model * model) {
    return model->hparams.n_swa;
}

uint32_t llama_model_n_cls_out(const struct llama_model * model) {
    return model->hparams.n_cls_out;
}

const char * llama_model_cls_label(const struct llama_model * model, uint32_t i) {
    if (i < model->classifier_labels.size()) {
        return model->classifier_labels[i].c_str();
    }

    return nullptr;
}

// deprecated
int32_t llama_n_ctx_train(const llama_model * model) {
    return llama_model_n_ctx_train(model);
}

// deprecated
int32_t llama_n_embd(const llama_model * model) {
    return llama_model_n_embd(model);
}

// deprecated
int32_t llama_n_layer(const llama_model * model) {
    return llama_model_n_layer(model);
}

// deprecated
int32_t llama_n_head(const llama_model * model) {
    return llama_model_n_head(model);
}
llama_rope_type llama_model_rope_type(const llama_model * model) {
    switch (model->arch) {
        // these models do not use RoPE
        case LLM_ARCH_GPT2:
        case LLM_ARCH_GPTJ:
        case LLM_ARCH_MPT:
        case LLM_ARCH_REFACT:
        case LLM_ARCH_BLOOM:
        case LLM_ARCH_MAMBA:
        case LLM_ARCH_MAMBA2:
        case LLM_ARCH_JAMBA:
        case LLM_ARCH_JINA_BERT_V2:
        case LLM_ARCH_T5:
        case LLM_ARCH_T5ENCODER:
        case LLM_ARCH_JAIS:
        case LLM_ARCH_RWKV6:
        case LLM_ARCH_RWKV6QWEN2:
        case LLM_ARCH_RWKV7:
        case LLM_ARCH_ARWKV7:
        case LLM_ARCH_WAVTOKENIZER_DEC:
        case LLM_ARCH_NEMOTRON_H:
            return LLAMA_ROPE_TYPE_NONE;

        // use what we call a normal RoPE, operating on pairs of consecutive head values
        case LLM_ARCH_LLAMA:
        case LLM_ARCH_LLADA:
        case LLM_ARCH_LLAMA4:
        case LLM_ARCH_DECI:
        case LLM_ARCH_BAICHUAN:
        case LLM_ARCH_STARCODER:
        case LLM_ARCH_INTERNLM2:
        case LLM_ARCH_MINICPM:
        case LLM_ARCH_XVERSE:
        case LLM_ARCH_COMMAND_R:
        case LLM_ARCH_COHERE2:
        case LLM_ARCH_OLMO:
        case LLM_ARCH_ARCTIC:
        case LLM_ARCH_DEEPSEEK:
        case LLM_ARCH_DEEPSEEK2:
        case LLM_ARCH_PLM:
        case LLM_ARCH_CHATGLM:
        case LLM_ARCH_GLM4:
        case LLM_ARCH_GRANITE:
        case LLM_ARCH_GRANITE_MOE:
        case LLM_ARCH_GRANITE_HYBRID:
        case LLM_ARCH_CHAMELEON:
        case LLM_ARCH_BAILINGMOE:
        case LLM_ARCH_NEO_BERT:
        case LLM_ARCH_SMOLLM3:
        case LLM_ARCH_ARCEE:
        case LLM_ARCH_ERNIE4_5:
        case LLM_ARCH_ERNIE4_5_MOE:
        case LLM_ARCH_QWEN3NEXT:
            return LLAMA_ROPE_TYPE_NORM;

        // the pairs of head values are offset by n_rot/2
        case LLM_ARCH_FALCON:
        case LLM_ARCH_FALCON_H1:
        case LLM_ARCH_GROK:
        case LLM_ARCH_DBRX:
        case LLM_ARCH_BERT:
        case LLM_ARCH_JINA_BERT_V3:
        case LLM_ARCH_NOMIC_BERT:
        case LLM_ARCH_NOMIC_BERT_MOE:
        case LLM_ARCH_STABLELM:
        case LLM_ARCH_BITNET:
        case LLM_ARCH_QWEN:
        case LLM_ARCH_QWEN2:
        case LLM_ARCH_DREAM:
        case LLM_ARCH_QWEN2MOE:
        case LLM_ARCH_QWEN3:
        case LLM_ARCH_QWEN3MOE:
        case LLM_ARCH_LLADA_MOE:
        case LLM_ARCH_OLMO2:
        case LLM_ARCH_OLMOE:
        case LLM_ARCH_PHI2:
        case LLM_ARCH_PHI3:
        case LLM_ARCH_PHIMOE:
        case LLM_ARCH_PLAMO:
        case LLM_ARCH_PLAMO2:
        case LLM_ARCH_GEMMA:
        case LLM_ARCH_GEMMA2:
        case LLM_ARCH_GEMMA3:
        case LLM_ARCH_GEMMA3N:
        case LLM_ARCH_GEMMA_EMBEDDING:
        case LLM_ARCH_STARCODER2:
        case LLM_ARCH_OPENELM:
        case LLM_ARCH_GPTNEOX:
        case LLM_ARCH_CODESHELL:
        case LLM_ARCH_ORION:
        case LLM_ARCH_NEMOTRON:
        case LLM_ARCH_EXAONE:
        case LLM_ARCH_EXAONE4:
        case LLM_ARCH_MINICPM3:
        case LLM_ARCH_DOTS1:
        case LLM_ARCH_HUNYUAN_MOE:
        case LLM_ARCH_OPENAI_MOE:
        case LLM_ARCH_HUNYUAN_DENSE:
        case LLM_ARCH_LFM2:
        case LLM_ARCH_SMALLTHINKER:
        case LLM_ARCH_GLM4_MOE:
        case LLM_ARCH_SEED_OSS:
            return LLAMA_ROPE_TYPE_NEOX;

        case LLM_ARCH_QWEN2VL:
            return LLAMA_ROPE_TYPE_MROPE;

        // all model arches should be listed explicitly here
        case LLM_ARCH_UNKNOWN:
            GGML_ABORT("unknown architecture");
    }

    return LLAMA_ROPE_TYPE_NONE;
}
float llama_model_rope_freq_scale_train(const llama_model * model) {
    return model->hparams.rope_freq_scale_train;
}
int32_t llama_model_meta_val_str(const llama_model * model, const char * key, char * buf, size_t buf_size) {
    const auto & it = model->gguf_kv.find(key);
    if (it == model->gguf_kv.end()) {
        if (buf_size > 0) {
            buf[0] = '\0';
        }
        return -1;
    }
    return snprintf(buf, buf_size, "%s", it->second.c_str());
}

int32_t llama_model_meta_count(const llama_model * model) {
    return (int)model->gguf_kv.size();
}

int32_t llama_model_meta_key_by_index(const llama_model * model, int i, char * buf, size_t buf_size) {
    if (i < 0 || i >= (int)model->gguf_kv.size()) {
        if (buf_size > 0) {
            buf[0] = '\0';
        }
        return -1;
    }
    auto it = model->gguf_kv.begin();
    std::advance(it, i);
    return snprintf(buf, buf_size, "%s", it->first.c_str());
}

int32_t llama_model_meta_val_str_by_index(const llama_model * model, int32_t i, char * buf, size_t buf_size) {
    if (i < 0 || i >= (int)model->gguf_kv.size()) {
        if (buf_size > 0) {
            buf[0] = '\0';
        }
        return -1;
    }
    auto it = model->gguf_kv.begin();
    std::advance(it, i);
    return snprintf(buf, buf_size, "%s", it->second.c_str());
}
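// Usage sketch (illustrative, assumes the declarations in llama.h): dump all
// GGUF metadata key/value pairs of a loaded model using the accessors above.
//
//   char key[256];
//   char val[256];
//   for (int32_t i = 0; i < llama_model_meta_count(model); ++i) {
//       if (llama_model_meta_key_by_index    (model, i, key, sizeof(key)) >= 0 &&
//           llama_model_meta_val_str_by_index(model, i, val, sizeof(val)) >= 0) {
//           printf("%s = %s\n", key, val);
//       }
//   }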
int32_t llama_model_desc(const llama_model * model, char * buf, size_t buf_size) {
    return snprintf(buf, buf_size, "%s", model->desc().c_str());
}

uint64_t llama_model_size(const llama_model * model) {
    return model->size();
}

const char * llama_model_chat_template(const llama_model * model, const char * name) {
    const auto key = name ? LLM_KV(model->arch, name)(LLM_KV_TOKENIZER_CHAT_TEMPLATE)
                          : LLM_KV(model->arch)(LLM_KV_TOKENIZER_CHAT_TEMPLATE);
    const auto & it = model->gguf_kv.find(key);
    if (it == model->gguf_kv.end()) {
        // one-off fix for very popular models (so we are not flooded with issues)
        // do not extend this list unless absolutely necessary
        // Mistral-Small-2503 does not have built-in chat template
        llama_vocab_pre_type pre_type = model->vocab.get_pre_type();
        if (!name && pre_type == LLAMA_VOCAB_PRE_TYPE_TEKKEN && model->layers.size() == 40) {
            return "mistral-v7-tekken";
        }

        return nullptr;
    }

    return it->second.c_str();
}
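// Usage sketch (illustrative, assumes the declarations in llama.h): fetch the
// default chat template and fall back to a caller-provided one when the GGUF
// metadata does not carry any.
//
//   const char * tmpl = llama_model_chat_template(model, /*name=*/nullptr);
//   if (tmpl == nullptr) {
//       tmpl = "chatml"; // hypothetical fallback chosen by the application
//   }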
uint64_t llama_model_n_params(const llama_model * model) {
    return model->n_elements();
}

bool llama_model_has_encoder(const llama_model * model) {
    switch (model->arch) {
        case LLM_ARCH_T5:        return true;
        case LLM_ARCH_T5ENCODER: return true;
        default:                 return false;
    }
}

bool llama_model_has_decoder(const llama_model * model) {
    switch (model->arch) {
        case LLM_ARCH_T5ENCODER: return false;
        default:                 return true;
    }
}

llama_token llama_model_decoder_start_token(const llama_model * model) {
    return model->hparams.dec_start_token_id;
}

bool llama_model_is_recurrent(const llama_model * model) {
    return llm_arch_is_recurrent(model->arch);
}

bool llama_model_is_diffusion(const llama_model * model) {
    return llm_arch_is_diffusion(model->arch);
}

const std::vector<std::pair<std::string, ggml_tensor *>> & llama_internal_get_tensor_map(const llama_model * model) {
    return model->tensors_by_name;
}