llama-model.cpp

#include "llama-model.h"

#include "llama-impl.h"
#include "llama-mmap.h"
#include "llama-batch.h"
#include "llama-cparams.h"
#include "llama-model-loader.h"

#include "llama-kv-cache.h"
#include "llama-kv-cache-iswa.h"
#include "llama-memory-hybrid.h"
#include "llama-memory-recurrent.h"

#include "ggml-cpp.h"

#include "models/models.h"

#include <algorithm>
#include <cassert>
#include <cfloat>
#include <cstring>
#include <cmath>
#include <functional>
#include <map>
#include <regex>
#include <sstream>
#include <stdexcept>

const char * llm_type_name(llm_type type) {
    switch (type) {
        case LLM_TYPE_14M: return "14M";
        case LLM_TYPE_17M: return "17M";
        case LLM_TYPE_22M: return "22M";
        case LLM_TYPE_33M: return "33M";
        case LLM_TYPE_60M: return "60M";
        case LLM_TYPE_70M: return "70M";
        case LLM_TYPE_80M: return "80M";
        case LLM_TYPE_109M: return "109M";
        case LLM_TYPE_137M: return "137M";
        case LLM_TYPE_140M: return "140M";
        case LLM_TYPE_160M: return "160M";
        case LLM_TYPE_190M: return "190M";
        case LLM_TYPE_220M: return "220M";
        case LLM_TYPE_250M: return "250M";
        case LLM_TYPE_256M: return "256M";
        case LLM_TYPE_270M: return "270M";
        case LLM_TYPE_335M: return "335M";
        case LLM_TYPE_350M: return "350M";
        case LLM_TYPE_360M: return "360M";
        case LLM_TYPE_410M: return "410M";
        case LLM_TYPE_450M: return "450M";
        case LLM_TYPE_475M: return "475M";
        case LLM_TYPE_558M: return "558M";
        case LLM_TYPE_700M: return "700M";
        case LLM_TYPE_770M: return "770M";
        case LLM_TYPE_780M: return "780M";
        case LLM_TYPE_950M: return "950M";
        case LLM_TYPE_0_3B: return "0.3B";
        case LLM_TYPE_0_5B: return "0.5B";
        case LLM_TYPE_0_6B: return "0.6B";
        case LLM_TYPE_1B: return "1B";
        case LLM_TYPE_1_2B: return "1.2B";
        case LLM_TYPE_1_3B: return "1.3B";
        case LLM_TYPE_1_4B: return "1.4B";
        case LLM_TYPE_1_5B: return "1.5B";
        case LLM_TYPE_1_6B: return "1.6B";
        case LLM_TYPE_1_7B: return "1.7B";
        case LLM_TYPE_1_8B: return "1.8B";
        case LLM_TYPE_2B: return "2B";
        case LLM_TYPE_2_6B: return "2.6B";
        case LLM_TYPE_2_8B: return "2.8B";
        case LLM_TYPE_2_9B: return "2.9B";
        case LLM_TYPE_3B: return "3B";
        case LLM_TYPE_4B: return "4B";
        case LLM_TYPE_6B: return "6B";
        case LLM_TYPE_6_9B: return "6.9B";
        case LLM_TYPE_7B: return "7B";
        case LLM_TYPE_8B: return "8B";
        case LLM_TYPE_9B: return "9B";
        case LLM_TYPE_11B: return "11B";
        case LLM_TYPE_12B: return "12B";
        case LLM_TYPE_13B: return "13B";
        case LLM_TYPE_14B: return "14B";
        case LLM_TYPE_15B: return "15B";
        case LLM_TYPE_16B: return "16B";
        case LLM_TYPE_20B: return "20B";
        case LLM_TYPE_26B: return "26B";
        case LLM_TYPE_27B: return "27B";
        case LLM_TYPE_30B: return "30B";
        case LLM_TYPE_32B: return "32B";
        case LLM_TYPE_34B: return "34B";
        case LLM_TYPE_35B: return "35B";
        case LLM_TYPE_36B: return "36B";
        case LLM_TYPE_40B: return "40B";
        case LLM_TYPE_65B: return "65B";
        case LLM_TYPE_70B: return "70B";
        case LLM_TYPE_120B: return "120B";
        case LLM_TYPE_142B: return "142B";
        case LLM_TYPE_236B: return "236B";
        case LLM_TYPE_290B: return "290B";
        case LLM_TYPE_314B: return "314B";
        case LLM_TYPE_405B: return "405B";
        case LLM_TYPE_671B: return "671B";
        case LLM_TYPE_SMALL: return "0.1B";
        case LLM_TYPE_MEDIUM: return "0.4B";
        case LLM_TYPE_LARGE: return "0.8B";
        case LLM_TYPE_XL: return "1.5B";
        case LLM_TYPE_A1_7B: return "A1.7B";
        case LLM_TYPE_A2_7B: return "A2.7B";
        case LLM_TYPE_8x7B: return "8x7B";
        case LLM_TYPE_8x22B: return "8x22B";
        case LLM_TYPE_16x12B: return "16x12B";
        case LLM_TYPE_16x3_8B: return "16x3.8B";
        case LLM_TYPE_10B_128x3_66B: return "10B+128x3.66B";
        case LLM_TYPE_57B_A14B: return "57B.A14B";
        case LLM_TYPE_17B_16E: return "17Bx16E (Scout)";
        case LLM_TYPE_17B_128E: return "17Bx128E (Maverick)";
        case LLM_TYPE_A13B: return "A13B";
        case LLM_TYPE_7B_A1B: return "7B.A1B";
        case LLM_TYPE_8B_A1B: return "8B.A1B";
        case LLM_TYPE_16B_A1B: return "16B.A1B";
        case LLM_TYPE_21B_A3B: return "21B.A3B";
        case LLM_TYPE_30B_A3B: return "30B.A3B";
        case LLM_TYPE_100B_A6B: return "100B.A6B";
        case LLM_TYPE_106B_A12B: return "106B.A12B";
        case LLM_TYPE_230B_A10B: return "230B.A10B";
        case LLM_TYPE_235B_A22B: return "235B.A22B";
        case LLM_TYPE_300B_A47B: return "300B.A47B";
        case LLM_TYPE_355B_A32B: return "355B.A32B";
        case LLM_TYPE_E2B: return "E2B";
        case LLM_TYPE_E4B: return "E4B";
        default: return "?B";
    }
}

static const char * llama_expert_gating_func_name(llama_expert_gating_func_type type) {
    switch (type) {
        case LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX: return "softmax";
        case LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID: return "sigmoid";
        default: return "unknown";
    }
}

static const std::map<llama_rope_scaling_type, const char *> LLAMA_ROPE_SCALING_TYPES = {
    { LLAMA_ROPE_SCALING_TYPE_NONE,     "none"     },
    { LLAMA_ROPE_SCALING_TYPE_LINEAR,   "linear"   },
    { LLAMA_ROPE_SCALING_TYPE_YARN,     "yarn"     },
    { LLAMA_ROPE_SCALING_TYPE_LONGROPE, "longrope" },
};

std::string llama_rope_scaling_type_name(llama_rope_scaling_type rope_scaling_type) {
    return LLAMA_ROPE_SCALING_TYPES.at(rope_scaling_type);
}

static llama_rope_scaling_type llama_rope_scaling_type_from_string(const std::string & name) {
    for (const auto & kv : LLAMA_ROPE_SCALING_TYPES) {
        if (kv.second == name) {
            return (llama_rope_scaling_type) kv.first;
        }
    }

    return LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
}

// checks if the weight tensor can be used with the specified buffer type and device
static bool weight_buft_supported(const llama_hparams & hparams, ggml_tensor * w, ggml_op op, ggml_backend_buffer_type_t buft, ggml_backend_dev_t dev) {
    GGML_ASSERT(w != nullptr);

    if (op == GGML_OP_NONE) {
        return true;
    }

    ggml_init_params params = {
        /*.mem_size   =*/ ggml_tensor_overhead()*8,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ true,
    };

    ggml_context_ptr ctx_ptr { ggml_init(params) };
    if (!ctx_ptr) {
        throw std::runtime_error(format("failed to create ggml context"));
    }
    ggml_context * ctx = ctx_ptr.get();

    ggml_tensor * op_tensor = nullptr;

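    // build a throwaway op of the requested type that uses w as its weight operand;
    // the other operands get small placeholder shapes (e.g. 512 rows/tokens) - they only
    // need to be plausible enough for the backend's supports_op check further below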
    switch (op) {
        case GGML_OP_GET_ROWS:
            {
                ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 512);
                op_tensor = ggml_get_rows(ctx, w, b);
            } break;
        case GGML_OP_MUL_MAT:
            {
                ggml_tensor * b = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], 512, w->ne[2], w->ne[3]);
                op_tensor = ggml_mul_mat(ctx, w, b);
            } break;
        case GGML_OP_MUL_MAT_ID:
            {
                int n_expert_used = hparams.n_expert_used;
                ggml_tensor * b = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, w->ne[0], n_expert_used, 512);
                ggml_tensor * ids = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, n_expert_used, 512);
                op_tensor = ggml_mul_mat_id(ctx, w, b, ids);
            } break;
        case GGML_OP_ADD:
            {
                ggml_tensor * a = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], w->ne[1], w->ne[2], w->ne[3]);
                op_tensor = ggml_add(ctx, a, w);
            } break;
        case GGML_OP_ADD_ID:
            {
                int n_expert_used = hparams.n_expert_used;
                ggml_tensor * a = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, w->ne[0], n_expert_used, 512);
                ggml_tensor * c = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, n_expert_used, 512);
                op_tensor = ggml_add_id(ctx, a, w, c);
            } break;
        case GGML_OP_MUL:
            {
                ggml_tensor * a = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], w->ne[1], w->ne[2], w->ne[3]);
                op_tensor = ggml_mul(ctx, a, w);
            } break;
        case GGML_OP_DIV:
            {
                ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, w->ne[0]);
                op_tensor = ggml_div(ctx, a, w);
            } break;
        case GGML_OP_ROPE:
            {
                int n_embd_head = hparams.n_embd_head_v;
                int n_head = hparams.n_head();
                ggml_tensor * a = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_embd_head, n_head, 512);
                ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 512);
                op_tensor = ggml_rope_ext(
                    ctx, a, b, w,
                    0, 0, 0, 0, 0,
                    0, 0, 0, 0
                );
            } break;
        case GGML_OP_SSM_CONV:
            {
                const int64_t n_seq_tokens = 512;
                const int64_t n_seqs = 3;
                ggml_tensor * conv_x = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, w->ne[0] - 1 + n_seq_tokens, w->ne[1], n_seqs);
                op_tensor = ggml_ssm_conv(ctx, conv_x, w);
            } break;
        case GGML_OP_SSM_SCAN:
            {
                // w is ssm_a, which is used to distinguish Mamba-1 and Mamba-2
                const int64_t d_state = w->ne[0] == 1 ? hparams.ssm_d_state : w->ne[0];
                const int64_t n_head = w->ne[1];
                const int64_t head_dim = hparams.ssm_d_inner / n_head;
                const int64_t n_group = hparams.ssm_n_group ? hparams.ssm_n_group : 1;
                const int64_t n_seq_tokens = 512;
                const int64_t n_seqs = 3;
                ggml_tensor * s = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, d_state, head_dim, n_head, n_seqs);
                ggml_tensor * x = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, head_dim, n_head, n_seq_tokens, n_seqs);
                ggml_tensor * dt = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_head, n_seq_tokens, n_seqs);
                ggml_tensor * B = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, d_state, n_group, n_seq_tokens, n_seqs);
                ggml_tensor * C = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, d_state, n_group, n_seq_tokens, n_seqs);
                ggml_tensor * ids = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_seqs);
                op_tensor = ggml_ssm_scan(ctx, s, x, dt, w, B, C, ids);
            } break;
        case GGML_OP_RWKV_WKV6:
            {
                // FIXME
                const int64_t S = 123;
                const int64_t H = 123;
                const int64_t n_tokens = 123;
                const int64_t n_seqs = 123;
                ggml_tensor * k = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens);
                ggml_tensor * v = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens);
                ggml_tensor * r = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens);
                ggml_tensor * tf = w;
                ggml_tensor * td = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens);
                ggml_tensor * state = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, S, n_seqs, S, H);
                op_tensor = ggml_rwkv_wkv6(ctx, k, v, r, tf, td, state);
            } break;
        case GGML_OP_IM2COL:
            {
                const int n_embd_inp = hparams.n_embd_inp();
                ggml_tensor * b = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, n_embd_inp, w->ne[1], 1, 1);
                op_tensor = ggml_im2col(ctx, w, b, 1, 0, 0, 0, 1, 0, false, GGML_TYPE_F16);
            } break;
        case GGML_OP_SCALE:
            {
                op_tensor = ggml_scale(ctx, w, 1.0f);
            } break;
        default:
            GGML_ABORT("%s: missing test for op %s for tensor %s", __func__, ggml_op_name(op), w->name);
    }

    // create a temporary dummy buffer for the weight so that supports_op can check the buffer type
    GGML_ASSERT(w->buffer == nullptr);
    w->buffer = ggml_backend_buft_alloc_buffer(buft, 0);
    bool op_supported = ggml_backend_dev_supports_op(dev, op_tensor);
    ggml_backend_buffer_free(w->buffer);
    w->buffer = nullptr;

    return op_supported;
}

// lists of buffer types used for each layer
using buft_list_t = std::vector<std::pair<ggml_backend_dev_t, ggml_backend_buffer_type_t>>;

// find the first buffer type in the list that can use the tensor
static ggml_backend_buffer_type_t select_weight_buft(const llama_hparams & hparams, ggml_tensor * tensor, ggml_op op, const buft_list_t & buft_list) {
    GGML_ASSERT(!buft_list.empty());
    for (const auto & cur : buft_list) {
        ggml_backend_dev_t cur_dev = cur.first;
        ggml_backend_buffer_type_t cur_buft = cur.second;
        if (weight_buft_supported(hparams, tensor, op, cur_buft, cur_dev)) {
            return cur_buft;
        }
    }

    return nullptr;
}

// CPU: ACCEL -> GPU host -> CPU extra -> CPU
static buft_list_t make_cpu_buft_list(const std::vector<ggml_backend_dev_t> & devices, bool use_extra_bufts, bool no_host) {
    buft_list_t buft_list;

    // add ACCEL buffer types
    for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
        ggml_backend_dev_t dev = ggml_backend_dev_get(i);
        if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_ACCEL) {
            auto * buft = ggml_backend_dev_buffer_type(dev);
            // skip the plain CPU buffer type - it is added last as the final fallback
            if (buft != ggml_backend_cpu_buffer_type()) {
                buft_list.emplace_back(dev, buft);
            }
        }
    }

    // add a host buffer type
    // storing the tensors in a host buffer is useful when the processing of large batches
    // is offloaded to a GPU device, since it reduces the time spent on data transfers
    // generally, this will be done using the first device in the list
    // a better approach would be to handle this on a weight-by-weight basis using the offload_op
    // function of the device to determine if it would benefit from being stored in a host buffer
    if (!no_host) {
        for (auto * dev : devices) {
            ggml_backend_buffer_type_t buft = ggml_backend_dev_host_buffer_type(dev);
            if (buft) {
                buft_list.emplace_back(dev, buft);
                break;
            }
        }
    }

    // add extra buffer types
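    // (these are additional buffer types reported by the CPU backend itself, typically for repacked/optimized weight layouts)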
    if (use_extra_bufts) {
        auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
        if (cpu_dev == nullptr) {
            throw std::runtime_error(format("%s: no CPU backend found", __func__));
        }

        auto * cpu_reg = ggml_backend_dev_backend_reg(cpu_dev);
        auto ggml_backend_dev_get_extra_bufts_fn = (ggml_backend_dev_get_extra_bufts_t)
            ggml_backend_reg_get_proc_address(cpu_reg, "ggml_backend_dev_get_extra_bufts");
        if (ggml_backend_dev_get_extra_bufts_fn) {
            ggml_backend_buffer_type_t * extra_bufts = ggml_backend_dev_get_extra_bufts_fn(cpu_dev);
            while (extra_bufts && *extra_bufts) {
                buft_list.emplace_back(cpu_dev, *extra_bufts);
                ++extra_bufts;
            }
        }
    }

    // add the CPU buffer type
    for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
        ggml_backend_dev_t dev = ggml_backend_dev_get(i);
        if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_CPU) {
            buft_list.emplace_back(dev, ggml_backend_dev_buffer_type(dev));
        }
    }

    return buft_list;
}

// GPU: split if LLAMA_SPLIT_MODE_ROW -> GPU
static buft_list_t make_gpu_buft_list(ggml_backend_dev_t dev, llama_split_mode split_mode, const float * tensor_split) {
    buft_list_t buft_list;

    // add the device split buffer type if requested and available
    if (split_mode == LLAMA_SPLIT_MODE_ROW) {
        ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev);
        auto ggml_backend_split_buffer_type_fn = (ggml_backend_split_buffer_type_t)
            ggml_backend_reg_get_proc_address(reg, "ggml_backend_split_buffer_type");
        if (ggml_backend_split_buffer_type_fn) {
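            // the split buffer type is constructed from the index of the device within its backend registry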
            size_t dev_index = [&]() {
                auto * reg = ggml_backend_dev_backend_reg(dev);
                for (size_t i = 0; i < ggml_backend_reg_dev_count(reg); ++i) {
                    if (ggml_backend_reg_dev_get(reg, i) == dev) {
                        return i;
                    }
                }
                throw std::runtime_error(format("device %s not found in its backend reg", ggml_backend_dev_name(dev)));
            }();
            auto * buft = ggml_backend_split_buffer_type_fn(dev_index, tensor_split);
            if (buft != nullptr) {
                buft_list.emplace_back(dev, buft);
            }
        }
    }

    // add the device default buffer type
    buft_list.emplace_back(dev, ggml_backend_dev_buffer_type(dev));

    // add the device extra buffer type (if any)
    ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev);
    auto ggml_backend_dev_get_extra_bufts_fn = (ggml_backend_dev_get_extra_bufts_t)
        ggml_backend_reg_get_proc_address(reg, "ggml_backend_dev_get_extra_bufts");
    if (ggml_backend_dev_get_extra_bufts_fn) {
        ggml_backend_buffer_type_t * extra_bufts = ggml_backend_dev_get_extra_bufts_fn(dev);
        while (extra_bufts && *extra_bufts) {
            buft_list.emplace_back(dev, *extra_bufts);
            ++extra_bufts;
        }
    }

    return buft_list;
}

struct llama_model::impl {
    impl() {}
    ~impl() {}

    uint64_t n_elements = 0;
    size_t n_bytes = 0;

    std::string desc_str;

    // model memory mapped files
    llama_mmaps mappings;

    // objects representing data potentially being locked in memory
    llama_mlocks mlock_bufs;
    llama_mlocks mlock_mmaps;

    // contexts where the model tensors metadata is stored as well as the corresponding buffers:
    std::vector<std::pair<ggml_context_ptr, std::vector<ggml_backend_buffer_ptr>>> ctxs_bufs;

    buft_list_t cpu_buft_list;
    std::map<ggml_backend_dev_t, buft_list_t> gpu_buft_list;
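
    // device and buffer-type list assigned to the input layer, the output layer and each repeating layer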
    struct layer_dev {
        ggml_backend_dev_t dev;
        buft_list_t * buft_list;
    };

    layer_dev dev_input = {};
    layer_dev dev_output = {};
    std::vector<layer_dev> dev_layer;

    bool has_tensor_overrides;
};

llama_model::llama_model(const llama_model_params & params) : params(params), pimpl(std::make_unique<impl>()) {
    pimpl->has_tensor_overrides = params.tensor_buft_overrides && params.tensor_buft_overrides[0].pattern;
}

llama_model::~llama_model() {}

void llama_model::load_stats(llama_model_loader & ml) {
    pimpl->n_elements = ml.n_elements;
    pimpl->n_bytes = ml.n_bytes;
}

void llama_model::load_arch(llama_model_loader & ml) {
    arch = ml.get_arch();
    if (arch == LLM_ARCH_UNKNOWN) {
        throw std::runtime_error("unknown model architecture: '" + ml.get_arch_name() + "'");
    }
}

void llama_model::load_hparams(llama_model_loader & ml) {
    const gguf_context * ctx = ml.meta.get();

    // get metadata as string
    for (int i = 0; i < gguf_get_n_kv(ctx); i++) {
        gguf_type type = gguf_get_kv_type(ctx, i);
        if (type == GGUF_TYPE_ARRAY) {
            continue;
        }
        const char * name = gguf_get_key(ctx, i);
        const std::string value = gguf_kv_to_str(ctx, i);
        gguf_kv.emplace(name, value);
    }

    // get general kv
    ml.get_key(LLM_KV_GENERAL_NAME, name, false);

    // everything past this point is not vocab-related
    // for CLIP models, we only need to load tensors, no hparams
    if (hparams.vocab_only || ml.get_arch() == LLM_ARCH_CLIP) {
        return;
    }

    ml.get_key(LLM_KV_CONTEXT_LENGTH,    hparams.n_ctx_train);
    ml.get_key(LLM_KV_EMBEDDING_LENGTH,  hparams.n_embd);
    ml.get_key(LLM_KV_BLOCK_COUNT,       hparams.n_layer);
    ml.get_key(LLM_KV_EXPERT_COUNT,      hparams.n_expert,      false);
    ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used, false);
    ml.get_key(LLM_KV_EXPERT_GROUP_COUNT,      hparams.n_expert_groups, false);
    ml.get_key(LLM_KV_EXPERT_GROUP_USED_COUNT, hparams.n_group_used,    false);

    if (arch == LLM_ARCH_WAVTOKENIZER_DEC) {
        ml.get_key(LLM_KV_FEATURES_LENGTH, hparams.n_embd_features);

        ml.get_key(LLM_KV_POSNET_EMBEDDING_LENGTH, hparams.posnet.n_embd);
        ml.get_key(LLM_KV_POSNET_BLOCK_COUNT,      hparams.posnet.n_layer);

        ml.get_key(LLM_KV_CONVNEXT_EMBEDDING_LENGTH, hparams.convnext.n_embd);
        ml.get_key(LLM_KV_CONVNEXT_BLOCK_COUNT,      hparams.convnext.n_layer);
    }

    GGML_ASSERT(hparams.n_expert <= LLAMA_MAX_EXPERTS);
    GGML_ASSERT(hparams.n_expert_used <= hparams.n_expert);
    if (hparams.n_expert > 0) {
        GGML_ASSERT(hparams.n_expert_used > 0);
        GGML_ASSERT(hparams.n_expert_groups < hparams.n_expert);
        if (hparams.n_expert_groups > 1) {
            GGML_ASSERT(hparams.n_expert % hparams.n_expert_groups == 0);
            GGML_ASSERT(hparams.n_group_used > 0);
            GGML_ASSERT(hparams.n_group_used < hparams.n_expert_groups);
        }
    } else {
        GGML_ASSERT(hparams.n_expert_used == 0);
        GGML_ASSERT(hparams.n_expert_groups == 0);
    }

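    // reset the per-layer hparam arrays before reading the (optional) per-layer keys below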
    std::fill(hparams.n_head_arr.begin(),    hparams.n_head_arr.end(),    0);
    std::fill(hparams.n_head_kv_arr.begin(), hparams.n_head_kv_arr.end(), 0);
    std::fill(hparams.n_ff_arr.begin(),      hparams.n_ff_arr.end(),      0);
    std::fill(
        hparams.recurrent_layer_arr.begin(),
        hparams.recurrent_layer_arr.end(),
        llm_arch_is_recurrent(ml.get_arch()));

    std::fill(hparams.rope_sections.begin(), hparams.rope_sections.end(), 0);
    std::fill(hparams.swa_layers.begin(),    hparams.swa_layers.end(),    0);

    std::fill(hparams.xielu_alpha_n.begin(), hparams.xielu_alpha_n.end(), 0.0f);
    std::fill(hparams.xielu_alpha_p.begin(), hparams.xielu_alpha_p.end(), 0.0f);
    std::fill(hparams.xielu_beta.begin(),    hparams.xielu_beta.end(),    0.0f);
    std::fill(hparams.xielu_eps.begin(),     hparams.xielu_eps.end(),     0.0f);

    ml.get_key_or_arr(LLM_KV_FEED_FORWARD_LENGTH,  hparams.n_ff_arr,   hparams.n_layer, false);
    ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head_arr, hparams.n_layer, false);

    // n_head_kv is optional, default to n_head
    hparams.n_head_kv_arr = hparams.n_head_arr;
    ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT_KV, hparams.n_head_kv_arr, hparams.n_layer, false);

    bool rope_finetuned = false;
    ml.get_key(LLM_KV_ROPE_SCALING_FINETUNED, rope_finetuned, false);
    hparams.rope_finetuned = rope_finetuned;

    hparams.n_ctx_orig_yarn = hparams.n_ctx_train;
    ml.get_key(LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, hparams.n_ctx_orig_yarn, false);

    // rope_freq_base (optional)
    hparams.rope_freq_base_train = 10000.0f;
    ml.get_key(LLM_KV_ROPE_FREQ_BASE, hparams.rope_freq_base_train, false);

    std::string rope_scaling("linear");
    ml.get_key(LLM_KV_ROPE_SCALING_TYPE, rope_scaling, false);
    hparams.rope_scaling_type_train = llama_rope_scaling_type_from_string(rope_scaling);
    GGML_ASSERT(hparams.rope_scaling_type_train != LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED);

    // rope_freq_scale (inverse of the kv) is optional
    float ropescale = 0.0f;
    if (!ml.get_key(LLM_KV_ROPE_SCALING_FACTOR, ropescale, false)) {
        // try the old key name
        ml.get_key(LLM_KV_ROPE_SCALE_LINEAR, ropescale, false);
    }
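    // a factor of 0.0f (e.g. when the key is missing) means no scaling; otherwise the stored factor is inverted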
    hparams.rope_freq_scale_train = ropescale == 0.0f ? 1.0f : 1.0f/ropescale;

    // by default assume that the sliding-window layers use the same scaling type as the non-sliding-window layers
    hparams.rope_freq_base_train_swa  = hparams.rope_freq_base_train;
    hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;

    ml.get_key(LLM_KV_ROPE_SCALING_ATTN_FACTOR, hparams.rope_attn_factor, false);

    // non-transformer models do not have attention heads
    if (hparams.n_head() > 0) {
        // gpt-neox n_rot = rotary_pct * (n_embd / n_head)
        // gpt-j n_rot = rotary_dim
        hparams.n_embd_head_k = hparams.n_embd / hparams.n_head();
        ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH, hparams.n_embd_head_k, false);

        hparams.n_embd_head_v = hparams.n_embd / hparams.n_head();
        ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH, hparams.n_embd_head_v, false);

        // sanity check for n_rot (optional)
        hparams.n_rot = hparams.n_embd_head_k;
        ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot, false);

        if (arch == LLM_ARCH_LLAMA || arch == LLM_ARCH_DECI || arch == LLM_ARCH_FALCON) {
            if (hparams.n_rot != hparams.n_embd_head_k) {
                throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd_head_k));
            }
        }
    } else {
        hparams.n_rot = 0;
        hparams.n_embd_head_k = 0;
        hparams.n_embd_head_v = 0;
    }

    // for differentiating model types
    uint32_t n_vocab = 0;
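    // prefer the explicit vocab size key; fall back to the length of the tokenizer token list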
    ml.get_key(LLM_KV_VOCAB_SIZE, n_vocab, false) || ml.get_arr_n(LLM_KV_TOKENIZER_LIST, n_vocab, false);

    // for classifier models
    ml.get_arr(LLM_KV_CLASSIFIER_OUTPUT_LABELS, classifier_labels, false);
    if (!classifier_labels.empty()) {
        hparams.n_cls_out = classifier_labels.size();
    }

    // arch-specific KVs
    switch (arch) {
        case LLM_ARCH_LLAMA:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);

                if (hparams.n_expert == 8) {
                    switch (hparams.n_layer) {
                        case 32: type = LLM_TYPE_8x7B; break;
                        case 56: type = LLM_TYPE_8x22B; break;
                        default: type = LLM_TYPE_UNKNOWN;
                    }
                } else {
                    switch (hparams.n_layer) {
                        case 16: type = LLM_TYPE_1B; break; // Llama 3.2 1B
                        case 22: type = LLM_TYPE_1B; break;
                        case 26: type = LLM_TYPE_3B; break;
                        case 28: type = LLM_TYPE_3B; break; // Llama 3.2 3B
                        case 30: type = LLM_TYPE_256M; break; // smoldocling 256M
                        // granite uses a vocab with len 49152
                        case 32: type = n_vocab == 49152 ? LLM_TYPE_3B : (n_vocab < 40000 ? LLM_TYPE_7B : LLM_TYPE_8B); break;
                        case 36: type = LLM_TYPE_8B; break; // granite
                        case 40: type = LLM_TYPE_13B; break;
                        case 48: type = LLM_TYPE_34B; break;
                        case 60: type = LLM_TYPE_30B; break;
                        case 80: type = hparams.n_head() == hparams.n_head_kv() ? LLM_TYPE_65B : LLM_TYPE_70B; break;
                        default: type = LLM_TYPE_UNKNOWN;
                    }
                }
            } break;
        case LLM_ARCH_LLAMA4:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,  hparams.n_ff_exp);
                ml.get_key(LLM_KV_INTERLEAVE_MOE_LAYER_STEP,   hparams.n_moe_layer_step);

                const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
                if (found_swa && hparams.n_swa == 0) {
                    hparams.swa_type = LLAMA_SWA_TYPE_NONE;
                    hparams.n_no_rope_layer_step = hparams.n_layer; // always use rope
                } else {
                    hparams.swa_type = LLAMA_SWA_TYPE_CHUNKED;
                    hparams.n_swa = 8192;
                    hparams.set_swa_pattern(4); // pattern: 3 chunked - 1 full
                }

                switch (hparams.n_expert) {
                    case 0: {
                        // MobileLLM (no MoE)
                        switch (hparams.n_embd) {
                            case 2048: type = LLM_TYPE_140M; break;
                            case 4096: type = LLM_TYPE_360M; break;
                            case 6144: type = LLM_TYPE_950M; break;
                            default: type = LLM_TYPE_UNKNOWN;
                        }
                    } break;
                    case 16:  type = LLM_TYPE_17B_16E; break;
                    case 128: type = LLM_TYPE_17B_128E; break;
                    default:  type = LLM_TYPE_UNKNOWN;
                }
  602. hparams.use_kq_norm = type != LLM_TYPE_17B_128E;
  603. } break;
  604. case LLM_ARCH_ARCEE:
  605. {
  606. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  607. // Arcee uses the same structure as Llama
  608. switch (hparams.n_layer) {
  609. case 36: type = LLM_TYPE_4B; break;
  610. default: type = LLM_TYPE_UNKNOWN;
  611. }
  612. } break;
  613. case LLM_ARCH_AFMOE:
  614. {
  615. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  616. ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
  617. ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
  618. ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
  619. ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false);
  620. ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale, false);
  621. ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
  622. ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
  623. // Set up interleaved sliding window attention (ISWA)
  624. // Pattern: 3 sliding - 1 full (global_attn_every_n_layers = 4)
  625. if (hparams.n_swa > 0) {
  626. hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
  627. hparams.set_swa_pattern(4);
  628. } else {
  629. hparams.swa_type = LLAMA_SWA_TYPE_NONE;
  630. }
  631. // Default to sigmoid if not set
  632. if (hparams.expert_gating_func == LLAMA_EXPERT_GATING_FUNC_TYPE_NONE) {
  633. hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID;
  634. }
  635. switch (hparams.n_layer) {
  636. case 56: type = LLM_TYPE_6B; break;
  637. case 32: type = LLM_TYPE_26B; break;
  638. default: type = LLM_TYPE_UNKNOWN;
  639. }
  640. } break;
  641. case LLM_ARCH_DECI:
  642. {
  643. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  644. switch (hparams.n_layer) {
  645. case 32: type = LLM_TYPE_7B; break;
  646. case 80: type = LLM_TYPE_70B; break;
  647. case 162: type = LLM_TYPE_405B; break;
  648. default: type = LLM_TYPE_UNKNOWN;
  649. }
  650. } break;
  651. case LLM_ARCH_MINICPM:
  652. {
  653. // Backward-compatible defaults for older MiniCPM GGUFs
  654. hparams.f_embedding_scale = 12.0f;
  655. hparams.f_residual_scale = 1.4f / sqrtf(float(hparams.n_layer));
  656. hparams.f_logit_scale = hparams.n_embd ? (256.0f / float(hparams.n_embd)) : 1.0f;
  657. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
// Optional KV reads; newer GGUF exports override the defaults above when these keys are present
  659. ml.get_key(LLM_KV_EMBEDDING_SCALE, hparams.f_embedding_scale, /*required=*/false);
  660. ml.get_key(LLM_KV_RESIDUAL_SCALE, hparams.f_residual_scale, /*required=*/false);
  661. ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale, /*required=*/false);
// MiniCPM always applies RoPE; unlike Granite, rope_finetuned is not used here as an on/off switch
  663. hparams.rope_finetuned = true;
  664. switch (hparams.n_layer) {
  665. case 52: type = LLM_TYPE_1B; break;
  666. case 40: type = LLM_TYPE_2B; break;
  667. default: type = LLM_TYPE_UNKNOWN;
  668. }
  669. } break;
  670. case LLM_ARCH_MINICPM3:
  671. {
  672. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  673. ml.get_key(LLM_KV_ATTENTION_Q_LORA_RANK, hparams.n_lora_q);
  674. ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK, hparams.n_lora_kv);
  675. switch (hparams.n_layer) {
  676. case 62: type = LLM_TYPE_4B; break;
  677. default: type = LLM_TYPE_UNKNOWN;
  678. }
  679. } break;
  680. case LLM_ARCH_GROK:
  681. {
  682. // defaults for old GGUFs
  683. hparams.yarn_beta_fast = 8.0f;
  684. hparams.f_logit_scale = 0.5773502691896257f;
  685. hparams.f_embedding_scale = 78.38367176906169f;
  686. hparams.f_attn_out_scale = 0.08838834764831845f;
  687. hparams.f_attn_logit_softcapping = 30.0f;
  688. hparams.f_router_logit_softcapping = 30.0f;
  689. // no final_logit_softcapping in grok-1
  690. hparams.f_final_logit_softcapping = 0.0f;
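// the defaults above are 1/sqrt(3), sqrt(6144) and 1/sqrt(128), matching Grok-1's 6144-wide embeddings and 128-dim attention heads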
  691. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  692. ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
  693. ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale, false);
  694. ml.get_key(LLM_KV_EMBEDDING_SCALE, hparams.f_embedding_scale, false);
  695. ml.get_key(LLM_KV_ATTENTION_OUTPUT_SCALE, hparams.f_attn_out_scale, false);
  696. ml.get_key(LLM_KV_ATTN_LOGIT_SOFTCAPPING, hparams.f_attn_logit_softcapping, false);
  697. ml.get_key(LLM_KV_ROUTER_LOGIT_SOFTCAPPING, hparams.f_router_logit_softcapping, false);
  698. ml.get_key(LLM_KV_FINAL_LOGIT_SOFTCAPPING, hparams.f_final_logit_softcapping, false);
  699. ml.get_key(LLM_KV_ATTENTION_TEMPERATURE_LENGTH, hparams.attn_temp_length, false);
  700. ml.get_key(LLM_KV_ROPE_SCALING_YARN_EXT_FACTOR, hparams.yarn_ext_factor, false);
  701. ml.get_key(LLM_KV_ROPE_SCALING_YARN_ATTN_FACTOR, hparams.yarn_attn_factor, false);
  702. ml.get_key(LLM_KV_ROPE_SCALING_YARN_BETA_FAST, hparams.yarn_beta_fast, false);
  703. ml.get_key(LLM_KV_ROPE_SCALING_YARN_BETA_SLOW, hparams.yarn_beta_slow, false);
  704. switch (hparams.n_layer) {
  705. case 64: type = LLM_TYPE_314B; break;
  706. default: type = LLM_TYPE_UNKNOWN;
  707. }
  708. } break;
  709. case LLM_ARCH_FALCON:
  710. {
  711. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  712. switch (hparams.n_layer) {
  713. case 32: type = LLM_TYPE_7B; break;
  714. case 60: type = LLM_TYPE_40B; break;
  715. default: type = LLM_TYPE_UNKNOWN;
  716. }
  717. } break;
  718. case LLM_ARCH_BAICHUAN:
  719. {
  720. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  721. switch (hparams.n_layer) {
  722. case 32: type = LLM_TYPE_7B; break;
  723. case 40: type = LLM_TYPE_13B; break;
  724. default: type = LLM_TYPE_UNKNOWN;
  725. }
  726. if (type == LLM_TYPE_13B) {
  727. // TODO: become GGUF KV parameter
  728. hparams.f_max_alibi_bias = 8.0f;
  729. }
  730. } break;
  731. case LLM_ARCH_STARCODER:
  732. {
  733. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  734. switch (hparams.n_layer) {
  735. case 24: type = LLM_TYPE_1B; break;
  736. case 36: type = LLM_TYPE_3B; break;
  737. case 42: type = LLM_TYPE_7B; break;
  738. case 40: type = LLM_TYPE_15B; break;
  739. default: type = LLM_TYPE_UNKNOWN;
  740. }
  741. } break;
  742. case LLM_ARCH_REFACT:
  743. {
  744. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  745. switch (hparams.n_layer) {
  746. case 32: type = LLM_TYPE_1B; break;
  747. default: type = LLM_TYPE_UNKNOWN;
  748. }
  749. // TODO: become GGUF KV parameter
  750. hparams.f_max_alibi_bias = 8.0f;
  751. } break;
  752. case LLM_ARCH_BERT:
  753. {
  754. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  755. ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
  756. ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
  757. switch (hparams.n_layer) {
  758. case 3:
  759. type = LLM_TYPE_17M; break; // bge-micro
  760. case 6:
  761. type = LLM_TYPE_22M; break; // MiniLM-L6
  762. case 12:
  763. switch (hparams.n_embd) {
  764. case 384: type = LLM_TYPE_33M; break; // MiniLM-L12, bge-small
  765. case 768: type = LLM_TYPE_109M; break; // bge-base
  766. default: type = LLM_TYPE_UNKNOWN;
  767. } break;
  768. case 24:
  769. type = LLM_TYPE_335M; break; // bge-large
  770. default: type = LLM_TYPE_UNKNOWN;
  771. }
  772. } break;
  773. case LLM_ARCH_JINA_BERT_V2:
  774. {
  775. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  776. ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
  777. ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
  778. hparams.f_max_alibi_bias = 8.0f;
  779. switch (hparams.n_layer) {
  780. case 4: type = LLM_TYPE_33M; break; // jina-embeddings-small
  781. case 12: type = LLM_TYPE_137M; break; // jina-embeddings-base
  782. default: type = LLM_TYPE_UNKNOWN;
  783. }
  784. } break;
  785. case LLM_ARCH_JINA_BERT_V3:
  786. {
  787. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  788. ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
  789. ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
  790. switch (hparams.n_layer) {
  791. case 24:
  792. type = LLM_TYPE_558M; break;
  793. default: type = LLM_TYPE_UNKNOWN;
  794. }
  795. } break;
  796. case LLM_ARCH_NOMIC_BERT:
  797. case LLM_ARCH_NOMIC_BERT_MOE:
  798. {
  799. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  800. ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
  801. ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
  802. ml.get_key(LLM_KV_MOE_EVERY_N_LAYERS, hparams.moe_every_n_layers, 0);
  803. if (hparams.n_layer == 12 && hparams.n_embd == 768) {
  804. if (arch == LLM_ARCH_NOMIC_BERT) {
  805. type = LLM_TYPE_137M;
  806. } else if (arch == LLM_ARCH_NOMIC_BERT_MOE && hparams.moe_every_n_layers == 2) {
  807. type = LLM_TYPE_475M;
  808. }
  809. }
  810. } break;
  811. case LLM_ARCH_NEO_BERT:
  812. {
  813. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  814. ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
  815. ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
  816. if (hparams.n_layer == 28) {
  817. type = LLM_TYPE_250M;
  818. }
  819. } break;
  820. case LLM_ARCH_BLOOM:
  821. {
  822. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  823. switch (hparams.n_layer) {
  824. case 24: type = LLM_TYPE_1B; break;
  825. case 30:
  826. switch (hparams.n_embd) {
  827. case 2560: type = LLM_TYPE_3B; break;
  828. case 4096: type = LLM_TYPE_7B; break;
  829. default: type = LLM_TYPE_UNKNOWN;
  830. } break;
  831. default: type = LLM_TYPE_UNKNOWN;
  832. }
  833. // TODO: become GGUF KV parameter
  834. hparams.f_max_alibi_bias = 8.0f;
  835. } break;
  836. case LLM_ARCH_MPT:
  837. {
  838. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  839. ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV, hparams.f_clamp_kqv, false);
  840. ml.get_key(LLM_KV_ATTENTION_MAX_ALIBI_BIAS, hparams.f_max_alibi_bias);
  841. switch (hparams.n_layer) {
  842. case 32: type = LLM_TYPE_7B; break;
  843. case 48: type = LLM_TYPE_30B; break;
  844. default: type = LLM_TYPE_UNKNOWN;
  845. }
  846. } break;
  847. case LLM_ARCH_STABLELM:
  848. {
  849. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  850. switch (hparams.n_layer) {
  851. case 24: type = LLM_TYPE_1B; break;
  852. case 32: type = LLM_TYPE_3B; break;
  853. case 40: type = LLM_TYPE_12B; break;
  854. default: type = LLM_TYPE_UNKNOWN;
  855. }
  856. } break;
  857. case LLM_ARCH_QWEN:
  858. {
  859. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  860. switch (hparams.n_layer) {
  861. case 32: type = LLM_TYPE_7B; break;
  862. case 40: type = LLM_TYPE_13B; break;
  863. default: type = LLM_TYPE_UNKNOWN;
  864. }
  865. } break;
  866. case LLM_ARCH_QWEN2VL:
  867. {
  868. ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, true);
  869. }
  870. // fall through
  871. case LLM_ARCH_QWEN2:
  872. {
  873. ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
  874. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  875. switch (hparams.n_layer) {
  876. case 24: type = hparams.n_embd == 1024 ? LLM_TYPE_0_5B : LLM_TYPE_1B; break;
  877. case 28: type = hparams.n_embd == 1536 ? LLM_TYPE_1_5B : LLM_TYPE_7B; break;
  878. case 32: type = LLM_TYPE_7B; break;
  879. case 36: type = LLM_TYPE_3B; break;
  880. case 40: type = hparams.n_head() == 20 ? LLM_TYPE_4B : LLM_TYPE_13B; break;
  881. case 48: type = LLM_TYPE_14B; break;
  882. case 64: type = LLM_TYPE_32B; break;
  883. case 80: type = LLM_TYPE_70B; break;
  884. default: type = LLM_TYPE_UNKNOWN;
  885. }
  886. } break;
  887. case LLM_ARCH_DREAM:
  888. {
  889. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  890. // Dream models are primarily 7B with 28 layers
  891. switch (hparams.n_layer) {
  892. case 28:
  893. type = LLM_TYPE_7B;
  894. break;
  895. default:
  896. type = LLM_TYPE_UNKNOWN;
  897. }
  898. // Set non-causal attention for diffusion models
  899. hparams.causal_attn = false;
  900. }
  901. break;
  902. case LLM_ARCH_LLADA:
  903. {
  904. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  905. // LLaDA-8B has 32 layers, similar to LLaMA but for diffusion
  906. switch (hparams.n_layer) {
  907. case 32:
  908. type = LLM_TYPE_8B;
  909. break;
  910. default:
  911. type = LLM_TYPE_UNKNOWN;
  912. }
  913. // Set non-causal attention for diffusion models
  914. hparams.causal_attn = false;
  915. }
  916. break;
  917. case LLM_ARCH_LLADA_MOE:
  918. {
  919. ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
  920. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  921. // diffusion language model uses non-causal attention
  922. hparams.causal_attn = false;
  923. switch (hparams.n_layer) {
  924. case 16: type = LLM_TYPE_A1_7B; break;
  925. default: type = LLM_TYPE_UNKNOWN;
  926. }
  927. } break;
  928. case LLM_ARCH_RND1:
  929. {
  930. ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
  931. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  932. switch (hparams.n_layer) {
  933. case 48: type = LLM_TYPE_30B_A3B; break;
  934. default: type = LLM_TYPE_UNKNOWN;
  935. }
  936. // Set non-causal attention for diffusion models
  937. hparams.causal_attn = false;
  938. } break;
  939. case LLM_ARCH_QWEN2MOE:
  940. {
  941. ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
  942. ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false);
  943. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  944. switch (hparams.n_layer) {
  945. case 24: type = LLM_TYPE_A2_7B; break;
  946. case 28: type = LLM_TYPE_57B_A14B; break;
  947. default: type = LLM_TYPE_UNKNOWN;
  948. }
  949. } break;
  950. case LLM_ARCH_QWEN3:
  951. {
  952. ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
  953. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  954. switch (hparams.n_layer) {
  955. case 28: type = hparams.n_embd == 1024 ? LLM_TYPE_0_6B : LLM_TYPE_1_7B; break;
  956. case 36: type = hparams.n_embd == 2560 ? LLM_TYPE_4B : LLM_TYPE_8B; break;
  957. case 40: type = LLM_TYPE_14B; break;
  958. case 64: type = LLM_TYPE_32B; break;
  959. default: type = LLM_TYPE_UNKNOWN;
  960. }
  961. } break;
  962. case LLM_ARCH_QWEN3VL:
  963. {
  964. ml.get_key(LLM_KV_NUM_DEEPSTACK_LAYERS, hparams.n_deepstack_layers, false);
  965. ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, true);
  966. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  967. switch (hparams.n_layer) {
  968. case 28: type = LLM_TYPE_1_7B; break;
  969. case 36: type = hparams.n_embd == 2560 ? LLM_TYPE_4B : LLM_TYPE_8B; break;
  970. case 64: type = LLM_TYPE_32B; break;
  971. default: type = LLM_TYPE_UNKNOWN;
  972. }
  973. } break;
  974. case LLM_ARCH_QWEN3MOE:
  975. {
  976. ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
  977. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  978. switch (hparams.n_layer) {
  979. case 48: type = LLM_TYPE_30B_A3B; break;
  980. case 94: type = LLM_TYPE_235B_A22B; break;
  981. default: type = LLM_TYPE_UNKNOWN;
  982. }
  983. } break;
  984. case LLM_ARCH_QWEN3VLMOE:
  985. {
  986. ml.get_key(LLM_KV_NUM_DEEPSTACK_LAYERS, hparams.n_deepstack_layers, false);
  987. ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, true);
  988. ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
  989. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  990. switch (hparams.n_layer) {
  991. case 48: type = LLM_TYPE_30B_A3B; break;
  992. case 94: type = LLM_TYPE_235B_A22B; break;
  993. default: type = LLM_TYPE_UNKNOWN;
  994. }
  995. } break;
  996. case LLM_ARCH_PHI2:
  997. {
  998. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  999. switch (hparams.n_layer) {
  1000. case 24: type = LLM_TYPE_1B; break;
  1001. case 32: type = LLM_TYPE_3B; break;
  1002. default: type = LLM_TYPE_UNKNOWN;
  1003. }
  1004. } break;
  1005. case LLM_ARCH_PHI3:
  1006. {
  1007. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1008. switch (hparams.n_layer) {
  1009. case 24: type = LLM_TYPE_1B; break;
  1010. case 32: type = LLM_TYPE_3B; break;
  1011. case 40: type = LLM_TYPE_14B; break;
  1012. default: type = LLM_TYPE_UNKNOWN;
  1013. }
  1014. const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
  1015. if (found_swa && hparams.n_swa > 0) {
  1016. LLAMA_LOG_WARN("%s: Phi SWA is currently disabled - results might be suboptimal for some models (see %s)\n",
  1017. __func__, "https://github.com/ggml-org/llama.cpp/pull/13676");
  1018. // TODO: fix conversion scripts to correctly populate `n_swa` and `n_swa_pattern`
  1019. hparams.swa_type = LLAMA_SWA_TYPE_NONE;
  1020. hparams.n_swa = 0;
  1021. hparams.set_swa_pattern(1);
  1022. }
  1023. } break;
  1024. case LLM_ARCH_PHIMOE:
  1025. {
  1026. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1027. switch (hparams.n_layer) {
  1028. case 32: type = LLM_TYPE_16x3_8B; break;
  1029. default: type = LLM_TYPE_UNKNOWN;
  1030. }
  1031. } break;
  1032. case LLM_ARCH_PLAMO:
  1033. {
  1034. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1035. switch (hparams.n_layer) {
  1036. case 40: type = LLM_TYPE_13B; break;
  1037. default: type = LLM_TYPE_UNKNOWN;
  1038. }
  1039. } break;
  1040. case LLM_ARCH_PLAMO2:
  1041. {
  1042. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1043. // Load Mamba SSM parameters
  1044. ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv);
  1045. ml.get_key(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner);
  1046. ml.get_key(LLM_KV_SSM_STATE_SIZE, hparams.ssm_d_state);
  1047. ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
  1048. ml.get_key(LLM_KV_SSM_GROUP_COUNT, hparams.ssm_n_group);
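// hybrid model: layers without KV heads are the Mamba (recurrent) layers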
  1049. for (uint32_t i = 0; i < hparams.n_layer; ++i) {
  1050. hparams.recurrent_layer_arr[i] = hparams.n_head_kv(i) == 0;
  1051. }
  1052. switch (hparams.n_layer) {
  1053. case 16: type = LLM_TYPE_1B; break;
  1054. case 32:
  1055. if (hparams.n_embd == 2048) {
  1056. type = LLM_TYPE_2B;
  1057. } else if (hparams.n_embd == 4096) {
  1058. type = LLM_TYPE_8B;
  1059. }
  1060. break;
  1061. default: type = LLM_TYPE_UNKNOWN;
  1062. }
  1063. // Load attention parameters
  1064. ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH, hparams.n_embd_head_k, false);
  1065. ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH, hparams.n_embd_head_v, false);
  1066. } break;
  1067. case LLM_ARCH_GPT2:
  1068. {
  1069. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  1070. switch (hparams.n_layer) {
  1071. case 12: type = LLM_TYPE_SMALL; break;
  1072. case 24: type = LLM_TYPE_MEDIUM; break;
  1073. case 36: type = LLM_TYPE_LARGE; break;
  1074. case 48: type = LLM_TYPE_XL; break;
  1075. default: type = LLM_TYPE_UNKNOWN;
  1076. }
  1077. } break;
  1078. case LLM_ARCH_CODESHELL:
  1079. {
  1080. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  1081. switch (hparams.n_layer) {
  1082. case 42: type = LLM_TYPE_7B; break;
  1083. default: type = LLM_TYPE_UNKNOWN;
  1084. }
  1085. } break;
  1086. case LLM_ARCH_ORION:
  1087. {
  1088. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  1089. switch (hparams.n_layer) {
  1090. case 40: type = LLM_TYPE_14B; break;
  1091. default: type = LLM_TYPE_UNKNOWN;
  1092. }
  1093. } break;
  1094. case LLM_ARCH_INTERNLM2:
  1095. {
  1096. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1097. switch (hparams.n_layer) {
  1098. case 32: type = LLM_TYPE_7B; break;
  1099. case 48: type = LLM_TYPE_20B; break;
  1100. default: type = LLM_TYPE_UNKNOWN;
  1101. }
  1102. } break;
  1103. case LLM_ARCH_GEMMA:
  1104. {
  1105. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1106. switch (hparams.n_layer) {
  1107. case 18: type = LLM_TYPE_2B; break;
  1108. case 28: type = LLM_TYPE_7B; break;
  1109. default: type = LLM_TYPE_UNKNOWN;
  1110. }
  1111. } break;
  1112. case LLM_ARCH_GEMMA2:
  1113. {
  1114. hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
  1115. hparams.n_swa = 4096; // default value of gemma 2
  1116. hparams.set_swa_pattern(2);
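// pattern: 1 sliding - 1 full (alternating layers)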
  1117. hparams.attn_soft_cap = true;
  1118. ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
  1119. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1120. ml.get_key(LLM_KV_ATTN_LOGIT_SOFTCAPPING, hparams.f_attn_logit_softcapping, false);
  1121. ml.get_key(LLM_KV_FINAL_LOGIT_SOFTCAPPING, hparams.f_final_logit_softcapping, false);
  1122. switch (hparams.n_layer) {
  1123. case 26: type = LLM_TYPE_2B; break;
  1124. case 42: type = LLM_TYPE_9B; break;
  1125. case 46: type = LLM_TYPE_27B; break;
  1126. default: type = LLM_TYPE_UNKNOWN;
  1127. }
  1128. // ref: https://github.com/google/gemma_pytorch/blob/014acb7ac4563a5f77c76d7ff98f31b568c16508/gemma/config.py#L173
  1129. hparams.f_attention_scale = type == LLM_TYPE_27B
  1130. ? 1.0f / std::sqrt(float(hparams.n_embd / hparams.n_head(0)))
  1131. : 1.0f / std::sqrt(float(hparams.n_embd_head_k));
  1132. } break;
  1133. case LLM_ARCH_GEMMA3:
  1134. {
  1135. hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
  1136. hparams.set_swa_pattern(6);
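// pattern: 5 sliding - 1 full
// the sliding-window layers use their own (local) RoPE base and scale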
  1137. hparams.rope_freq_base_train_swa = 10000.0f;
  1138. hparams.rope_freq_scale_train_swa = 1.0f;
  1139. ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
  1140. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1141. switch (hparams.n_layer) {
  1142. case 18: type = LLM_TYPE_270M; break;
  1143. case 26: type = LLM_TYPE_1B; break;
  1144. case 34: type = LLM_TYPE_4B; break;
  1145. case 48: type = LLM_TYPE_12B; break;
  1146. case 62: type = LLM_TYPE_27B; break;
  1147. default: type = LLM_TYPE_UNKNOWN;
  1148. }
  1149. // ref: https://github.com/google/gemma_pytorch/blob/014acb7ac4563a5f77c76d7ff98f31b568c16508/gemma/config.py#L289
  1150. hparams.f_attention_scale = type == LLM_TYPE_27B
  1151. ? 1.0f / std::sqrt(float(hparams.n_embd / hparams.n_head(0)))
  1152. : 1.0f / std::sqrt(float(hparams.n_embd_head_k));
  1153. } break;
  1154. case LLM_ARCH_GEMMA3N:
  1155. {
  1156. hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
  1157. hparams.set_swa_pattern(5);
  1158. hparams.n_layer_kv_from_start = 20;
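// only the first 20 layers keep their own KV cache; later layers re-use KV from earlier ones (per-layer KV sharing)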
  1159. hparams.rope_freq_base_train_swa = 10000.0f;
  1160. hparams.rope_freq_scale_train_swa = 1.0f;
  1161. hparams.f_attention_scale = 1.0f;
  1162. ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
  1163. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1164. switch (hparams.n_layer) {
  1165. case 30: type = LLM_TYPE_E2B; break;
  1166. case 35: type = LLM_TYPE_E4B; break;
  1167. default: type = LLM_TYPE_UNKNOWN;
  1168. }
  1169. } break;
  1170. case LLM_ARCH_GEMMA_EMBEDDING:
  1171. {
  1172. hparams.swa_type = LLAMA_SWA_TYPE_SYMMETRIC;
  1173. hparams.set_swa_pattern(6);
  1174. hparams.causal_attn = false; // embeddings do not use causal attention
  1175. hparams.rope_freq_base_train_swa = 10000.0f;
  1176. hparams.rope_freq_scale_train_swa = 1.0f;
  1177. ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
  1178. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1179. ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
// applied only if the model was converted with --sentence-transformers-dense-modules
  1181. ml.get_key(LLM_KV_DENSE_2_FEAT_IN, hparams.dense_2_feat_in, false);
  1182. ml.get_key(LLM_KV_DENSE_2_FEAT_OUT, hparams.dense_2_feat_out, false);
  1183. ml.get_key(LLM_KV_DENSE_3_FEAT_IN, hparams.dense_3_feat_in, false);
  1184. ml.get_key(LLM_KV_DENSE_3_FEAT_OUT, hparams.dense_3_feat_out, false);
  1185. GGML_ASSERT((hparams.dense_2_feat_in == 0 || hparams.dense_2_feat_in == hparams.n_embd) && "dense_2_feat_in must be equal to n_embd");
  1186. GGML_ASSERT((hparams.dense_3_feat_out == 0 || hparams.dense_3_feat_out == hparams.n_embd) && "dense_3_feat_out must be equal to n_embd");
  1187. switch (hparams.n_layer) {
  1188. case 24: type = LLM_TYPE_0_3B; break;
  1189. default: type = LLM_TYPE_UNKNOWN;
  1190. }
  1191. hparams.f_attention_scale = 1.0f / std::sqrt(float(hparams.n_embd_head_k));
  1192. } break;
  1193. case LLM_ARCH_STARCODER2:
  1194. {
  1195. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  1196. switch (hparams.n_layer) {
  1197. case 30: type = LLM_TYPE_3B; break;
  1198. case 32: type = LLM_TYPE_7B; break;
  1199. case 40: type = LLM_TYPE_15B; break;
  1200. case 52: type = LLM_TYPE_20B; break; // granite
  1201. case 88: type = LLM_TYPE_34B; break; // granite
  1202. default: type = LLM_TYPE_UNKNOWN;
  1203. }
  1204. } break;
  1205. case LLM_ARCH_MAMBA:
  1206. {
  1207. ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv);
  1208. ml.get_key(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner);
  1209. ml.get_key(LLM_KV_SSM_STATE_SIZE, hparams.ssm_d_state);
  1210. ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
  1211. ml.get_key(LLM_KV_SSM_DT_B_C_RMS, hparams.ssm_dt_b_c_rms, false);
  1212. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1213. switch (hparams.n_layer) {
  1214. case 24:
  1215. switch (hparams.n_embd) {
  1216. case 768: type = LLM_TYPE_SMALL; break;
  1217. default: type = LLM_TYPE_UNKNOWN;
  1218. } break;
  1219. case 48:
  1220. switch (hparams.n_embd) {
  1221. case 1024: type = LLM_TYPE_MEDIUM; break;
  1222. case 1536: type = LLM_TYPE_LARGE; break;
  1223. case 2048: type = LLM_TYPE_XL; break;
  1224. default: type = LLM_TYPE_UNKNOWN;
  1225. } break;
  1226. case 64:
  1227. switch (hparams.n_embd) {
  1228. case 2560: type = LLM_TYPE_3B; break;
  1229. default: type = LLM_TYPE_UNKNOWN;
  1230. } break;
  1231. default: type = LLM_TYPE_UNKNOWN;
  1232. }
  1233. } break;
  1234. case LLM_ARCH_MAMBA2:
  1235. {
  1236. ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv);
  1237. ml.get_key(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner);
  1238. ml.get_key(LLM_KV_SSM_STATE_SIZE, hparams.ssm_d_state);
  1239. ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
  1240. ml.get_key(LLM_KV_SSM_GROUP_COUNT, hparams.ssm_n_group);
  1241. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1242. switch (hparams.n_layer) {
  1243. case 24:
  1244. switch (hparams.n_embd) {
  1245. case 768: type = LLM_TYPE_SMALL; break;
  1246. default: type = LLM_TYPE_UNKNOWN;
  1247. } break;
  1248. case 48:
  1249. switch (hparams.n_embd) {
  1250. case 1024: type = LLM_TYPE_MEDIUM; break;
  1251. case 1536: type = LLM_TYPE_LARGE; break;
  1252. case 2048: type = LLM_TYPE_XL; break;
  1253. default: type = LLM_TYPE_UNKNOWN;
  1254. } break;
  1255. case 64:
  1256. switch (hparams.n_embd) {
  1257. case 2560: type = LLM_TYPE_3B; break;
  1258. case 4096: type = LLM_TYPE_7B; break;
  1259. default: type = LLM_TYPE_UNKNOWN;
  1260. } break;
  1261. default: type = LLM_TYPE_UNKNOWN;
  1262. }
  1263. } break;
  1264. case LLM_ARCH_JAMBA:
  1265. {
  1266. ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv);
  1267. ml.get_key(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner);
  1268. ml.get_key(LLM_KV_SSM_STATE_SIZE, hparams.ssm_d_state);
  1269. ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
  1270. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
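// attention layers have KV heads; layers without them are the recurrent Mamba blocks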
  1271. for (uint32_t i = 0; i < hparams.n_layer; ++i) {
  1272. hparams.recurrent_layer_arr[i] = hparams.n_head_kv(i) == 0;
  1273. }
  1274. switch (hparams.n_layer) {
// TODO: Jamba layers are a bit heterogeneous, so naming this is hard.
  1276. case 12: // 900M 8x???M
  1277. case 32: // 51B 16x?B
  1278. default: type = LLM_TYPE_UNKNOWN;
  1279. }
  1280. } break;
  1281. case LLM_ARCH_XVERSE:
  1282. {
  1283. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1284. switch (hparams.n_layer) {
  1285. case 32: type = LLM_TYPE_7B; break;
  1286. case 40: type = LLM_TYPE_13B; break;
  1287. case 80: type = LLM_TYPE_65B; break;
  1288. default: type = LLM_TYPE_UNKNOWN;
  1289. }
  1290. } break;
  1291. case LLM_ARCH_COMMAND_R:
  1292. {
  1293. ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
  1294. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  1295. switch (hparams.n_layer) {
  1296. case 40: type = LLM_TYPE_35B; break;
  1297. default: type = LLM_TYPE_UNKNOWN;
  1298. }
  1299. } break;
  1300. case LLM_ARCH_COHERE2:
  1301. {
  1302. hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
  1303. hparams.set_swa_pattern(4);
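// pattern: 3 sliding - 1 full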
  1304. ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
  1305. ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
  1306. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  1307. switch (hparams.n_layer) {
  1308. case 32: type = LLM_TYPE_8B; break;
  1309. default: type = LLM_TYPE_UNKNOWN;
  1310. }
  1311. } break;
  1312. case LLM_ARCH_DBRX:
  1313. {
  1314. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  1315. ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV, hparams.f_clamp_kqv);
  1316. switch (hparams.n_layer) {
  1317. case 40: type = LLM_TYPE_16x12B; break;
  1318. default: type = LLM_TYPE_UNKNOWN;
  1319. }
  1320. } break;
  1321. case LLM_ARCH_OLMO:
  1322. {
  1323. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  1324. ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV, hparams.f_clamp_kqv, false);
  1325. switch (hparams.n_layer) {
  1326. case 22: type = LLM_TYPE_1B; break;
  1327. case 32: type = LLM_TYPE_7B; break;
  1328. case 80: type = LLM_TYPE_70B; break;
  1329. default: type = LLM_TYPE_UNKNOWN;
  1330. }
  1331. } break;
  1332. case LLM_ARCH_OLMO2:
  1333. {
  1334. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1335. const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
  1336. if (found_swa && hparams.n_swa > 0) {
  1337. hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
  1338. hparams.set_swa_pattern(4);
  1339. } else {
  1340. hparams.swa_type = LLAMA_SWA_TYPE_NONE;
  1341. }
  1342. switch (hparams.n_layer) {
  1343. case 16: type = LLM_TYPE_1B; break;
  1344. case 32: type = LLM_TYPE_7B; break;
  1345. case 40: type = LLM_TYPE_13B; break;
  1346. case 64: type = LLM_TYPE_32B; break;
  1347. default: type = LLM_TYPE_UNKNOWN;
  1348. }
  1349. } break;
  1350. case LLM_ARCH_SEED_OSS:
  1351. {
  1352. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1353. switch (hparams.n_layer) {
  1354. case 64: type = LLM_TYPE_36B; break;
  1355. default: type = LLM_TYPE_UNKNOWN;
  1356. }
  1357. } break;
  1358. case LLM_ARCH_OLMOE:
  1359. {
  1360. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1361. switch (hparams.n_layer) {
  1362. case 16: type = LLM_TYPE_A1_7B; break;
  1363. default: type = LLM_TYPE_UNKNOWN;
  1364. }
  1365. } break;
  1366. case LLM_ARCH_OPENELM:
  1367. {
  1368. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1369. switch (hparams.n_layer) {
  1370. case 16: type = LLM_TYPE_270M; break;
  1371. case 20: type = LLM_TYPE_450M; break;
  1372. case 28: type = LLM_TYPE_1B; break;
  1373. case 36: type = LLM_TYPE_3B; break;
  1374. default: type = LLM_TYPE_UNKNOWN;
  1375. }
  1376. } break;
  1377. case LLM_ARCH_GPTNEOX:
  1378. {
  1379. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  1380. ml.get_key(LLM_KV_USE_PARALLEL_RESIDUAL, hparams.use_par_res);
  1381. switch (hparams.n_layer) {
  1382. case 6:
  1383. switch (hparams.n_ff()) {
  1384. case 512: type = LLM_TYPE_14M; break;
  1385. case 2048: type = LLM_TYPE_70M; break;
  1386. default: type = LLM_TYPE_UNKNOWN;
  1387. } break;
  1388. case 12:
  1389. switch (hparams.n_ff()) {
  1390. case 3072: type = LLM_TYPE_160M; break;
  1391. default: type = LLM_TYPE_UNKNOWN;
  1392. } break;
  1393. case 16:
  1394. switch (hparams.n_ff()) {
  1395. case 8192: type = LLM_TYPE_1B; break;
  1396. default: type = LLM_TYPE_UNKNOWN;
  1397. } break;
  1398. case 24:
  1399. switch (hparams.n_ff()) {
  1400. case 4096: type = LLM_TYPE_410M; break;
  1401. case 8192: type = LLM_TYPE_1_4B; break;
  1402. default: type = LLM_TYPE_UNKNOWN;
  1403. } break;
  1404. case 32:
  1405. switch (hparams.n_ff()) {
  1406. case 10240: type = LLM_TYPE_2_8B; break;
  1407. case 16384: type = LLM_TYPE_6_9B; break;
  1408. default: type = LLM_TYPE_UNKNOWN;
  1409. } break;
  1410. case 36:
  1411. switch (hparams.n_ff()) {
  1412. case 20480: type = LLM_TYPE_12B; break;
  1413. default: type = LLM_TYPE_UNKNOWN;
  1414. } break;
  1415. case 44:
  1416. switch (hparams.n_ff()) {
  1417. case 24576: type = LLM_TYPE_20B; break;
  1418. default: type = LLM_TYPE_UNKNOWN;
  1419. } break;
  1420. default: type = LLM_TYPE_UNKNOWN;
  1421. }
  1422. } break;
  1423. case LLM_ARCH_ARCTIC:
  1424. {
  1425. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1426. if (hparams.n_expert == 128) {
  1427. switch (hparams.n_layer) {
  1428. case 35: type = LLM_TYPE_10B_128x3_66B; break;
  1429. default: type = LLM_TYPE_UNKNOWN;
  1430. }
  1431. } else {
  1432. type = LLM_TYPE_UNKNOWN;
  1433. }
  1434. } break;
  1435. case LLM_ARCH_DEEPSEEK:
  1436. {
  1437. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1438. ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
  1439. ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
  1440. ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
  1441. ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
  1442. switch (hparams.n_layer) {
  1443. case 28: type = LLM_TYPE_20B; break;
  1444. default: type = LLM_TYPE_UNKNOWN;
  1445. }
  1446. } break;
  1447. case LLM_ARCH_DEEPSEEK2:
  1448. {
  1449. // lite variants include DeepSeek-V2-Lite, GigaChat3-10B-A1.8B
  1450. bool is_lite = (hparams.n_layer == 27 || hparams.n_layer == 26);
  1451. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1452. ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
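// lite variants have no low-rank Q projection, so there is no Q LoRA rank to read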
  1453. if (!is_lite) {
  1454. ml.get_key(LLM_KV_ATTENTION_Q_LORA_RANK, hparams.n_lora_q);
  1455. }
  1456. ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK, hparams.n_lora_kv);
  1457. ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH_MLA, hparams.n_embd_head_k_mla, false);
  1458. ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH_MLA, hparams.n_embd_head_v_mla, false);
  1459. ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
  1460. ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
  1461. ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
  1462. ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
  1463. ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false);
  1464. if (hparams.expert_gating_func == LLAMA_EXPERT_GATING_FUNC_TYPE_NONE) {
  1465. // for compatibility with existing DeepSeek V2 and V2.5 GGUFs
  1466. // that have no expert_gating_func model parameter set
  1467. hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX;
  1468. }
  1469. ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul, false);
  1470. switch (hparams.n_layer) {
  1471. case 27: type = LLM_TYPE_16B; break;
  1472. case 60: type = LLM_TYPE_236B; break;
  1473. case 61: type = LLM_TYPE_671B; break;
  1474. default: type = LLM_TYPE_UNKNOWN;
  1475. }
  1476. } break;
  1477. case LLM_ARCH_PLM:
  1478. {
  1479. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1480. ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK, hparams.n_lora_kv);
  1481. switch (hparams.n_layer) {
  1482. case 32: type = LLM_TYPE_1_8B; break;
  1483. default: type = LLM_TYPE_UNKNOWN;
  1484. }
  1485. } break;
  1486. case LLM_ARCH_CHATGLM:
  1487. {
  1488. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1489. switch (hparams.n_layer) {
  1490. case 28: {
  1491. if (hparams.n_head(0) == 16) {
  1492. type = LLM_TYPE_1_5B;
  1493. } else {
  1494. type = LLM_TYPE_6B;
  1495. }
  1496. } break;
  1497. case 40: {
  1498. if (hparams.n_head(0) == 24) {
  1499. type = LLM_TYPE_4B;
  1500. } else {
  1501. type = LLM_TYPE_9B;
  1502. }
  1503. } break;
  1504. default: type = LLM_TYPE_UNKNOWN;
  1505. }
  1506. } break;
  1507. case LLM_ARCH_GLM4:
  1508. {
  1509. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1510. switch (hparams.n_layer) {
  1511. case 40: type = LLM_TYPE_9B; break;
  1512. case 61: type = LLM_TYPE_32B; break;
  1513. default: type = LLM_TYPE_UNKNOWN;
  1514. }
  1515. } break;
  1516. case LLM_ARCH_GLM4_MOE:
  1517. {
  1518. ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
  1519. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1520. // MoE parameters
  1521. ml.get_key(LLM_KV_EXPERT_COUNT, hparams.n_expert);
  1522. ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used);
  1523. ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
  1524. ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead, false);
  1525. ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
  1526. ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
  1527. // Expert gating function (GLM-4.5 uses sigmoid)
  1528. ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false);
  1529. if (hparams.expert_gating_func == LLAMA_EXPERT_GATING_FUNC_TYPE_NONE) {
  1530. hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID;
  1531. }
  1532. // NextN/MTP parameters
  1533. ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.nextn_predict_layers, false);
// TODO: revisit this once MTP is implemented
  1535. hparams.n_layer_kv_from_start = hparams.n_layer - hparams.nextn_predict_layers;
  1536. switch (hparams.n_layer) {
  1537. case 47: type = LLM_TYPE_106B_A12B; break; // GLM-4.5-Air (46 layers + 1 NextN layer)
  1538. case 93: type = LLM_TYPE_355B_A32B; break; // GLM-4.5 (92 layers + 1 NextN layer)
  1539. default: type = LLM_TYPE_UNKNOWN;
  1540. }
  1541. } break;
  1542. case LLM_ARCH_BITNET:
  1543. {
  1544. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1545. switch (hparams.n_layer) {
  1546. case 26: type = LLM_TYPE_3B; break;
  1547. default: type = LLM_TYPE_UNKNOWN;
  1548. }
  1549. } break;
  1550. case LLM_ARCH_T5:
  1551. {
  1552. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1553. ml.get_key(LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, hparams.n_rel_attn_bkts);
  1554. uint32_t dec_start_token_id;
  1555. if (ml.get_key(LLM_KV_DECODER_START_TOKEN_ID, dec_start_token_id, false)) {
  1556. hparams.dec_start_token_id = dec_start_token_id;
  1557. }
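// the decoder defaults to the encoder's depth unless the GGUF stores a separate decoder block count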
  1558. hparams.dec_n_layer = hparams.n_layer;
  1559. ml.get_key(LLM_KV_DECODER_BLOCK_COUNT, hparams.dec_n_layer, false);
  1560. switch (hparams.n_layer) {
  1561. case 6: type = LLM_TYPE_60M; break; // t5-small
  1562. case 8: type = LLM_TYPE_80M; break; // flan-t5-small
  1563. case 12:
  1564. switch (hparams.n_ff()) {
  1565. case 3072: type = LLM_TYPE_220M; break; // t5-base
  1566. case 2048: type = LLM_TYPE_250M; break; // flan-t5-base
  1567. default: type = LLM_TYPE_UNKNOWN;
  1568. } break;
  1569. case 24:
  1570. switch (hparams.n_ff()) {
  1571. case 4096: type = LLM_TYPE_770M; break; // t5-large
  1572. case 2816: type = LLM_TYPE_780M; break; // flan-t5-large
  1573. case 16384: type = LLM_TYPE_3B; break; // t5-3b
  1574. case 5120: type = LLM_TYPE_3B; break; // flan-t5-xl
  1575. case 65536: type = LLM_TYPE_11B; break; // t5-11b
  1576. case 10240: type = LLM_TYPE_11B; break; // flan-t5-xxl
  1577. default: type = LLM_TYPE_UNKNOWN;
  1578. } break;
  1579. default: type = LLM_TYPE_UNKNOWN;
  1580. }
  1581. } break;
  1582. case LLM_ARCH_T5ENCODER:
  1583. {
  1584. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1585. ml.get_key(LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, hparams.n_rel_attn_bkts);
  1586. type = LLM_TYPE_UNKNOWN;
  1587. } break;
  1588. case LLM_ARCH_JAIS:
  1589. {
  1590. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  1591. ml.get_key(LLM_KV_ATTENTION_MAX_ALIBI_BIAS, hparams.f_max_alibi_bias);
  1592. switch (hparams.n_layer) {
  1593. case 24: type = LLM_TYPE_1_3B; break;
  1594. case 40: type = LLM_TYPE_13B; break;
  1595. /* TODO: add variants */
  1596. default: type = LLM_TYPE_UNKNOWN;
  1597. }
  1598. } break;
  1599. case LLM_ARCH_NEMOTRON:
  1600. {
  1601. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  1602. switch (hparams.n_layer) {
  1603. case 32: type = LLM_TYPE_4B; break;
  1604. default: type = LLM_TYPE_UNKNOWN;
  1605. }
  1606. } break;
  1607. case LLM_ARCH_NEMOTRON_H:
  1608. {
  1609. ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv);
  1610. ml.get_key(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner);
  1611. ml.get_key(LLM_KV_SSM_STATE_SIZE, hparams.ssm_d_state);
  1612. ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
  1613. ml.get_key(LLM_KV_SSM_GROUP_COUNT, hparams.ssm_n_group);
// A layer is recurrent iff both its n_head_kv and n_ff values are 0
  1616. for (uint32_t i = 0; i < hparams.n_layer; ++i) {
  1617. hparams.recurrent_layer_arr[i] = (hparams.n_head_kv(i) == 0 && hparams.n_ff(i) == 0);
  1618. }
  1619. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1620. switch (hparams.n_layer) {
  1621. case 56: type = LLM_TYPE_9B; break;
  1622. default: type = LLM_TYPE_UNKNOWN;
  1623. }
  1624. } break;
  1625. case LLM_ARCH_EXAONE:
  1626. {
  1627. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1628. switch (hparams.n_layer) {
  1629. case 32: type = LLM_TYPE_8B; break;
  1630. default: type = LLM_TYPE_UNKNOWN;
  1631. }
  1632. } break;
  1633. case LLM_ARCH_EXAONE4:
  1634. {
  1635. if (hparams.n_layer == 64) { // 32B
  1636. hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
  1637. hparams.n_swa = 4096;
  1638. hparams.set_swa_pattern(4);
  1639. }
  1640. ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
  1641. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1642. switch (hparams.n_layer) {
  1643. case 30: type = LLM_TYPE_1_2B; break;
  1644. case 64: type = LLM_TYPE_32B; break;
  1645. default: type = LLM_TYPE_UNKNOWN;
  1646. }
  1647. } break;
  1648. case LLM_ARCH_RWKV6:
  1649. case LLM_ARCH_RWKV6QWEN2:
  1650. {
  1651. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps, false);
  1652. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps, false);
  1653. ml.get_key(LLM_KV_WKV_HEAD_SIZE, hparams.wkv_head_size);
  1654. ml.get_key(LLM_KV_TIME_MIX_EXTRA_DIM, hparams.time_mix_extra_dim);
  1655. ml.get_key(LLM_KV_TIME_DECAY_EXTRA_DIM, hparams.time_decay_extra_dim);
  1656. ml.get_key(LLM_KV_RESCALE_EVERY_N_LAYERS, hparams.rescale_every_n_layers, false);
  1657. ml.get_key(LLM_KV_TOKEN_SHIFT_COUNT, hparams.token_shift_count, false);
  1658. switch (hparams.n_layer) {
  1659. case 24: type = LLM_TYPE_1_6B; break;
  1660. case 32:
  1661. switch (hparams.n_embd) {
  1662. case 2560: type = LLM_TYPE_3B; break;
  1663. case 4096: type = LLM_TYPE_7B; break;
  1664. default: type = LLM_TYPE_UNKNOWN;
  1665. } break;
  1666. case 61: type = LLM_TYPE_14B; break;
  1667. case 64: type = LLM_TYPE_32B; break;
  1668. default: type = LLM_TYPE_UNKNOWN;
  1669. }
  1670. } break;
  1671. case LLM_ARCH_RWKV7:
  1672. case LLM_ARCH_ARWKV7:
  1673. {
  1674. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps, false);
  1675. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps, false);
  1676. ml.get_key(LLM_KV_WKV_HEAD_SIZE, hparams.wkv_head_size);
  1677. ml.get_key(LLM_KV_ATTENTION_DECAY_LORA_RANK, hparams.n_lora_decay);
  1678. ml.get_key(LLM_KV_ATTENTION_ICLR_LORA_RANK, hparams.n_lora_iclr);
  1679. ml.get_key(LLM_KV_ATTENTION_VALUE_RESIDUAL_MIX_LORA_RANK, hparams.n_lora_value_res_mix);
  1680. ml.get_key(LLM_KV_ATTENTION_GATE_LORA_RANK, hparams.n_lora_gate, false);
  1681. ml.get_key(LLM_KV_TOKEN_SHIFT_COUNT, hparams.token_shift_count, false);
  1682. switch (hparams.n_layer) {
  1683. case 12:
  1684. switch (hparams.n_embd) {
  1685. case 768: type = LLM_TYPE_190M; break;
  1686. default: type = LLM_TYPE_UNKNOWN;
  1687. } break;
  1688. case 24:
  1689. switch (hparams.n_embd) {
  1690. case 1024: type = LLM_TYPE_450M; break;
  1691. case 2048: type = LLM_TYPE_1_5B; break;
  1692. default: type = LLM_TYPE_UNKNOWN;
  1693. } break;
  1694. case 28:
  1695. switch (hparams.n_embd) {
  1696. case 1536: type = LLM_TYPE_1_5B; break;
  1697. case 3584: type = LLM_TYPE_7B; break;
  1698. default: type = LLM_TYPE_UNKNOWN;
  1699. } break;
  1700. case 32:
  1701. switch (hparams.n_embd) {
  1702. case 2560: type = LLM_TYPE_2_9B; break;
  1703. case 4096: type = LLM_TYPE_7B; break;
  1704. default: type = LLM_TYPE_UNKNOWN;
  1705. } break;
  1706. case 61:
  1707. switch (hparams.n_embd) {
  1708. case 4096: type = LLM_TYPE_14B; break;
  1709. default: type = LLM_TYPE_UNKNOWN;
  1710. } break;
  1711. default: type = LLM_TYPE_UNKNOWN;
  1712. }
  1713. } break;
  1714. case LLM_ARCH_GRANITE:
  1715. case LLM_ARCH_GRANITE_MOE:
  1716. {
  1717. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1718. ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
  1719. ml.get_key(LLM_KV_RESIDUAL_SCALE, hparams.f_residual_scale);
  1720. ml.get_key(LLM_KV_EMBEDDING_SCALE, hparams.f_embedding_scale);
  1721. ml.get_key(LLM_KV_ATTENTION_SCALE, hparams.f_attention_scale);
  1722. // Granite uses rope_finetuned as a switch for rope, so default to true
  1723. bool rope_finetuned = true;
  1724. ml.get_key(LLM_KV_ROPE_SCALING_FINETUNED, rope_finetuned, false);
  1725. hparams.rope_finetuned = rope_finetuned;
  1726. switch (hparams.n_layer) {
  1727. case 32: type = LLM_TYPE_3B; break;
  1728. case 40: type = LLM_TYPE_3B; break;
  1729. // Add additional layer/vocab/etc checks here for other model sizes
  1730. default: type = LLM_TYPE_UNKNOWN;
  1731. }
  1732. // For Granite MoE Shared
  1733. ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, /* required */ false);
  1734. } break;
  1735. case LLM_ARCH_GRANITE_HYBRID:
  1736. {
  1737. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1738. ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale, /* required */ false);
  1739. ml.get_key(LLM_KV_RESIDUAL_SCALE, hparams.f_residual_scale, /* required */ false);
  1740. ml.get_key(LLM_KV_EMBEDDING_SCALE, hparams.f_embedding_scale, /* required */ false);
  1741. ml.get_key(LLM_KV_ATTENTION_SCALE, hparams.f_attention_scale, /* required */ false);
  1742. ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv);
  1743. ml.get_key(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner);
  1744. ml.get_key(LLM_KV_SSM_STATE_SIZE, hparams.ssm_d_state);
  1745. ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
  1746. ml.get_key(LLM_KV_SSM_GROUP_COUNT, hparams.ssm_n_group);
  1747. // Granite uses rope_finetuned as a switch for rope, so default to true
  1748. bool rope_finetuned = true;
  1749. ml.get_key(LLM_KV_ROPE_SCALING_FINETUNED, rope_finetuned, false);
  1750. hparams.rope_finetuned = rope_finetuned;
  1751. // A layer is recurrent IFF the n_head_kv value is set to 0
  1752. for (uint32_t i = 0; i < hparams.n_layer; ++i) {
  1753. hparams.recurrent_layer_arr[i] = hparams.n_head_kv(i) == 0;
  1754. }
  1755. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1756. switch (hparams.n_embd) {
  1757. case 768: type = LLM_TYPE_350M; break;
case 1536: type = (hparams.n_expert > 0 ? LLM_TYPE_7B_A1B : LLM_TYPE_1B); break; // MoE vs dense variant at this width
  1759. case 2048: case 2560: type = LLM_TYPE_3B; break;
  1760. case 4096: type = LLM_TYPE_32B; break;
  1761. default: type = LLM_TYPE_UNKNOWN;
  1762. }
  1763. // For Granite MoE Shared
  1764. ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, /* required */ false);
  1765. } break;
  1766. case LLM_ARCH_CHAMELEON:
  1767. {
  1768. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1769. hparams.f_norm_eps = 1e-5; // eps for qk-norm, torch default
  1770. ml.get_key(LLM_KV_SWIN_NORM, hparams.swin_norm);
  1771. switch (hparams.n_layer) {
  1772. case 32: type = LLM_TYPE_7B; break;
  1773. case 48: type = LLM_TYPE_34B; break;
  1774. default: type = LLM_TYPE_UNKNOWN;
  1775. }
  1776. } break;
  1777. case LLM_ARCH_WAVTOKENIZER_DEC:
  1778. {
  1779. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  1780. ml.get_key(LLM_KV_ATTENTION_GROUPNORM_EPS, hparams.f_norm_group_eps);
  1781. ml.get_key(LLM_KV_ATTENTION_GROUPNORM_GROUPS, hparams.n_norm_groups);
  1782. ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
  1783. } break;
  1784. case LLM_ARCH_BAILINGMOE:
  1785. {
  1786. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1787. ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
  1788. ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
  1789. ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
  1790. ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
  1791. ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
  1792. switch (hparams.n_layer) {
  1793. case 28: type = LLM_TYPE_16B; break;
  1794. case 88: type = LLM_TYPE_290B; break;
  1795. default: type = LLM_TYPE_UNKNOWN;
  1796. }
  1797. } break;
  1798. case LLM_ARCH_BAILINGMOE2:
  1799. {
  1800. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1801. ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
  1802. ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
  1803. ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp);
  1804. ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
  1805. ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
  1806. ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
  1807. ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func);
  1808. ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.nextn_predict_layers, false);
// TODO: revisit this once MTP is implemented
  1810. hparams.n_layer_kv_from_start = hparams.n_layer - hparams.nextn_predict_layers;
  1811. switch (hparams.n_layer) {
  1812. case 20: type = LLM_TYPE_16B_A1B; break;
  1813. case 21: type = LLM_TYPE_16B_A1B; break;
  1814. case 32: type = LLM_TYPE_100B_A6B; break;
  1815. case 33: type = LLM_TYPE_100B_A6B; break;
  1816. default: type = LLM_TYPE_UNKNOWN;
  1817. }
  1818. } break;
  1819. case LLM_ARCH_DOTS1:
  1820. {
  1821. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1822. ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
  1823. ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
  1824. ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
  1825. ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
  1826. ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
  1827. ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false);
  1828. switch (hparams.n_layer) {
  1829. case 62: type = LLM_TYPE_142B; break;
  1830. default: type = LLM_TYPE_UNKNOWN;
  1831. }
  1832. } break;
  1833. case LLM_ARCH_ERNIE4_5:
  1834. case LLM_ARCH_ERNIE4_5_MOE:
  1835. {
  1836. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1837. if (arch == LLM_ARCH_ERNIE4_5_MOE) {
  1838. ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
  1839. ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false);
  1840. ml.get_key(LLM_KV_INTERLEAVE_MOE_LAYER_STEP, hparams.n_moe_layer_step);
  1841. ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
  1842. }
  1843. switch (hparams.n_layer) {
  1844. case 18: type = LLM_TYPE_0_3B; break;
  1845. case 28: type = LLM_TYPE_21B_A3B; break;
  1846. case 54: type = LLM_TYPE_300B_A47B; break;
  1847. default: type = LLM_TYPE_UNKNOWN;
  1848. }
  1849. } break;
  1850. case LLM_ARCH_FALCON_H1:
  1851. {
  1852. // Common parameters
  1853. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1854. // SSM parameters
  1855. ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv);
  1856. ml.get_key(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner);
  1857. ml.get_key(LLM_KV_SSM_STATE_SIZE, hparams.ssm_d_state);
  1858. ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
  1859. ml.get_key(LLM_KV_SSM_GROUP_COUNT, hparams.ssm_n_group);
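// every Falcon-H1 layer combines attention with an SSM branch, so all layers are marked recurrent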
  1860. std::fill(hparams.recurrent_layer_arr.begin(), hparams.recurrent_layer_arr.end(), true);
  1861. switch (hparams.n_layer) {
  1862. case 36:
  1863. type = LLM_TYPE_0_5B; break;
  1864. case 24:
  1865. type = LLM_TYPE_1_5B; break;
  1866. case 66:
  1867. type = LLM_TYPE_1B; break;
  1868. case 32:
  1869. type = LLM_TYPE_3B; break;
  1870. case 44:
  1871. type = LLM_TYPE_7B; break;
  1872. case 72:
  1873. type = LLM_TYPE_34B; break;
  1874. default:
  1875. type = LLM_TYPE_UNKNOWN;
  1876. }
  1877. } break;
  1878. case LLM_ARCH_HUNYUAN_MOE:
  1879. {
  1880. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1881. ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
  1882. ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp);
  1883. switch (hparams.n_layer) {
  1884. case 32: type = LLM_TYPE_A13B; break;
  1885. default: type = LLM_TYPE_UNKNOWN;
  1886. }
  1887. } break;
  1888. case LLM_ARCH_HUNYUAN_DENSE:
  1889. {
  1890. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1891. switch (hparams.n_embd) {
  1892. case 1024: type = LLM_TYPE_0_5B; break;
  1893. case 2048: type = LLM_TYPE_1_8B; break;
  1894. case 3072: type = LLM_TYPE_4B; break;
  1895. case 4096: type = LLM_TYPE_7B; break;
  1896. default: type = LLM_TYPE_UNKNOWN;
  1897. }
  1898. } break;
  1899. case LLM_ARCH_SMOLLM3:
  1900. {
  1901. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
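// every 4th layer skips RoPE (NoPE layers)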
  1902. hparams.n_no_rope_layer_step = 4;
  1903. switch (hparams.n_layer) {
  1904. case 36: type = LLM_TYPE_3B; break;
  1905. default: type = LLM_TYPE_UNKNOWN;
  1906. }
  1907. } break;
  1908. case LLM_ARCH_OPENAI_MOE:
  1909. {
  1910. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1911. ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
  1912. ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
  1913. hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
  1914. hparams.set_swa_pattern(2);
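// pattern: alternating sliding-window and full-attention layers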
  1915. switch (hparams.n_layer) {
  1916. case 24: type = LLM_TYPE_20B; break;
  1917. case 36: type = LLM_TYPE_120B; break;
  1918. default: type = LLM_TYPE_UNKNOWN;
  1919. }
  1920. } break;
  1921. case LLM_ARCH_LFM2:
  1922. {
  1923. ml.get_key(LLM_KV_SHORTCONV_L_CACHE, hparams.n_shortconv_l_cache);
  1924. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
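// layers without KV heads are the gated short-convolution (recurrent) layers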
  1925. for (uint32_t il = 0; il < hparams.n_layer; ++il) {
  1926. hparams.recurrent_layer_arr[il] = hparams.n_head_kv(il) == 0;
  1927. }
  1928. hparams.n_layer_dense_lead = hparams.n_layer;
  1929. switch (hparams.n_ff()) {
  1930. case 4608: type = LLM_TYPE_350M; break;
  1931. case 6912: type = LLM_TYPE_700M; break;
  1932. case 8192: type = LLM_TYPE_1_2B; break;
  1933. case 10752: type = LLM_TYPE_2_6B; break;
  1934. default: type = LLM_TYPE_UNKNOWN;
  1935. }
  1936. } break;
  1937. case LLM_ARCH_LFM2MOE:
  1938. {
  1939. ml.get_key(LLM_KV_SHORTCONV_L_CACHE, hparams.n_shortconv_l_cache);
  1940. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1941. ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
  1942. ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
  1943. ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func);
  1944. for (uint32_t il = 0; il < hparams.n_layer; ++il) {
  1945. hparams.recurrent_layer_arr[il] = hparams.n_head_kv(il) == 0;
  1946. }
  1947. type = LLM_TYPE_8B_A1B;
  1948. } break;
  1949. case LLM_ARCH_SMALLTHINKER:
  1950. {
  1951. const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
  1952. if (found_swa && hparams.n_swa > 0) {
  1953. hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
  1954. hparams.n_swa = 4096;
  1955. hparams.set_swa_pattern(4, true);
  1956. } else {
  1957. hparams.swa_type = LLAMA_SWA_TYPE_NONE;
  1958. hparams.n_no_rope_layer_step = hparams.n_layer;
  1959. }
  1960. ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
  1961. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1962. ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false);
  1963. switch (hparams.n_layer) {
  1964. case 32: type = LLM_TYPE_4B; break;
  1965. case 52: type = LLM_TYPE_20B; break;
  1966. default: type = LLM_TYPE_UNKNOWN;
  1967. }
  1968. } break;
  1969. case LLM_ARCH_GROVEMOE:
  1970. {
  1971. ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
  1972. ml.get_key(LLM_KV_EXPERT_CHUNK_FEED_FORWARD_LENGTH, hparams.n_ff_chexp);
  1973. ml.get_key(LLM_KV_EXPERT_GROUP_SCALE, hparams.expert_group_scale);
  1974. ml.get_key(LLM_KV_EXPERTS_PER_GROUP, hparams.n_group_experts);
  1975. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1976. switch (hparams.n_layer) {
  1977. case 48: type = LLM_TYPE_30B_A3B; break;
  1978. default: type = LLM_TYPE_UNKNOWN;
  1979. }
  1980. } break;
  1981. case LLM_ARCH_APERTUS:
  1982. {
  1983. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1984. ml.get_key_or_arr(LLM_KV_XIELU_ALPHA_N, hparams.xielu_alpha_n, hparams.n_layer);
  1985. ml.get_key_or_arr(LLM_KV_XIELU_ALPHA_P, hparams.xielu_alpha_p, hparams.n_layer);
  1986. ml.get_key_or_arr(LLM_KV_XIELU_BETA, hparams.xielu_beta, hparams.n_layer);
  1987. ml.get_key_or_arr(LLM_KV_XIELU_EPS, hparams.xielu_eps, hparams.n_layer);
  1988. switch (hparams.n_layer) {
  1989. case 32: type = LLM_TYPE_8B; break;
  1990. default: type = LLM_TYPE_UNKNOWN;
  1991. }
  1992. } break;
  1993. case LLM_ARCH_MINIMAX_M2:
  1994. {
  1995. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1996. ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
  1997. ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false);
  1998. switch (hparams.n_layer) {
  1999. case 62: type = LLM_TYPE_230B_A10B; break;
  2000. default: type = LLM_TYPE_UNKNOWN;
  2001. }
  2002. } break;
  2003. case LLM_ARCH_COGVLM:
  2004. {
  2005. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  2006. switch (hparams.n_layer) {
  2007. case 32: type = LLM_TYPE_13B; break;
  2008. default: type = LLM_TYPE_UNKNOWN;
  2009. }
  2010. } break;
  2011. case LLM_ARCH_PANGU_EMBED:
  2012. {
  2013. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  2014. switch (hparams.n_layer) {
  2015. case 26: type = LLM_TYPE_1B; break; // openPangu-Embedded-1B-V1.1
  2016. case 34: type = LLM_TYPE_7B; break; // openPangu-Embedded-7B-V1.1
  2017. default: type = LLM_TYPE_UNKNOWN;
  2018. }
  2019. } break;
  2020. default: throw std::runtime_error("unsupported model architecture");
  2021. }
  2022. pimpl->n_bytes = ml.n_bytes;
  2023. pimpl->desc_str = arch_name() + " " + type_name() + " " + ml.ftype_name();
  2024. if (hparams.f_max_alibi_bias > 0.0f) {
  2025. hparams.use_alibi = true;
  2026. }
  2027. hparams.rope_type = llama_model_rope_type(this);
  2028. }
  2029. void llama_model::load_vocab(llama_model_loader & ml) {
  2030. const auto kv = LLM_KV(arch);
  2031. vocab.load(ml, kv);
  2032. }
  2033. bool llama_model::load_tensors(llama_model_loader & ml) {
  2034. const auto & split_mode = params.split_mode;
  2035. const auto & n_gpu_layers = params.n_gpu_layers;
  2036. const auto & use_mlock = params.use_mlock;
  2037. const auto & tensor_split = params.tensor_split;
  2038. const int n_layer = hparams.n_layer;
  2039. const bool use_mmap_buffer = true;
  2040. LLAMA_LOG_INFO("%s: loading model tensors, this can take a while... (mmap = %s)\n", __func__, ml.use_mmap ? "true" : "false");
  2041. // build a list of buffer types for the CPU and GPU devices
  2042. pimpl->cpu_buft_list = make_cpu_buft_list(devices, params.use_extra_bufts, params.no_host);
  2043. for (auto * dev : devices) {
  2044. buft_list_t buft_list = make_gpu_buft_list(dev, split_mode, tensor_split);
  2045. // add CPU buffer types as a fallback
  2046. buft_list.insert(buft_list.end(), pimpl->cpu_buft_list.begin(), pimpl->cpu_buft_list.end());
  2047. pimpl->gpu_buft_list.emplace(dev, std::move(buft_list));
  2048. }
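// each device now has an ordered preference list: its own buffer types first,
// followed by the CPU buffer types, so a tensor that no GPU buffer type can
// handle still has somewhere to fall back to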
  2049. // calculate the split points
  2050. bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + n_devices(), [](float x) { return x == 0.0f; });
  2051. std::vector<float> splits(n_devices());
  2052. if (all_zero) {
  2053. // default split, by free memory
  2054. for (size_t i = 0; i < n_devices(); ++i) {
  2055. ggml_backend_dev_t dev = devices[i];
  2056. size_t total;
  2057. size_t free;
  2058. ggml_backend_dev_memory(dev, &free, &total);
  2059. splits[i] = free;
  2060. }
  2061. } else {
  2062. std::copy(tensor_split, tensor_split + n_devices(), splits.begin());
  2063. }
  2064. // sum and normalize the splits to get the split points
  2065. float split_sum = 0.0f;
  2066. for (size_t i = 0; i < n_devices(); ++i) {
  2067. split_sum += splits[i];
  2068. splits[i] = split_sum;
  2069. }
  2070. for (size_t i = 0; i < n_devices(); ++i) {
  2071. splits[i] /= split_sum;
  2072. }
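// example: with tensor_split == nullptr and two devices reporting 8 GiB and
// 24 GiB of free memory, splits starts as {8, 24}, the prefix sum turns it
// into {8, 32} and the normalization into {0.25, 1.0}, i.e. the first device
// takes the first quarter of the offloaded layers and the second one the rest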
  2073. ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
  2074. if (cpu_dev == nullptr) {
  2075. throw std::runtime_error(format("%s: no CPU backend found", __func__));
  2076. }
  2077. const int i_gpu_start = std::max((int) hparams.n_layer - n_gpu_layers, (int) 0);
  2078. const int act_gpu_layers = devices.empty() ? 0 : std::min(n_gpu_layers, (int)n_layer + 1);
  2079. auto get_layer_buft_list = [&](int il) -> llama_model::impl::layer_dev {
  2080. const bool is_swa = il < (int) hparams.n_layer && hparams.is_swa(il);
  2081. if (il < i_gpu_start || (il - i_gpu_start) >= act_gpu_layers) {
  2082. LLAMA_LOG_DEBUG("load_tensors: layer %3d assigned to device %s, is_swa = %d\n", il, ggml_backend_dev_name(cpu_dev), is_swa);
  2083. return {cpu_dev, &pimpl->cpu_buft_list};
  2084. }
  2085. const int layer_gpu = std::upper_bound(splits.begin(), splits.begin() + n_devices(), float(il - i_gpu_start)/act_gpu_layers) - splits.begin();
  2086. auto * dev = devices.at(layer_gpu);
  2087. LLAMA_LOG_DEBUG("load_tensors: layer %3d assigned to device %s, is_swa = %d\n", il, ggml_backend_dev_name(dev), is_swa);
  2088. return {dev, &pimpl->gpu_buft_list.at(dev)};
  2089. };
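// example: with splits == {0.25, 1.0}, i_gpu_start == 0 and act_gpu_layers == 32,
// layer 7 maps to the fraction 7/32 ~= 0.22 and is placed on device 0, while
// layer 8 maps to 8/32 == 0.25 and upper_bound assigns it to device 1; the
// output layer is requested below with il == n_layer and is only offloaded
// when n_gpu_layers exceeds the number of repeating layers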
  2090. // assign the input layer
  2091. // there is very little benefit to offloading the input layer, so always keep it on the CPU
  2092. pimpl->dev_input = { cpu_dev, &pimpl->cpu_buft_list };
  2093. // assign the repeating layers to the devices according to the splits
  2094. pimpl->dev_layer.resize(n_layer);
  2095. for (int il = 0; il < n_layer; ++il) {
  2096. pimpl->dev_layer[il] = get_layer_buft_list(il);
  2097. }
  2098. // assign the output layer
  2099. pimpl->dev_output = get_layer_buft_list(n_layer);
  2100. // one ggml context per buffer type
  2101. int max_n_tensors = ml.n_tensors;
  2102. max_n_tensors += 1; // duplicated output tensor
  2103. max_n_tensors += n_layer*2; // duplicated rope freq tensors
  2104. const size_t ctx_size = ggml_tensor_overhead()*max_n_tensors;
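// the contexts created below are metadata-only (no_alloc == true), so ctx_size
// only needs to cover the per-tensor bookkeeping overhead; the actual weight
// data is allocated separately into backend buffers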
  2105. // define a comparator for the buft -> ctx map to ensure that the order is well-defined:
  2106. struct ggml_backend_buft_comparator {
  2107. bool operator()(const ggml_backend_buffer_type_t & lhs, const ggml_backend_buffer_type_t & rhs) const {
  2108. return strcmp(ggml_backend_buft_name(lhs), ggml_backend_buft_name(rhs)) < 0;
  2109. }
  2110. };
  2111. std::map<ggml_backend_buffer_type_t, ggml_context_ptr, ggml_backend_buft_comparator> ctx_map;
  2112. auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * {
  2113. auto it = ctx_map.find(buft);
  2114. if (it == ctx_map.end()) {
  2115. ggml_init_params params = {
  2116. /*.mem_size =*/ ctx_size,
  2117. /*.mem_buffer =*/ NULL,
  2118. /*.no_alloc =*/ true,
  2119. };
  2120. ggml_context * ctx = ggml_init(params);
  2121. if (!ctx) {
  2122. throw std::runtime_error(format("failed to create ggml context"));
  2123. }
  2124. ctx_map.emplace(buft, ctx);
  2125. return ctx;
  2126. }
  2127. return it->second.get();
  2128. };
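// contexts are created lazily: the first tensor that resolves to a given
// buffer type creates the context for it, and every later tensor with the
// same buffer type is created in that same context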
  2129. const auto TENSOR_DUPLICATED = llama_model_loader::TENSOR_DUPLICATED;
  2130. const auto TENSOR_NOT_REQUIRED = llama_model_loader::TENSOR_NOT_REQUIRED;
  2131. const auto TENSOR_SKIP = llama_model_loader::TENSOR_SKIP;
  2132. // create tensors for the weights
  2133. {
  2134. // note: cast to int64_t since we will use these for the tensor dimensions
  2135. const int64_t n_head = hparams.n_head();
  2136. const int64_t n_head_kv = hparams.n_head_kv();
  2137. const int64_t n_embd = hparams.n_embd;
  2138. const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
  2139. const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();
  2140. const int64_t n_embd_head_k = hparams.n_embd_head_k;
  2141. const int64_t n_embd_head_v = hparams.n_embd_head_v;
  2142. const int64_t n_ff = hparams.n_ff();
  2143. const int64_t n_embd_gqa = n_embd_v_gqa;
  2144. const int64_t n_vocab = vocab.n_tokens();
  2145. const int64_t n_token_types = vocab.n_token_types();
  2146. const int64_t n_rot = hparams.n_rot;
  2147. const int64_t n_expert = hparams.n_expert;
  2148. const int64_t n_expert_used = hparams.n_expert_used;
  2149. const int64_t n_ctx_train = hparams.n_ctx_train;
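// note: ggml stores weight matrices with ne[0] = input features and
// ne[1] = output features, so e.g. {n_embd, n_vocab} below describes a
// projection from the embedding dimension to the vocabulary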
  2150. if (n_expert > 0 && hparams.n_expert_used == 0) {
2151. throw std::runtime_error("model has expert layers (n_expert > 0) but n_expert_used == 0");
  2152. }
  2153. int n_moved_tensors = 0;
  2154. ggml_tensor * first_moved_tensor = nullptr;
  2155. ggml_backend_buffer_type_t first_moved_from_buft = nullptr;
  2156. ggml_backend_buffer_type_t first_moved_to_buft = nullptr;
  2157. auto create_tensor = [&](const LLM_TN_IMPL & tn, const std::initializer_list<int64_t> & ne, int flags) -> ggml_tensor * {
  2158. ggml_tensor * t_meta = ml.get_tensor_meta(tn.str().c_str());
  2159. if (!t_meta) {
  2160. if (flags & TENSOR_NOT_REQUIRED) {
  2161. return nullptr;
  2162. }
  2163. throw std::runtime_error(format("missing tensor '%s'", tn.str().c_str()));
  2164. }
2165. // some models reuse the token embedding tensor as the output tensor; since the two uses appear in different layers and with different ops,
2166. // the tensor has to be duplicated
2167. // to handle this, when the TENSOR_DUPLICATED flag is set we assume that the token embedding is being loaded as the output tensor
  2168. llm_tensor tn_tensor = tn.tensor;
  2169. if (tn.tensor == LLM_TENSOR_TOKEN_EMBD && flags & TENSOR_DUPLICATED) {
  2170. tn_tensor = LLM_TENSOR_OUTPUT;
  2171. }
  2172. llm_tensor_info info;
  2173. try {
  2174. info = llm_tensor_info_for(tn_tensor);
  2175. } catch (const std::out_of_range & e) {
  2176. throw std::runtime_error(format("missing tensor info mapping for %s", tn.str().c_str()));
  2177. }
  2178. // skip unused tensors
  2179. if (info.op == GGML_OP_NONE || flags & TENSOR_SKIP) {
  2180. const size_t nbytes = ggml_nbytes(t_meta);
  2181. LLAMA_LOG_WARN("model has unused tensor %s (size = %zu bytes) -- ignoring\n", tn.str().c_str(), nbytes);
  2182. ml.size_data -= nbytes;
  2183. ml.n_created++;
  2184. return nullptr;
  2185. }
  2186. // tensors with "bias" suffix are always used with GGML_OP_ADD or GGML_OP_ADD_ID
  2187. ggml_op op;
  2188. bool bias = tn.suffix != nullptr && strcmp(tn.suffix, "bias") == 0;
  2189. if (bias) {
  2190. if (info.op == GGML_OP_MUL_MAT_ID) {
  2191. op = GGML_OP_ADD_ID;
  2192. } else {
  2193. op = GGML_OP_ADD;
  2194. }
  2195. } else {
  2196. op = info.op;
  2197. }
  2198. // sanity checks
  2199. if (info.layer == LLM_TENSOR_LAYER_INPUT || info.layer == LLM_TENSOR_LAYER_OUTPUT) {
  2200. if (tn.bid != -1) {
  2201. GGML_ABORT("input/output layer tensor %s used with a layer number", tn.str().c_str());
  2202. }
  2203. } else {
  2204. if (tn.bid == -1) {
  2205. GGML_ABORT("repeating layer tensor %s used without a layer number", tn.str().c_str());
  2206. }
  2207. }
  2208. // select the buffer type for this tensor
  2209. buft_list_t * buft_list;
  2210. switch (info.layer) {
  2211. case LLM_TENSOR_LAYER_INPUT:
  2212. buft_list = pimpl->dev_input.buft_list;
  2213. break;
  2214. case LLM_TENSOR_LAYER_OUTPUT:
  2215. buft_list = pimpl->dev_output.buft_list;
  2216. break;
  2217. case LLM_TENSOR_LAYER_REPEATING:
  2218. buft_list = pimpl->dev_layer.at(tn.bid).buft_list;
  2219. break;
  2220. default:
  2221. GGML_ABORT("invalid layer %d for tensor %s", info.layer, tn.str().c_str());
  2222. }
  2223. ggml_backend_buffer_type_t buft = nullptr;
  2224. // check overrides
  2225. if (ml.tensor_buft_overrides) {
  2226. std::string tensor_name = tn.str();
  2227. for (const auto * overrides = ml.tensor_buft_overrides; overrides->pattern != nullptr; ++overrides) {
  2228. std::regex pattern(overrides->pattern);
  2229. if (std::regex_search(tensor_name, pattern)) {
  2230. if (overrides->buft == ggml_backend_cpu_buffer_type()) {
  2231. // when overriding to a CPU buffer, consider the extra buffer types
  2232. buft = select_weight_buft(hparams, t_meta, op, pimpl->cpu_buft_list);
  2233. } else {
  2234. buft = overrides->buft;
  2235. }
  2236. LLAMA_LOG_DEBUG("tensor %s (%zu MiB %s) buffer type overridden to %s\n",
  2237. tensor_name.c_str(),
  2238. ggml_nbytes(t_meta) / 1024 / 1024, ggml_type_name(t_meta->type),
  2239. ggml_backend_buft_name(buft));
  2240. break;
  2241. }
  2242. }
  2243. }
  2244. if (!buft) {
  2245. buft = select_weight_buft(hparams, t_meta, op, *buft_list);
  2246. if (!buft) {
  2247. throw std::runtime_error(format("failed to find a compatible buffer type for tensor %s", tn.str().c_str()));
  2248. }
  2249. }
  2250. // avoid using a host buffer when using mmap
  2251. auto * buft_dev = ggml_backend_buft_get_device(buft);
  2252. if (ml.use_mmap && buft_dev && buft == ggml_backend_dev_host_buffer_type(buft_dev)) {
  2253. auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
  2254. if (!cpu_dev) {
  2255. throw std::runtime_error("no CPU backend found");
  2256. }
  2257. buft = ggml_backend_dev_buffer_type(cpu_dev);
  2258. }
  2259. if (buft != buft_list->front().second) {
  2260. n_moved_tensors++;
  2261. if (!first_moved_tensor) {
  2262. first_moved_tensor = t_meta;
  2263. first_moved_from_buft = buft_list->front().second;
  2264. first_moved_to_buft = buft;
  2265. }
  2266. }
  2267. ggml_context * ctx = ctx_for_buft(buft);
  2268. // if duplicated, check if the original tensor was allocated in the same buffer type context and avoid creating a new one
  2269. if (flags & TENSOR_DUPLICATED) {
  2270. ggml_tensor * t = ggml_get_tensor(ctx, tn.str().c_str());
  2271. if (t) {
  2272. return t;
  2273. }
  2274. }
  2275. return ml.create_tensor(ctx, tn, ne, flags);
  2276. };
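// the override patterns matched above are plain std::regex searched against the
// tensor name; for example, a hypothetical override list mapping
//   "ffn_.*_exps"  ->  CPU buffer type
// would keep all MoE expert weights in host memory while every other tensor
// follows the normal buffer type selection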
  2277. layers.resize(n_layer);
  2278. // TODO: move to a separate function
  2279. const auto tn = LLM_TN(arch);
  2280. switch (arch) {
  2281. case LLM_ARCH_LLAMA:
  2282. case LLM_ARCH_REFACT:
  2283. case LLM_ARCH_MINICPM:
  2284. case LLM_ARCH_GRANITE:
  2285. case LLM_ARCH_GRANITE_MOE:
  2286. {
  2287. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2288. // output
  2289. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2290. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  2291. // if output is NULL, init from the input tok embed
  2292. if (output == NULL) {
  2293. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  2294. }
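// models with tied embeddings ship no separate output matrix; the duplicated
// token embedding created above is what the final hidden state is multiplied
// with to produce the logits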
  2295. for (int i = 0; i < n_layer; ++i) {
  2296. auto & layer = layers[i];
  2297. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2298. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
  2299. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  2300. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
  2301. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
  2302. // optional bias tensors
  2303. layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2304. layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
  2305. layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
  2306. layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2307. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  2308. if (hparams.rope_scaling_type_train == LLAMA_ROPE_SCALING_TYPE_LONGROPE) {
  2309. layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
  2310. layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
  2311. }
  2312. else {
  2313. layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
  2314. }
  2315. if (n_expert == 0) {
  2316. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  2317. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  2318. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  2319. // optional MLP bias
  2320. layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
  2321. layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2322. layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
  2323. } else {
  2324. layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
  2325. layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, TENSOR_NOT_REQUIRED);
  2326. layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert}, 0);
  2327. layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
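// expert weights are stored as 3D tensors with the expert index in the last
// dimension (e.g. {n_embd, n_ff, n_expert}), which lets a single mul_mat_id
// route each token to its selected experts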
  2328. // For Granite MoE Shared
  2329. if (hparams.n_ff_shexp > 0) {
  2330. layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, 0);
  2331. layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, 0);
  2332. layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {hparams.n_ff_shexp, n_embd}, 0);
  2333. }
  2334. }
  2335. }
  2336. } break;
  2337. case LLM_ARCH_LLADA:
  2338. {
  2339. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
  2340. // output
  2341. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
  2342. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED);
  2343. // if output is NULL, init from the input tok embed
  2344. if (output == NULL) {
  2345. output =
  2346. create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, TENSOR_DUPLICATED);
  2347. }
  2348. for (int i = 0; i < n_layer; ++i) {
  2349. auto & layer = layers[i];
  2350. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);
  2351. // Use separate Q, K, V projections without bias, matching LLaDALlamaBlock
  2352. layer.wq =
  2353. create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd_head_k * n_head }, 0);
  2354. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_k_gqa }, 0);
  2355. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_v_gqa }, 0);
  2356. // No bias for QKV projections as per config: include_bias=false, include_qkv_bias=false
  2357. layer.wo =
  2358. create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, 0);
  2359. layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), { n_embd }, TENSOR_NOT_REQUIRED);
  2360. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd }, 0);
  2361. layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), { n_rot / 2 },
  2362. TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
  2363. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), { n_embd, n_ff }, 0);
  2364. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd }, 0);
  2365. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, n_ff }, 0);
  2366. // optional MLP bias
  2367. layer.ffn_gate_b =
  2368. create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), { n_ff }, TENSOR_NOT_REQUIRED);
  2369. layer.ffn_down_b =
  2370. create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), { n_embd }, TENSOR_NOT_REQUIRED);
  2371. layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), { n_ff }, TENSOR_NOT_REQUIRED);
  2372. }
  2373. }
  2374. break;
  2375. case LLM_ARCH_LLADA_MOE:
  2376. {
  2377. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2378. // output
  2379. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2380. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  2381. GGML_ASSERT(n_expert > 0 && "n_expert must be > 0 for llada-moe");
  2382. GGML_ASSERT(n_expert_used > 0 && "n_expert_used must be > 0 for llada-moe");
  2383. for (int i = 0; i < n_layer; ++i) {
  2384. auto & layer = layers[i];
  2385. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2386. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  2387. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  2388. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  2389. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  2390. layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
  2391. layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
  2392. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  2393. layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
  2394. const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
  2395. layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
  2396. layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
  2397. layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
  2398. }
  2399. } break;
  2400. case LLM_ARCH_LLAMA4:
  2401. {
  2402. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2403. // output
  2404. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2405. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  2406. // if output is NULL, init from the input tok embed
  2407. if (output == NULL) {
  2408. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  2409. }
  2410. for (int i = 0; i < n_layer; ++i) {
  2411. bool is_moe_layer = hparams.n_moe_layer_step > 0 && (i + 1) % hparams.n_moe_layer_step == 0;
  2412. auto & layer = layers[i];
  2413. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2414. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
  2415. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  2416. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
  2417. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
  2418. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  2419. layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
  2420. if (is_moe_layer) {
  2421. int n_ff_exp = hparams.n_ff_exp;
  2422. layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
  2423. layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, 0);
  2424. layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff_exp, n_embd, n_expert}, 0);
  2425. layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, 0);
  2426. // Shared expert
  2427. const int64_t n_ff_shexp = n_ff_exp;
  2428. layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), { n_embd, n_ff_shexp}, 0);
  2429. layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp, n_embd }, 0);
  2430. layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), { n_embd, n_ff_shexp}, 0);
  2431. } else {
  2432. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  2433. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  2434. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  2435. }
  2436. }
  2437. } break;
  2438. case LLM_ARCH_DECI:
  2439. {
  2440. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2441. // output
  2442. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2443. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  2444. // if output is NULL, init from the input tok embed
  2445. if (output == NULL) {
  2446. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  2447. }
  2448. for (int i = 0; i < n_layer; ++i) {
  2449. auto & layer = layers[i];
  2450. const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(i);
  2451. const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(i);
  2452. const int64_t n_embd_gqa = hparams.n_embd_v_gqa(i);
  2453. const int64_t n_ff = hparams.n_ff(i);
  2454. const int64_t n_head = hparams.n_head(i);
  2455. const int64_t n_head_kv = hparams.n_head_kv(i);
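// DeciLM variants can change the block structure per layer: some layers replace
// full attention with a single linear projection (n_head_kv == 0) and some have
// no FFN at all (n_ff == 0), so the per-layer values are looked up here instead
// of relying on the global defaults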
  2456. if (n_head_kv == 0 && n_head > 0) {
  2457. // linear attention for DeciLMCausalModel
  2458. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2459. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  2460. }
  2461. else if (n_head_kv > 0) {
  2462. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2463. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
  2464. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  2465. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
  2466. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
  2467. }
  2468. // optional bias tensors
  2469. layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2470. layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
  2471. layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
  2472. layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2473. if (n_ff > 0) {
  2474. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  2475. }
  2476. if (hparams.rope_scaling_type_train == LLAMA_ROPE_SCALING_TYPE_LONGROPE) {
  2477. layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
  2478. layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
  2479. }
  2480. else {
  2481. layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
  2482. }
  2483. if (n_ff > 0) {
  2484. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  2485. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  2486. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  2487. }
  2488. // optional MLP bias
  2489. layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
  2490. layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2491. layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
  2492. }
  2493. } break;
  2494. case LLM_ARCH_MINICPM3:
  2495. {
  2496. const int64_t n_embd_head_qk_rope = hparams.n_rot;
  2497. const int64_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
  2498. const int64_t q_lora_rank = hparams.n_lora_q;
  2499. const int64_t kv_lora_rank = hparams.n_lora_kv;
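// MLA-style attention: Q and KV are first projected down to q_lora_rank /
// kv_lora_rank and then expanded back up (hence the wq_a/wq_b and
// wkv_a_mqa/wkv_b pairs below); the rope portion of each head
// (n_embd_head_qk_rope) is carried alongside the compressed KV instead of
// going through the low-rank expansion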
  2500. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2501. // output
  2502. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2503. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  2504. // if output is NULL, init from the input tok embed
  2505. if (output == NULL) {
  2506. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  2507. }
  2508. for (int i = 0; i < n_layer; ++i) {
  2509. auto & layer = layers[i];
  2510. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2511. layer.attn_q_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_A_NORM, "weight", i), {q_lora_rank}, 0);
  2512. layer.attn_kv_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_NORM, "weight", i), {kv_lora_rank}, 0);
  2513. layer.wq_a = create_tensor(tn(LLM_TENSOR_ATTN_Q_A, "weight", i), {n_embd, q_lora_rank}, 0);
  2514. layer.wq_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_B, "weight", i), {q_lora_rank, n_head * n_embd_head_k}, 0);
  2515. layer.wkv_a_mqa = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + (n_embd_head_qk_rope)}, 0);
  2516. layer.wkv_b = create_tensor(tn(LLM_TENSOR_ATTN_KV_B, "weight", i), {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)}, 0);
  2517. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_head * ( n_embd_head_v), n_embd}, 0);
  2518. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  2519. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  2520. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  2521. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  2522. layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), { n_embd_head_qk_rope/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
  2523. layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), { n_embd_head_qk_rope/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
  2524. }
  2525. } break;
  2526. case LLM_ARCH_GROK:
  2527. {
  2528. if (n_expert == 0) {
  2529. throw std::runtime_error("Grok model cannot have zero experts");
  2530. }
  2531. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2532. // output
  2533. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2534. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  2535. // if output is NULL, init from the input tok embed
  2536. if (output == NULL) {
  2537. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  2538. }
2539. const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff; // for grok-1, n_ff_exp == n_ff
  2540. for (int i = 0; i < n_layer; ++i) {
  2541. auto & layer = layers[i];
  2542. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2543. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  2544. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  2545. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  2546. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  2547. layer.attn_out_norm = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}, 0);
  2548. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  2549. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED);
  2550. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, TENSOR_NOT_REQUIRED);
  2551. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED);
  2552. layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
  2553. layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, TENSOR_NOT_REQUIRED);
  2554. layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
  2555. layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, 0);
  2556. layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2557. if (!layer.ffn_post_norm) {
  2558. layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
  2559. }
  2560. }
  2561. } break;
  2562. case LLM_ARCH_DBRX:
  2563. {
  2564. if (n_expert == 0) {
  2565. throw std::runtime_error("DBRX model cannot have zero experts");
  2566. }
  2567. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2568. // output
  2569. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2570. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  2571. for (int i = 0; i < n_layer; ++i) {
  2572. auto & layer = layers[i];
  2573. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2574. layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
  2575. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  2576. layer.attn_out_norm = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}, 0);
  2577. layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
  2578. layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
  2579. layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff, n_embd, n_expert}, 0);
  2580. layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
  2581. }
  2582. } break;
  2583. case LLM_ARCH_BAICHUAN:
  2584. {
  2585. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2586. {
  2587. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2588. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  2589. }
  2590. for (int i = 0; i < n_layer; ++i) {
  2591. auto & layer = layers[i];
  2592. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2593. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  2594. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  2595. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  2596. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  2597. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  2598. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  2599. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  2600. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  2601. }
  2602. } break;
  2603. case LLM_ARCH_FALCON:
  2604. {
  2605. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2606. // output
  2607. {
  2608. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2609. output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
  2610. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  2611. if (!output) {
  2612. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); // needs to be on GPU
  2613. }
  2614. }
  2615. for (int i = 0; i < n_layer; ++i) {
  2616. auto & layer = layers[i];
  2617. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2618. layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
  2619. layer.attn_norm_2 = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2620. layer.attn_norm_2_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2621. layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
  2622. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  2623. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  2624. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  2625. }
  2626. } break;
  2627. case LLM_ARCH_STARCODER:
  2628. {
  2629. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2630. pos_embd = create_tensor(tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, n_ctx_train}, 0);
  2631. // output
  2632. {
  2633. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2634. output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
  2635. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  2636. if (!output) {
  2637. // needs to be on GPU
  2638. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  2639. }
  2640. }
  2641. for (int i = 0; i < n_layer; ++i) {
  2642. auto & layer = layers[i];
  2643. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2644. layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
  2645. layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
  2646. layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
  2647. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  2648. layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
  2649. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  2650. layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
  2651. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
  2652. layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
  2653. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  2654. layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
  2655. }
  2656. } break;
  2657. case LLM_ARCH_BERT:
  2658. case LLM_ARCH_NOMIC_BERT:
  2659. case LLM_ARCH_NOMIC_BERT_MOE:
  2660. case LLM_ARCH_JINA_BERT_V3:
  2661. {
  2662. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2663. type_embd = create_tensor(tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_token_types}, TENSOR_NOT_REQUIRED);
  2664. if (arch == LLM_ARCH_BERT) {
  2665. pos_embd = create_tensor(tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, n_ctx_train}, 0);
  2666. cls = create_tensor(tn(LLM_TENSOR_CLS, "weight"), {n_embd, n_embd}, TENSOR_NOT_REQUIRED);
  2667. cls_b = create_tensor(tn(LLM_TENSOR_CLS, "bias"), {n_embd}, TENSOR_NOT_REQUIRED);
  2668. cls_out = create_tensor(tn(LLM_TENSOR_CLS_OUT, "weight"), {n_embd, hparams.n_cls_out}, TENSOR_NOT_REQUIRED);
  2669. cls_out_b = create_tensor(tn(LLM_TENSOR_CLS_OUT, "bias"), {hparams.n_cls_out}, TENSOR_NOT_REQUIRED);
  2670. }
  2671. tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0);
  2672. tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}, 0);
  2673. for (int i = 0; i < n_layer; ++i) {
  2674. auto & layer = layers[i];
  2675. layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
  2676. layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
  2677. if (!layer.wqkv) {
  2678. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  2679. layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, 0);
  2680. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  2681. layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, 0);
  2682. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  2683. layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, 0);
  2684. }
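// some BERT checkpoints store a fused QKV matrix while others keep separate
// Q/K/V projections; both layouts are accepted, and the split tensors are
// only required when the fused tensor is absent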
  2685. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  2686. layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2687. layer.attn_out_norm = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}, 0);
  2688. layer.attn_out_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "bias", i), {n_embd}, 0);
  2689. if (hparams.moe_every_n_layers > 0 && i % hparams.moe_every_n_layers == 1) {
  2690. layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff, n_expert}, 0);
  2691. layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert}, 0);
  2692. layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
  2693. } else {
  2694. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  2695. layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
  2696. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
  2697. layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2698. if (arch == LLM_ARCH_NOMIC_BERT) {
  2699. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  2700. }
  2701. }
  2702. layer.layer_out_norm = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd}, 0);
  2703. layer.layer_out_norm_b = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "bias", i), {n_embd}, 0);
  2704. }
  2705. } break;
  2706. case LLM_ARCH_NEO_BERT:
  2707. {
  2708. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2709. cls = create_tensor(tn(LLM_TENSOR_CLS, "weight"), {n_embd, n_embd}, TENSOR_NOT_REQUIRED);
  2710. cls_b = create_tensor(tn(LLM_TENSOR_CLS, "bias"), {n_embd}, TENSOR_NOT_REQUIRED);
  2711. cls_out = create_tensor(tn(LLM_TENSOR_CLS_OUT, "weight"), {n_embd, hparams.n_cls_out}, TENSOR_NOT_REQUIRED);
  2712. cls_out_b = create_tensor(tn(LLM_TENSOR_CLS_OUT, "bias"), {hparams.n_cls_out}, TENSOR_NOT_REQUIRED);
  2713. output_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2714. for (int i = 0; i < n_layer; ++i) {
  2715. auto & layer = layers[i];
  2716. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2717. layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
  2718. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  2719. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  2720. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff*2}, 0);
  2721. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
  2722. }
  2723. } break;
  2724. case LLM_ARCH_JINA_BERT_V2:
  2725. {
  2726. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); // word_embeddings
  2727. type_embd = create_tensor(tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_token_types}, 0); // token_type_embeddings
  2728. tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0); // LayerNorm
2729. tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}, 0); // LayerNorm bias
  2730. cls = create_tensor(tn(LLM_TENSOR_CLS, "weight"), {n_embd, 1}, TENSOR_NOT_REQUIRED);
  2731. cls_b = create_tensor(tn(LLM_TENSOR_CLS, "bias"), {1}, TENSOR_NOT_REQUIRED);
  2732. for (int i = 0; i < n_layer; ++i) {
  2733. auto & layer = layers[i]; // JinaBertLayer
  2734. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  2735. layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, 0);
  2736. layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2737. layer.attn_q_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2738. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  2739. layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, 0);
  2740. layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2741. layer.attn_k_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2742. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  2743. layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, 0);
2744. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0); // output_dense
2745. layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0); // output_dense
2746. layer.attn_out_norm = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}, 0); // output_norm
  2747. layer.attn_out_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "bias", i), {n_embd}, 0);
  2748. layer.attn_norm_2 = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2749. layer.attn_norm_2_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2750. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED);
  2751. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, layer.ffn_gate ? n_ff : n_ff * 2}, 0);
  2752. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
  2753. layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
  2754. layer.layer_out_norm = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd}, 0);
  2755. layer.layer_out_norm_b = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "bias", i), {n_embd}, 0);
  2756. }
  2757. } break;
  2758. case LLM_ARCH_BLOOM:
  2759. {
  2760. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2761. tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0);
  2762. tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}, 0);
  2763. // output
  2764. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2765. output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
  2766. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  2767. // if output is NULL, init from the input tok embed
  2768. if (output == NULL) {
  2769. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  2770. }
  2771. for (int i = 0; i < n_layer; ++i) {
  2772. auto & layer = layers[i];
  2773. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2774. layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
  2775. layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
  2776. layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
  2777. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  2778. layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
  2779. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  2780. layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
  2781. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
  2782. layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
  2783. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  2784. layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
  2785. }
  2786. } break;
  2787. case LLM_ARCH_MPT:
  2788. {
  2789. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2790. pos_embd = create_tensor(tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, n_ctx_train}, TENSOR_NOT_REQUIRED);
  2791. // output
  2792. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2793. output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, TENSOR_NOT_REQUIRED);
  2794. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  2795. if (!output) {
  2796. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); // needs to be on GPU
  2797. }
  2798. for (int i = 0; i < n_layer; ++i) {
  2799. auto & layer = layers[i];
  2800. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2801. layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2802. layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
  2803. layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
  2804. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  2805. layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2806. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  2807. layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2808. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
  2809. layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2810. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  2811. layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
  2812. layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2813. layer.attn_q_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2814. layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2815. layer.attn_k_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2816. // AWQ ScaleActivation layer
  2817. layer.ffn_act = create_tensor(tn(LLM_TENSOR_FFN_ACT, "scales", i), {n_ff}, TENSOR_NOT_REQUIRED);
  2818. }
  2819. } break;
  2820. case LLM_ARCH_STABLELM:
  2821. {
  2822. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2823. // output
  2824. output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
  2825. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2826. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  2827. for (int i = 0; i < n_layer; ++i) {
  2828. auto & layer = layers[i];
  2829. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2830. layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
  2831. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  2832. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  2833. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  2834. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  2835. // optional bias tensors, present in Stable LM 2 1.6B
  2836. layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2837. layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
  2838. layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
  2839. // optional q and k layernorms, present in StableLM 2 12B
  2840. layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k, n_head}, TENSOR_NOT_REQUIRED);
  2841. layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k, n_head_kv}, TENSOR_NOT_REQUIRED);
  2842. // optional FFN norm, not present in StableLM 2 12B which uses parallel residual
  2843. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2844. layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2845. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  2846. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  2847. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  2848. }
  2849. } break;
  2850. case LLM_ARCH_QWEN:
  2851. {
  2852. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2853. // output
  2854. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2855. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  2856. for (int i = 0; i < n_layer; ++i) {
  2857. auto & layer = layers[i];
  2858. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2859. layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd*3}, 0);
  2860. layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd*3}, 0);
  2861. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  2862. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  2863. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff/2}, 0);
  2864. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff/2, n_embd}, 0);
  2865. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff/2}, 0);
  2866. }
  2867. } break;
  2868. case LLM_ARCH_QWEN2:
  2869. case LLM_ARCH_QWEN2VL:
  2870. case LLM_ARCH_DREAM:
  2871. {
  2872. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2873. // output
  2874. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2875. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  2876. output_b = create_tensor(tn(LLM_TENSOR_OUTPUT, "bias"), {n_vocab}, TENSOR_NOT_REQUIRED);
  2877. // if output is NULL, init from the input tok embed
  2878. if (output == NULL) {
  2879. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  2880. }
  2881. for (int i = 0; i < n_layer; ++i) {
  2882. auto & layer = layers[i];
  2883. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2884. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  2885. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  2886. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  2887. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  2888. // optional bias tensors
  2889. layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, 0);
  2890. layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, 0);
  2891. layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, 0);
  2892. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  2893. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  2894. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  2895. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  2896. }
  2897. } break;
  2898. case LLM_ARCH_QWEN2MOE:
  2899. {
  2900. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2901. // output
  2902. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2903. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  2904. for (int i = 0; i < n_layer; ++i) {
  2905. auto & layer = layers[i];
  2906. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2907. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  2908. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  2909. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  2910. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  2911. // optional bias tensors
  2912. layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2913. layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
  2914. layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
  2915. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  2916. layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
  2917. if (n_expert == 0) {
  2918. throw std::runtime_error("n_expert must be > 0 for QWEN2MOE");
  2919. }
  2920. if (n_expert_used == 0) {
  2921. throw std::runtime_error("n_expert_used must be > 0 for QWEN2MOE");
  2922. }
  2923. // MoE branch
  2924. const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
  2925. layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
  2926. layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
  2927. layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
  2928. // Shared expert branch
  2929. const int64_t n_ff_shexp = hparams.n_ff_shexp ? hparams.n_ff_shexp : n_ff;
  2930. layer.ffn_gate_inp_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP_SHEXP, "weight", i), {n_embd}, 0);
  2931. layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), { n_embd, n_ff_shexp}, 0);
  2932. layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp, n_embd}, 0);
  2933. layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), { n_embd, n_ff_shexp}, 0);
  2934. }
  2935. } break;
  2936. case LLM_ARCH_QWEN3:
  2937. case LLM_ARCH_QWEN3VL:
  2938. {
  2939. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2940. // output
  2941. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2942. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  2943. // if output is NULL, init from the input tok embed
  2944. if (output == NULL) {
  2945. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  2946. }
  2947. // output rerank head
  2948. cls_out = create_tensor(tn(LLM_TENSOR_CLS_OUT, "weight"), {n_embd, hparams.n_cls_out}, TENSOR_NOT_REQUIRED);
  2949. for (int i = 0; i < n_layer; ++i) {
  2950. auto & layer = layers[i];
  2951. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2952. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
  2953. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  2954. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  2955. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
  2956. layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
  2957. layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
  2958. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  2959. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  2960. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  2961. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  2962. }
  2963. } break;
  2964. case LLM_ARCH_QWEN3MOE:
  2965. case LLM_ARCH_QWEN3VLMOE:
  2966. case LLM_ARCH_RND1:
  2967. {
  2968. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2969. // output
  2970. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2971. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  2972. // if output is NULL, init from the input tok embed
  2973. if (output == NULL) {
  2974. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  2975. }
  2976. for (int i = 0; i < n_layer; ++i) {
  2977. auto & layer = layers[i];
  2978. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2979. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
  2980. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  2981. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  2982. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
  2983. layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
  2984. layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
  2985. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  2986. layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
  2987. if (n_expert == 0) {
  2988. throw std::runtime_error("n_expert must be > 0 for QWEN3MOE");
  2989. }
  2990. if (n_expert_used == 0) {
  2991. throw std::runtime_error("n_expert_used must be > 0 for QWEN3MOE");
  2992. }
  2993. // MoE branch
  2994. const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
  2995. layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
  2996. layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
  2997. layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
  2998. }
  2999. } break;
  3000. case LLM_ARCH_PHI2:
  3001. {
  3002. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  3003. // output
  3004. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  3005. output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
  3006. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  3007. output_b = create_tensor(tn(LLM_TENSOR_OUTPUT, "bias"), {n_vocab}, 0);
  3008. for (int i = 0; i < n_layer; ++i) {
  3009. auto & layer = layers[i];
  3010. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  3011. layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
  3012. layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
  3013. layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
  3014. if (layer.wqkv == nullptr) {
  3015. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  3016. layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, 0);
  3017. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  3018. layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, 0);
  3019. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  3020. layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, 0);
  3021. }
  3022. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  3023. layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
  3024. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
  3025. layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
  3026. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  3027. layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
  3028. }
  3029. } break;
  3030. case LLM_ARCH_PHI3:
  3031. {
  3032. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
  3033. // output
  3034. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
  3035. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  3036. // if output is NULL, init from the input tok embed
  3037. if (output == NULL) {
  3038. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  3039. }
  3040. for (int i = 0; i < n_layer; ++i) {
  3041. auto & layer = layers[i];
  3042. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);
  3043. layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), { n_embd, n_embd + 2 * n_embd_gqa }, TENSOR_NOT_REQUIRED);
  3044. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd, n_embd }, 0);
  3045. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd }, 0);
  3046. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd }, 0);
  3047. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, 2 * n_ff }, 0);
  3048. layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), { n_rot/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
  3049. layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), { n_rot/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
  3050. }
  3051. } break;
  3052. case LLM_ARCH_PHIMOE:
  3053. {
  3054. const int64_t n_embd_head = n_embd / n_head;
  3055. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
  3056. // output
  3057. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
  3058. output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
  3059. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab }, 0);
  3060. output_b = create_tensor(tn(LLM_TENSOR_OUTPUT, "bias"), { n_vocab }, 0);
  3061. for (int i = 0; i < n_layer; ++i) {
  3062. auto & layer = layers[i];
  3063. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);
  3064. layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), { n_embd }, 0);
  3065. layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), { n_embd, n_embd + 2 * n_embd_gqa }, TENSOR_NOT_REQUIRED);
  3066. if (layer.wqkv == nullptr) {
  3067. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  3068. layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, 0);
  3069. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  3070. layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, 0);
  3071. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  3072. layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, 0);
  3073. }
  3074. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd, n_embd }, 0);
  3075. layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), { n_embd }, 0);
  3076. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd }, 0);
  3077. layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), { n_embd }, 0);
  3078. layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
  3079. layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
  3080. layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff, n_embd, n_expert}, 0);
  3081. layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
  3082. layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), { n_embd_head/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
  3083. layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), { n_embd_head/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
  3084. }
  3085. } break;
  3086. case LLM_ARCH_PLAMO:
  3087. {
  3088. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  3089. // output
  3090. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  3091. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  3092. for (int i = 0; i < n_layer; ++i) {
  3093. auto & layer = layers[i];
  3094. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  3095. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  3096. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  3097. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  3098. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  3099. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  3100. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  3101. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  3102. }
  3103. } break;
  3104. case LLM_ARCH_PLAMO2:
  3105. {
  3106. // mamba parameters
  3107. const uint32_t d_conv = hparams.ssm_d_conv;
  3108. const uint32_t d_state = hparams.ssm_d_state;
  3109. const uint32_t num_heads = hparams.ssm_dt_rank;
  3110. const uint32_t intermediate_size = hparams.ssm_d_inner;
  3111. const int64_t dt_dim = std::max(64, int(hparams.n_embd / 16));
  3112. // attention parameters
  3113. const uint32_t qk_dim = hparams.n_embd_head_k;
  3114. const uint32_t v_dim = hparams.n_embd_head_v;
  3115. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  3116. // output
  3117. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  3118. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  3119. // if output is NULL, init from the input tok embed
  3120. if (output == NULL) {
  3121. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  3122. }
  3123. for (int i = 0; i < n_layer; ++i) {
  3124. auto & layer = layers[i];
  3125. bool is_mamba_layer = hparams.is_recurrent(i);
  3126. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  3127. if (is_mamba_layer) {
  3128. layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), {n_embd, 2 * intermediate_size}, 0);
  3129. layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {d_conv, intermediate_size}, 0);
  3130. layer.ssm_x = create_tensor(tn(LLM_TENSOR_SSM_X, "weight", i), {intermediate_size, dt_dim + 2*d_state}, 0);
  3131. layer.ssm_dt = create_tensor(tn(LLM_TENSOR_SSM_DT, "weight", i), {dt_dim, num_heads}, 0);
  3132. layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {num_heads}, 0);
  3133. layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {num_heads}, 0);
  3134. layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {num_heads}, 0);
  3135. layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {intermediate_size, n_embd}, 0);
  3136. layer.ssm_dt_norm = create_tensor(tn(LLM_TENSOR_SSM_DT_NORM, i), {dt_dim}, 0);
  3137. layer.ssm_b_norm = create_tensor(tn(LLM_TENSOR_SSM_B_NORM, i), {d_state}, 0);
  3138. layer.ssm_c_norm = create_tensor(tn(LLM_TENSOR_SSM_C_NORM, i), {d_state}, 0);
  3139. } else {
  3140. const int64_t num_attention_heads = hparams.n_head(i);
  3141. const int64_t q_num_heads = num_attention_heads;
  3142. const int64_t num_key_value_heads = hparams.n_head_kv(i);
  3143. const int64_t k_num_heads = num_key_value_heads;
  3144. const int64_t v_num_heads = num_key_value_heads;
  3145. const int64_t q_proj_dim = q_num_heads * qk_dim;
  3146. const int64_t k_proj_dim = k_num_heads * qk_dim;
  3147. const int64_t v_proj_dim = v_num_heads * v_dim;
  3148. layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, q_proj_dim + k_proj_dim + v_proj_dim}, 0);
  3149. layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {qk_dim, num_attention_heads}, 0);
  3150. layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {qk_dim, k_num_heads}, 0);
  3151. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {q_num_heads * v_dim, n_embd}, 0);
  3152. }
  3153. // All layers have post-attention norm, FFN norm, and FFN tensors
  3154. layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, i), {n_embd}, 0);
  3155. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  3156. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
  3157. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff * 2}, 0);
  3158. layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, i), {n_embd}, 0);
  3159. }
  3160. } break;
  3161. case LLM_ARCH_GPT2:
  3162. {
  3163. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  3164. pos_embd = create_tensor(tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, n_ctx_train}, 0);
  3165. // output
  3166. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  3167. output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
  3168. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  3169. // if output is NULL, init from the input tok embed
  3170. if (output == NULL) {
  3171. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  3172. }
  3173. for (int i = 0; i < n_layer; ++i) {
  3174. auto & layer = layers[i];
  3175. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  3176. layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
  3177. layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
  3178. layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
  3179. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  3180. layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
  3181. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  3182. layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
  3183. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
  3184. layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
  3185. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  3186. layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
  3187. }
  3188. } break;
  3189. case LLM_ARCH_CODESHELL:
  3190. {
  3191. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  3192. // if tok embd is NULL, init from output
  3193. if (tok_embd == NULL) {
  3194. tok_embd = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  3195. }
  3196. // output
  3197. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  3198. output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
  3199. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  3200. for (int i = 0; i < n_layer; ++i) {
  3201. auto & layer = layers[i];
  3202. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  3203. layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
  3204. layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
  3205. layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
  3206. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  3207. layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
  3208. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  3209. layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
  3210. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
  3211. layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
  3212. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  3213. layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
  3214. }
  3215. } break;
  3216. case LLM_ARCH_ORION:
  3217. {
  3218. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  3219. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  3220. output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
  3221. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  3222. for (int i = 0; i < n_layer; ++i) {
  3223. auto & layer = layers[i];
  3224. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  3225. layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
  3226. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  3227. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  3228. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  3229. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  3230. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  3231. layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
  3232. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  3233. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  3234. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  3235. }
  3236. } break;
  3237. case LLM_ARCH_INTERNLM2:
  3238. {
  3239. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  3240. // output
  3241. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  3242. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  3243. for (int i = 0; i < n_layer; ++i) {
  3244. auto & layer = layers[i];
  3245. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  3246. // layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
  3247. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  3248. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  3249. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  3250. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  3251. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  3252. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  3253. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  3254. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  3255. }
  3256. } break;
  3257. case LLM_ARCH_GEMMA:
  3258. {
  3259. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  3260. // output
  3261. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  3262. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); // same as tok_embd, duplicated to allow offloading
  3263. for (int i = 0; i < n_layer; ++i) {
  3264. auto & layer = layers[i];
  3265. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  3266. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
  3267. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  3268. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
  3269. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
  3270. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  3271. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  3272. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  3273. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  3274. }
  3275. } break;
  3276. case LLM_ARCH_GEMMA2:
  3277. {
  3278. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  3279. // output
  3280. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  3281. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); // same as tok_embd, duplicated to allow offloading
  3282. for (int i = 0; i < n_layer; ++i) {
  3283. auto & layer = layers[i];
  3284. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  3285. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
  3286. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  3287. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
  3288. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
  3289. layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
  3290. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  3291. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  3292. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  3293. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  3294. layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
  3295. }
  3296. } break;
  3297. case LLM_ARCH_GEMMA3:
  3298. case LLM_ARCH_GEMMA_EMBEDDING:
  3299. {
  3300. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  3301. // output
  3302. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  3303. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  3304. // if output is NULL, init from the input tok embed
  3305. if (output == NULL) {
  3306. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  3307. }
  3308. // Dense linear weights
  3309. dense_2_out_layers = create_tensor(tn(LLM_TENSOR_DENSE_2_OUT, "weight"), {n_embd, hparams.dense_2_feat_out}, TENSOR_NOT_REQUIRED);
  3310. dense_3_out_layers = create_tensor(tn(LLM_TENSOR_DENSE_3_OUT, "weight"), {hparams.dense_3_feat_in, n_embd}, TENSOR_NOT_REQUIRED);
  3311. for (int i = 0; i < n_layer; ++i) {
  3312. auto & layer = layers[i];
  3313. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  3314. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
  3315. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  3316. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
  3317. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
  3318. layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
  3319. layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
  3320. layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
  3321. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  3322. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  3323. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  3324. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  3325. layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
  3326. }
  3327. } break;
  3328. case LLM_ARCH_GEMMA3N:
  3329. {
  3330. const int64_t n_altup = hparams.n_altup;
  3331. const int64_t laurel_rank = hparams.laurel_rank;
  3332. const int64_t n_embd_altup = hparams.n_embd_altup;
  3333. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  3334. // if output is NULL, init from the input tok embed
  3335. if (output == NULL) {
  3336. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  3337. }
  3338. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  3339. tok_embd_per_layer = create_tensor(tn(LLM_TENSOR_PER_LAYER_TOKEN_EMBD, "weight"), {n_embd_altup * n_layer, n_vocab}, 0);
  3340. altup_proj = create_tensor(tn(LLM_TENSOR_ALTUP_PROJ, "weight"), {n_embd, n_embd, n_altup - 1}, 0);
  3341. altup_unembd_proj = create_tensor(tn(LLM_TENSOR_ALTUP_UNEMBD_PROJ, "weight"), {n_embd, n_embd, n_altup - 1}, 0);
  3342. per_layer_model_proj = create_tensor(tn(LLM_TENSOR_PER_LAYER_MODEL_PROJ, "weight"), {n_embd, n_embd_altup * n_layer}, 0);
  3343. per_layer_proj_norm = create_tensor(tn(LLM_TENSOR_PER_LAYER_PROJ_NORM, "weight"), {n_embd_altup}, 0);
  3344. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  3345. for (int i = 0; i < n_layer; ++i) {
  3346. auto & layer = layers[i];
  3347. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  3348. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
  3349. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  3350. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
  3351. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
  3352. layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
  3353. layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
  3354. layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
  3355. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  3356. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  3357. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  3358. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  3359. layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
  3360. // altup & laurel
  3361. layer.per_layer_inp_gate = create_tensor(tn(LLM_TENSOR_PER_LAYER_INP_GATE, "weight", i), {n_embd, n_embd_altup}, 0);
  3362. layer.per_layer_proj = create_tensor(tn(LLM_TENSOR_PER_LAYER_PROJ, "weight", i), {n_embd_altup, n_embd}, 0);
  3363. layer.per_layer_post_norm = create_tensor(tn(LLM_TENSOR_PER_LAYER_POST_NORM, "weight", i), {n_embd}, 0);
  3364. layer.altup_correct_coef = create_tensor(tn(LLM_TENSOR_ALTUP_CORRECT_COEF, "weight", i), {n_altup, n_altup}, 0);
  3365. layer.altup_correct_scale = create_tensor(tn(LLM_TENSOR_ALTUP_CORRECT_SCALE, "weight", i), {n_embd}, 0);
  3366. layer.altup_predict_coef = create_tensor(tn(LLM_TENSOR_ALTUP_PREDICT_COEF, "weight", i), {n_altup, n_altup * n_altup}, 0);
  3367. layer.altup_router = create_tensor(tn(LLM_TENSOR_ALTUP_ROUTER, "weight", i), {n_embd, n_altup}, 0);
  3368. layer.altup_router_norm = create_tensor(tn(LLM_TENSOR_ALTUP_ROUTER_NORM, "weight", i), {n_embd}, 0);
  3369. layer.laurel_l = create_tensor(tn(LLM_TENSOR_LAUREL_L, "weight", i), {n_embd, laurel_rank}, 0);
  3370. layer.laurel_r = create_tensor(tn(LLM_TENSOR_LAUREL_R, "weight", i), {laurel_rank, n_embd}, 0);
  3371. layer.laurel_post_norm = create_tensor(tn(LLM_TENSOR_LAUREL_POST_NORM, "weight", i), {n_embd}, 0);
  3372. }
  3373. } break;
  3374. case LLM_ARCH_STARCODER2:
  3375. {
  3376. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  3377. // output
  3378. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  3379. output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
  3380. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  3381. // if output is NULL, init from the input tok embed
  3382. if (output == NULL) {
  3383. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  3384. }
  3385. for (int i = 0; i < n_layer; ++i) {
  3386. auto & layer = layers[i];
  3387. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  3388. layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
  3389. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  3390. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  3391. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  3392. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  3393. // optional bias tensors
  3394. layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, 0);
  3395. layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, 0);
  3396. layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, 0);
  3397. layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
  3398. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  3399. layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
  3400. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  3401. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  3402. // optional bias tensors
  3403. layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
  3404. layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP , "bias", i), { n_ff}, 0);
  3405. }
  3406. } break;
  3407. case LLM_ARCH_MAMBA:
  3408. {
  3409. const int64_t d_conv = hparams.ssm_d_conv;
  3410. const int64_t d_inner = hparams.ssm_d_inner;
  3411. const int64_t d_state = hparams.ssm_d_state;
  3412. const int64_t dt_rank = hparams.ssm_dt_rank;
  3413. // only an expansion factor of 2 is supported for now
  3414. if (2 * n_embd != d_inner) {
  3415. throw std::runtime_error("only an expansion factor of 2 is supported for now");
  3416. }
  3417. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  3418. // output
  3419. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  3420. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  3421. // if output is NULL, init from the input tok embed, duplicated to allow offloading
  3422. if (output == NULL) {
  3423. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  3424. }
  3425. for (int i = 0; i < n_layer; ++i) {
  3426. auto & layer = layers[i];
  3427. // norm
  3428. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  3429. layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), {n_embd, 2*d_inner}, 0);
  3430. layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {d_conv, d_inner}, 0);
  3431. layer.ssm_conv1d_b = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "bias", i), {d_inner}, 0);
  3432. layer.ssm_x = create_tensor(tn(LLM_TENSOR_SSM_X, "weight", i), {d_inner, dt_rank + 2*d_state}, 0);
  3433. layer.ssm_dt = create_tensor(tn(LLM_TENSOR_SSM_DT, "weight", i), {dt_rank, d_inner}, 0);
  3434. layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {d_inner}, 0);
  3435. // no "weight" suffix for these
  3436. layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {d_state, d_inner}, 0);
  3437. layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {d_inner}, 0);
  3438. // out_proj
  3439. layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {d_inner, n_embd}, 0);
  3440. }
  3441. } break;
  3442. case LLM_ARCH_MAMBA2:
  3443. {
  3444. const int64_t d_conv = hparams.ssm_d_conv;
  3445. const int64_t d_inner = hparams.ssm_d_inner;
  3446. const int64_t d_state = hparams.ssm_d_state;
  3447. const int64_t n_head = hparams.ssm_dt_rank;
  3448. const int64_t n_group = hparams.ssm_n_group;
  3449. const int64_t d_in_proj = 2*d_inner + 2*n_group*d_state + n_head;
  3450. // only an expansion factor of 2 is supported for now
  3451. GGML_ASSERT(2 * n_embd == d_inner);
  3452. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  3453. // output
  3454. {
  3455. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  3456. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  3457. // if output is NULL, init from the input tok embed, duplicated to allow offloading
  3458. if (output == NULL) {
  3459. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  3460. }
  3461. }
  3462. for (int i = 0; i < n_layer; ++i) {
  3463. auto & layer = layers[i];
  3464. // norm
  3465. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  3466. layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), {n_embd, d_in_proj}, 0);
  3467. layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {d_conv, d_inner + 2*n_group*d_state}, 0);
  3468. layer.ssm_conv1d_b = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "bias", i), {d_inner + 2*n_group*d_state}, 0);
  3469. layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {n_head}, 0);
  3470. // no "weight" suffix for these
  3471. layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {1, n_head}, 0);
  3472. layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {1, n_head}, 0);
  3473. layer.ssm_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", i), {d_inner / n_group, n_group}, 0);
  3474. // out_proj
  3475. layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {d_inner, n_embd}, 0);
  3476. }
  3477. } break;
  3478. case LLM_ARCH_JAMBA:
  3479. {
  3480. const int64_t d_conv = hparams.ssm_d_conv;
  3481. const int64_t d_inner = hparams.ssm_d_inner;
  3482. const int64_t d_state = hparams.ssm_d_state;
  3483. const int64_t dt_rank = hparams.ssm_dt_rank;
  3484. // only an expansion factor of 2 is supported for now
  3485. GGML_ASSERT(2 * n_embd == d_inner);
  3486. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  3487. // output
  3488. {
  3489. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  3490. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  3491. // if output is NULL, init from the input tok embed, duplicated to allow offloading
  3492. if (output == NULL) {
  3493. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  3494. }
  3495. }
  3496. for (int i = 0; i < n_layer; ++i) {
  3497. const int64_t n_head_kv = hparams.n_head_kv(i);
  3498. const int64_t n_embd_gqa = hparams.n_embd_v_gqa(i);
  3499. auto & layer = layers[i];
  3500. // norm
  3501. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  3502. if (n_head_kv == 0) {
  3503. // Mamba layer
  3504. layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), {n_embd, 2*d_inner}, 0);
  3505. layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {d_conv, d_inner}, 0);
  3506. layer.ssm_conv1d_b = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "bias", i), {d_inner}, 0);
  3507. layer.ssm_x = create_tensor(tn(LLM_TENSOR_SSM_X, "weight", i), {d_inner, dt_rank + 2*d_state}, 0);
  3508. layer.ssm_dt_norm = create_tensor(tn(LLM_TENSOR_SSM_DT_NORM, "weight", i), {dt_rank}, 0);
  3509. layer.ssm_dt = create_tensor(tn(LLM_TENSOR_SSM_DT, "weight", i), {dt_rank, d_inner}, 0);
  3510. layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {d_inner}, 0);
  3511. layer.ssm_b_norm = create_tensor(tn(LLM_TENSOR_SSM_B_NORM, "weight", i), {d_state}, 0);
  3512. layer.ssm_c_norm = create_tensor(tn(LLM_TENSOR_SSM_C_NORM, "weight", i), {d_state}, 0);
  3513. // no "weight" suffix for these
  3514. layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {d_state, d_inner}, 0);
  3515. layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {d_inner}, 0);
  3516. // out_proj
  3517. layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {d_inner, n_embd}, 0);
  3518. } else {
  3519. // Attention layers
  3520. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  3521. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  3522. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  3523. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  3524. }
  3525. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  3526. layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, TENSOR_NOT_REQUIRED);
  3527. if (layer.ffn_gate_inp) {
  3528. // MoE
  3529. layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
  3530. layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff, n_embd, n_expert}, 0);
  3531. layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
  3532. } else {
  3533. // FFN (no MoE)
  3534. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  3535. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
  3536. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  3537. }
  3538. }
  3539. } break;
  3540. case LLM_ARCH_GRANITE_HYBRID:
  3541. {
  3542. // mamba2 Mixer SSM params
  3543. // NOTE: int64_t for tensor dimensions
  3544. const int64_t d_conv = hparams.ssm_d_conv;
  3545. const int64_t d_inner = hparams.ssm_d_inner;
  3546. const int64_t d_state = hparams.ssm_d_state;
  3547. const int64_t n_ssm_head = hparams.ssm_dt_rank;
  3548. const int64_t n_group = hparams.ssm_n_group;
  3549. const int64_t d_in_proj = 2*d_inner + 2*n_group*d_state + n_ssm_head;
  3550. // only an expansion factor of 2 is supported for now
  3551. GGML_ASSERT(2 * n_embd == d_inner);
  3552. // embeddings
  3553. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  3554. // output
  3555. {
  3556. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  3557. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  3558. // if output is NULL, init from the input tok embed, duplicated to allow offloading
  3559. if (output == NULL) {
  3560. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  3561. }
  3562. }
  3563. for (int i = 0; i < n_layer; ++i) {
  3564. auto & layer = layers[i];
  3565. // norm
  3566. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  3567. if (hparams.is_recurrent(i)) {
  3568. // ssm layers
  3569. layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), {n_embd, d_in_proj}, 0);
  3570. layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {d_conv, d_inner + 2*n_group*d_state}, 0);
  3571. layer.ssm_conv1d_b = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "bias", i), {d_inner + 2*n_group*d_state}, TENSOR_NOT_REQUIRED);
  3572. layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {n_ssm_head}, 0);
  3573. // no "weight" suffix for these
  3574. layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {1, n_ssm_head}, 0);
  3575. layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {1, n_ssm_head}, 0);
  3576. layer.ssm_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", i), {d_inner / n_group, n_group}, 0);
  3577. // out_proj
  3578. layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {d_inner, n_embd}, 0);
  3579. } else {
  3580. // attention layers (with optional bias)
  3581. const int64_t n_head_i = hparams.n_head(i);
  3582. const int64_t n_embd_k_gqa_i = hparams.n_embd_k_gqa(i);
  3583. const int64_t n_embd_v_gqa_i = hparams.n_embd_v_gqa(i);
  3584. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head_i}, 0);
  3585. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa_i}, 0);
  3586. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa_i}, 0);
  3587. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head_i, n_embd}, 0);
  3588. layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  3589. layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_k_gqa_i}, TENSOR_NOT_REQUIRED);
  3590. layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_v_gqa_i}, TENSOR_NOT_REQUIRED);
  3591. layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  3592. }
  3593. // feed forward (w/ optional biases)
  3594. if (n_expert > 0) {
  3595. // MoE FFN
  3596. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  3597. layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
  3598. layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
  3599. layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, TENSOR_NOT_REQUIRED);
  3600. layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert}, 0);
  3601. layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
  3602. // For Granite MoE Shared
  3603. if (hparams.n_ff_shexp > 0) {
  3604. layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, 0);
  3605. layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, 0);
  3606. layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {hparams.n_ff_shexp, n_embd}, 0);
  3607. }
  3608. } else {
  3609. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  3610. layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
  3611. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  3612. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  3613. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  3614. layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
  3615. layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  3616. layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
  3617. }
  3618. }
  3619. } break;
  3620. case LLM_ARCH_XVERSE:
  3621. {
  3622. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  3623. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  3624. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  3625. for (int i = 0; i < n_layer; ++i) {
  3626. auto & layer = layers[i];
  3627. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  3628. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  3629. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  3630. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  3631. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  3632. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  3633. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  3634. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  3635. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  3636. }
  3637. } break;
  3638. case LLM_ARCH_COMMAND_R:
  3639. {
  3640. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  3641. // output
  3642. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  3643. // init output from the input tok embed
  3644. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  3645. for (int i = 0; i < n_layer; ++i) {
  3646. auto & layer = layers[i];
  3647. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  3648. if (n_layer >= 64){
  3649. layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k, n_head}, 0);
  3650. layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k, n_head_kv}, 0);
  3651. }
  3652. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  3653. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  3654. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  3655. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  3656. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  3657. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  3658. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  3659. }
  3660. } break;
  3661. case LLM_ARCH_COHERE2:
  3662. {
  3663. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
  3664. // output
  3665. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
  3666. // init output from the input tok embed
  3667. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab },
  3668. TENSOR_DUPLICATED);
  3669. for (int i = 0; i < n_layer; ++i) {
  3670. auto & layer = layers[i];
  3671. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);
  3672. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd }, 0);
  3673. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_gqa }, 0);
  3674. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_gqa }, 0);
  3675. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd, n_embd }, 0);
  3676. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), { n_embd, n_ff }, 0);
  3677. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd }, 0);
  3678. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, n_ff }, 0);
  3679. }
  3680. }
  3681. break;
  3682. case LLM_ARCH_OLMO: // adapted from LLM_ARCH_LLAMA with norm params removed
  3683. {
  3684. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  3685. // output
  3686. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  3687. // if output is NULL, init from the input tok embed
  3688. if (output == NULL) {
  3689. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  3690. }
  3691. for (int i = 0; i < n_layer; ++i) {
  3692. auto & layer = layers[i];
  3693. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  3694. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  3695. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  3696. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  3697. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  3698. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  3699. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  3700. }
  3701. } break;
  3702. case LLM_ARCH_OLMO2:
  3703. {
  3704. const int64_t n_embd_head = n_embd / n_head;
  3705. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  3706. // output
  3707. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  3708. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  3709. for (int i = 0; i < n_layer; ++i) {
  3710. auto & layer = layers[i];
  3711. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  3712. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  3713. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  3714. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  3715. layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, 0);
  3716. layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_head_kv * n_embd_head}, 0);
  3717. layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
  3718. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  3719. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  3720. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  3721. layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
  3722. }
  3723. } break;
  3724. case LLM_ARCH_SEED_OSS:
  3725. {
  3726. const uint32_t head_dim = hparams.n_embd_head_k;
  3727. const int64_t n_qo_dim = n_head * head_dim;
  3728. const int64_t n_kv_dim = n_head_kv * head_dim;
  3729. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  3730. // output
  3731. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  3732. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  3733. // if output is NULL, init from the input tok embed
  3734. if (output == NULL) {
  3735. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  3736. }
  3737. for (int i = 0; i < n_layer; ++i) {
  3738. auto & layer = layers[i];
  3739. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_qo_dim}, 0);
  3740. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_kv_dim}, 0);
  3741. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_kv_dim}, 0);
  3742. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_qo_dim, n_embd}, 0);
  3743. layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_qo_dim}, TENSOR_NOT_REQUIRED);
  3744. layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_kv_dim}, TENSOR_NOT_REQUIRED);
  3745. layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_kv_dim}, TENSOR_NOT_REQUIRED);
  3746. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  3747. layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
  3748. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  3749. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  3750. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  3751. }
  3752. } break;
  3753. case LLM_ARCH_OLMOE:
  3754. {
  3755. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  3756. // output
  3757. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  3758. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  3759. for (int i = 0; i < n_layer; ++i) {
  3760. auto & layer = layers[i];
  3761. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  3762. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  3763. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  3764. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  3765. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  3766. layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, 0);
  3767. layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, 0);
  3768. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  3769. layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
  3770. if (n_expert == 0) {
  3771. throw std::runtime_error("n_expert must be > 0");
  3772. }
  3773. if (n_expert_used == 0) {
  3774. throw std::runtime_error("n_expert_used must be > 0");
  3775. }
  3776. // MoE branch
  3777. layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
  3778. layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff, n_embd, n_expert}, 0);
  3779. layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
  3780. }
  3781. } break;
  3782. case LLM_ARCH_OPENELM:
  3783. {
  3784. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  3785. // output
  3786. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  3787. // init output from the input tok embed
  3788. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  3789. for (int i = 0; i < n_layer; ++i) {
  3790. const int64_t n_head = hparams.n_head(i);
  3791. const int64_t n_head_qkv = 2*hparams.n_head_kv(i) + n_head;
  3792. const int64_t n_ff = hparams.n_ff(i);
  3793. auto & layer = layers[i];
  3794. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  3795. layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_head_qkv*n_embd_head_k}, 0);
  3796. layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
  3797. layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
  3798. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_head*n_embd_head_k, n_embd}, 0);
  3799. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  3800. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  3801. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
  3802. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  3803. }
  3804. } break;
  3805. case LLM_ARCH_GPTNEOX:
  3806. {
  3807. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  3808. // output
  3809. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  3810. output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
  3811. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  3812. for (int i = 0; i < n_layer; ++i) {
  3813. auto & layer = layers[i];
  3814. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  3815. layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
  3816. layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
  3817. layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
  3818. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  3819. layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
  3820. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  3821. layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
  3822. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
  3823. layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
  3824. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  3825. layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
  3826. }
  3827. } break;
  3828. case LLM_ARCH_ARCTIC:
  3829. {
  3830. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  3831. // output
  3832. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  3833. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  3834. // if output is NULL, init from the input tok embed
  3835. if (output == NULL) {
  3836. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  3837. }
  3838. for (int i = 0; i < n_layer; ++i) {
  3839. auto & layer = layers[i];
  3840. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  3841. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  3842. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  3843. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  3844. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  3845. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  3846. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_embd}, 0);
  3847. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_embd, n_embd}, 0);
  3848. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_embd}, 0);
  3849. layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
  3850. layer.ffn_norm_exps = create_tensor(tn(LLM_TENSOR_FFN_NORM_EXPS, "weight", i), {n_embd}, 0);
  3851. layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, false);
  3852. layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert}, 0);
  3853. layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
  3854. }
  3855. } break;
  3856. case LLM_ARCH_DEEPSEEK:
  3857. {
  3858. const int64_t n_ff_exp = hparams.n_ff_exp;
  3859. const int64_t n_expert_shared = hparams.n_expert_shared;
  3860. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  3861. // output
  3862. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  3863. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  3864. for (int i = 0; i < n_layer; ++i) {
  3865. auto & layer = layers[i];
  3866. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  3867. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  3868. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  3869. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  3870. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  3871. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  3872. if (i < (int) hparams.n_layer_dense_lead) {
  3873. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  3874. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  3875. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  3876. } else {
  3877. layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
  3878. if (n_expert == 0) {
  3879. throw std::runtime_error("n_expert must be > 0");
  3880. }
  3881. if (n_expert_used == 0) {
  3882. throw std::runtime_error("n_expert_used must be > 0");
  3883. }
  3884. // MoE branch
  3885. layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
  3886. layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
  3887. layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
  3888. // Shared expert branch
  3889. layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
  3890. layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_exp * n_expert_shared, n_embd}, 0);
  3891. layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
  3892. }
  3893. }
  3894. } break;
  3895. case LLM_ARCH_DEEPSEEK2:
  3896. {
  3897. // lite variants include DeepSeek-V2-Lite, GigaChat3-10B-A1.8B
  3898. const bool is_lite = (hparams.n_layer == 27 || hparams.n_layer == 26);
  3899. const bool is_mla = (hparams.n_embd_head_k_mla != 0 && hparams.n_embd_head_v_mla != 0);
  3900. // note: these are the actual head sizes you get when treating as MHA or after "decompression" using wv_b for MLA
  3901. const int64_t n_embd_head_k_mla = is_mla ? hparams.n_embd_head_k_mla : hparams.n_embd_head_k;
  3902. const int64_t n_embd_head_v_mla = is_mla ? hparams.n_embd_head_v_mla : hparams.n_embd_head_v;
  3903. const int64_t n_embd_head_qk_rope = hparams.n_rot;
  3904. const int64_t n_embd_head_qk_nope = n_embd_head_k_mla - n_embd_head_qk_rope;
  3905. const int64_t q_lora_rank = hparams.n_lora_q;
  3906. const int64_t kv_lora_rank = hparams.n_lora_kv;
  3907. const int64_t n_ff_exp = hparams.n_ff_exp;
  3908. const int64_t n_expert_shared = hparams.n_expert_shared;
  3909. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  3910. // output
  3911. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  3912. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  3913. for (int i = 0; i < n_layer; ++i) {
  3914. auto & layer = layers[i];
  3915. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  3916. if (!is_lite) {
  3917. layer.attn_q_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_A_NORM, "weight", i), {q_lora_rank}, 0);
  3918. }
  3919. layer.attn_kv_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_NORM, "weight", i), {kv_lora_rank}, 0);
  3920. if (!is_lite) {
  3921. layer.wq_a = create_tensor(tn(LLM_TENSOR_ATTN_Q_A, "weight", i), {n_embd, q_lora_rank}, 0);
  3922. layer.wq_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_B, "weight", i), {q_lora_rank, n_head * n_embd_head_k_mla}, 0);
  3923. } else {
  3924. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_head * n_embd_head_k_mla}, 0);
  3925. }
  3926. layer.wkv_a_mqa = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + n_embd_head_qk_rope}, 0);
  3927. // note: only old legacy GGUF files will have the unsplit wkv_b tensor in
  3928. if (is_mla) {
  3929. layer.wk_b = create_tensor(tn(LLM_TENSOR_ATTN_K_B, "weight", i), {n_embd_head_qk_nope, kv_lora_rank, n_head}, 0);
  3930. layer.wv_b = create_tensor(tn(LLM_TENSOR_ATTN_V_B, "weight", i), {kv_lora_rank, n_embd_head_v_mla, n_head}, 0);
  3931. } else {
  3932. layer.wkv_b = create_tensor(tn(LLM_TENSOR_ATTN_KV_B, "weight", i), {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v_mla)}, 0);
  3933. }
  3934. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_head * n_embd_head_v_mla, n_embd}, 0);
  3935. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  3936. if (i < (int) hparams.n_layer_dense_lead) {
  3937. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  3938. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  3939. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  3940. } else {
  3941. layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
  3942. layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, TENSOR_NOT_REQUIRED);
  3943. if (n_expert == 0) {
  3944. throw std::runtime_error("n_expert must be > 0");
  3945. }
  3946. if (n_expert_used == 0) {
  3947. throw std::runtime_error("n_expert_used must be > 0");
  3948. }
  3949. // MoE branch
  3950. layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
  3951. layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
  3952. layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
  3953. // Shared expert branch
  3954. layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
  3955. layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_exp * n_expert_shared, n_embd}, 0);
  3956. layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
  3957. }
  3958. }
  3959. } break;
  3960. case LLM_ARCH_PLM:
  3961. {
  3962. const int64_t n_embd_head_qk_rope = hparams.n_rot;
  3963. const int64_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
  3964. const int64_t kv_lora_rank = hparams.n_lora_kv;
  3965. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  3966. // output
  3967. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  3968. // output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  3969. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  3970. for (int i = 0; i < n_layer; ++i) {
  3971. auto & layer = layers[i];
  3972. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  3973. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
  3974. layer.wkv_a_mqa = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + (n_embd_head_qk_rope)}, 0);
  3975. layer.attn_kv_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_NORM, "weight", i), {kv_lora_rank}, 0);
  3976. layer.wkv_b = create_tensor(tn(LLM_TENSOR_ATTN_KV_B, "weight", i), {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)}, 0);
  3977. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_head * ( n_embd_head_v), n_embd}, 0);
  3978. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  3979. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  3980. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  3981. }
  3982. } break;
  3983. case LLM_ARCH_BITNET:
  3984. {
  3985. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  3986. // output
  3987. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  3988. for (int i = 0; i < n_layer; ++i) {
  3989. auto & layer = layers[i];
  3990. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  3991. layer.attn_sub_norm = create_tensor(tn(LLM_TENSOR_ATTN_SUB_NORM, "weight", i), {n_embd}, 0);
  3992. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  3993. layer.wq_scale = create_tensor(tn(LLM_TENSOR_ATTN_Q, "scale", i), {1}, TENSOR_NOT_REQUIRED);
  3994. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  3995. layer.wk_scale = create_tensor(tn(LLM_TENSOR_ATTN_K, "scale", i), {1}, TENSOR_NOT_REQUIRED);
  3996. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  3997. layer.wv_scale = create_tensor(tn(LLM_TENSOR_ATTN_V, "scale", i), {1}, TENSOR_NOT_REQUIRED);
  3998. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  3999. layer.wo_scale = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "scale", i), {1}, TENSOR_NOT_REQUIRED);
  4000. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  4001. layer.ffn_sub_norm = create_tensor(tn(LLM_TENSOR_FFN_SUB_NORM, "weight", i), {n_ff}, 0);
  4002. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  4003. layer.ffn_gate_scale = create_tensor(tn(LLM_TENSOR_FFN_GATE, "scale", i), {1}, TENSOR_NOT_REQUIRED);
  4004. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
  4005. layer.ffn_down_scale = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "scale", i), {1}, TENSOR_NOT_REQUIRED);
  4006. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  4007. layer.ffn_up_scale = create_tensor(tn(LLM_TENSOR_FFN_UP, "scale", i), {1}, TENSOR_NOT_REQUIRED);
  4008. }
  4009. } break;
  4010. case LLM_ARCH_T5:
  4011. {
  4012. const auto n_rel_attn_bkts = hparams.n_rel_attn_bkts;
  4013. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  4014. // output
  4015. output_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_OUTPUT_NORM, "weight"), {n_embd}, 0);
  4016. output_norm = create_tensor(tn(LLM_TENSOR_DEC_OUTPUT_NORM, "weight"), {n_embd}, 0);
  4017. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  4018. // if output is NULL, init from the input tok embed
  4019. if (output == NULL) {
  4020. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  4021. }
  4022. // n_layer: number of encoder_layers
  4023. // dec_n_layer: number of decoder_layers
  4024. const int dec_n_layer = hparams.dec_n_layer;
  4025. if (dec_n_layer > n_layer) {
  4026. layers.resize(dec_n_layer);
  4027. }
  4028. // load encoder layers
  4029. for (int i = 0; i < n_layer; ++i) {
  4030. auto & layer = layers[i];
  4031. layer.attn_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_NORM, "weight", i), {n_embd}, 0);
  4032. layer.attn_rel_b_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_REL_B, "weight", i), {n_head, n_rel_attn_bkts}, TENSOR_NOT_REQUIRED);
  4033. layer.wq_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_Q, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  4034. layer.wk_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  4035. layer.wv_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
  4036. layer.wo_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_OUT, "weight", i), {n_embd_v_gqa, n_embd}, 0);
  4037. layer.ffn_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_NORM, "weight", i), {n_embd}, 0);
  4038. layer.ffn_gate_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_GATE, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED);
  4039. layer.ffn_down_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  4040. layer.ffn_up_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  4041. }
  4042. // load decoder layers
  4043. for (int i = 0; i < dec_n_layer; ++i) {
  4044. auto & layer = layers[i];
  4045. layer.attn_norm = create_tensor(tn(LLM_TENSOR_DEC_ATTN_NORM, "weight", i), {n_embd}, 0);
  4046. layer.attn_rel_b = create_tensor(tn(LLM_TENSOR_DEC_ATTN_REL_B, "weight", i), {n_head, n_rel_attn_bkts}, TENSOR_NOT_REQUIRED);
  4047. layer.wq = create_tensor(tn(LLM_TENSOR_DEC_ATTN_Q, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  4048. layer.wk = create_tensor(tn(LLM_TENSOR_DEC_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  4049. layer.wv = create_tensor(tn(LLM_TENSOR_DEC_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
  4050. layer.wo = create_tensor(tn(LLM_TENSOR_DEC_ATTN_OUT, "weight", i), {n_embd_v_gqa, n_embd}, 0);
  4051. layer.attn_norm_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_NORM, "weight", i), {n_embd}, 0);
  4052. // this tensor seems to be unused in HF transformers implementation
  4053. layer.attn_rel_b_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_REL_B, "weight", i), {n_head, n_rel_attn_bkts}, TENSOR_NOT_REQUIRED);
  4054. layer.wq_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_Q, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  4055. layer.wk_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  4056. layer.wv_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
  4057. layer.wo_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_OUT, "weight", i), {n_embd_v_gqa, n_embd}, 0);
  4058. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_DEC_FFN_NORM, "weight", i), {n_embd}, 0);
  4059. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_DEC_FFN_GATE, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED);
  4060. layer.ffn_down = create_tensor(tn(LLM_TENSOR_DEC_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  4061. layer.ffn_up = create_tensor(tn(LLM_TENSOR_DEC_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  4062. }
  4063. } break;
  4064. case LLM_ARCH_T5ENCODER:
  4065. {
  4066. const auto n_rel_attn_bkts = hparams.n_rel_attn_bkts;
  4067. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  4068. // output
  4069. output_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_OUTPUT_NORM, "weight"), {n_embd}, 0);
  4070. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  4071. // if output is NULL, init from the input tok embed
  4072. if (output == NULL) {
  4073. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  4074. }
  4075. for (int i = 0; i < n_layer; ++i) {
  4076. auto & layer = layers[i];
  4077. layer.attn_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_NORM, "weight", i), {n_embd}, 0);
  4078. layer.attn_rel_b_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_REL_B, "weight", i), {n_head, n_rel_attn_bkts}, TENSOR_NOT_REQUIRED);
  4079. layer.wq_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_Q, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  4080. layer.wk_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  4081. layer.wv_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
  4082. layer.wo_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_OUT, "weight", i), {n_embd_v_gqa, n_embd}, 0);
  4083. layer.ffn_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_NORM, "weight", i), {n_embd}, 0);
  4084. layer.ffn_gate_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_GATE, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED);
  4085. layer.ffn_down_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  4086. layer.ffn_up_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  4087. }
  4088. } break;
  4089. case LLM_ARCH_JAIS:
  4090. {
  4091. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  4092. // output
  4093. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  4094. output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
  4095. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  4096. for (int i = 0; i < n_layer; ++i) {
  4097. auto & layer = layers[i];
  4098. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  4099. layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
  4100. layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
  4101. layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
  4102. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  4103. layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
  4104. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  4105. layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
  4106. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
  4107. layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
  4108. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  4109. layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, 0);
  4110. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  4111. layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
  4112. }
  4113. } break;
  4114. case LLM_ARCH_CHATGLM:
  4115. {
  4116. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  4117. // output
  4118. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  4119. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  4120. // if output is NULL, init from the input tok embed
  4121. if (output == NULL) {
  4122. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  4123. }
  4124. for (int i = 0; i < n_layer; ++i) {
  4125. auto & layer = layers[i];
  4126. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  4127. layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
  4128. layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
  4129. if (layer.wqkv == nullptr) {
  4130. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
  4131. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  4132. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
  4133. layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  4134. layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
  4135. layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
  4136. }
  4137. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  4138. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  4139. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff * 2}, 0);
  4140. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
  4141. }
  4142. } break;
  4143. case LLM_ARCH_GLM4:
  4144. {
  4145. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  4146. // output
  4147. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  4148. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  4149. // if output is NULL, init from the input tok embed
  4150. if (output == NULL) {
  4151. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  4152. }
  4153. for (int i = 0; i < n_layer; ++i) {
  4154. auto & layer = layers[i];
  4155. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  4156. layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
  4157. layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
  4158. if (layer.wqkv == nullptr) {
  4159. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
  4160. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  4161. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
  4162. layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  4163. layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
  4164. layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
  4165. }
  4166. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  4167. layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
  4168. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  4169. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  4170. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff * 2}, 0);
  4171. layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
  4172. }
  4173. } break;
  4174. case LLM_ARCH_GLM4_MOE:
  4175. {
  4176. const int64_t n_expert = hparams.n_expert;
  4177. const int64_t n_expert_used = hparams.n_expert_used;
  4178. const int64_t n_expert_shared = hparams.n_expert_shared;
  4179. GGML_ASSERT(hparams.n_expert > 0 && "n_expert must be > 0 for GLM4_MOE MoE layers");
  4180. GGML_ASSERT(hparams.n_expert_used > 0 && "n_expert_used must be > 0 for GLM4_MOE MoE layers");
  4181. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
  4182. // output
  4183. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
  4184. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED);
  4185. // if output is NULL, init from the input tok embed
  4186. if (output == NULL) {
  4187. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, TENSOR_DUPLICATED);
  4188. }
  4189. // Load ALL tensors including NextN layer to satisfy total tensor count
  4190. // but only PROCESS up to last layer (skipping final NextN layer) in forward pass
  4191. for (int i = 0; i < n_layer; ++i) {
  4192. int flags = 0;
  4193. if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
  4194. // skip all tensors in the NextN layers
  4195. flags |= TENSOR_SKIP;
  4196. }
  4197. auto & layer = layers[i];
  4198. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, flags);
  4199. // GLM-style attention with bias terms
  4200. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd_head_k * n_head }, flags);
  4201. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_k_gqa }, flags);
  4202. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_v_gqa }, flags);
  4203. layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), { n_embd_head_k * n_head }, flags);
  4204. layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), { n_embd_k_gqa }, flags);
  4205. layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), { n_embd_v_gqa }, flags);
  4206. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, flags);
  4207. // K/Q norm tensors (optional for GLM-4.5 355B variant)
  4208. layer.attn_q_norm = create_tensor(
  4209. tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), { n_embd_head_k }, TENSOR_NOT_REQUIRED | flags);
  4210. layer.attn_k_norm = create_tensor(
  4211. tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), { n_embd_head_k }, TENSOR_NOT_REQUIRED | flags);
  4212. layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), { n_embd }, flags);
  4213. // Check if this layer uses MoE or dense FFN based on n_layer_dense_lead
  4214. // GLM 4.5 uses hybrid architecture: layer 0 is dense, layers 1+ are MoE
  4215. const bool use_moe = (static_cast<uint32_t>(i) >= hparams.n_layer_dense_lead);
  4216. if (use_moe) {
  4217. // MoE layers
  4218. layer.ffn_gate_inp =
  4219. create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), { n_embd, n_expert }, flags);
  4220. layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), { n_expert }, flags);
  4221. // MoE branch
  4222. const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
  4223. layer.ffn_gate_exps = create_tensor(
  4224. tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, flags);
  4225. layer.ffn_down_exps = create_tensor(
  4226. tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff_exp, n_embd, n_expert }, flags);
  4227. layer.ffn_up_exps = create_tensor(
  4228. tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, flags);
  4229. // Shared expert
  4230. if (n_expert_shared > 0) {
  4231. const int64_t n_ff_shexp = n_ff_exp * n_expert_shared;
  4232. layer.ffn_gate_shexp = create_tensor(
  4233. tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), { n_embd, n_ff_shexp }, flags);
  4234. layer.ffn_down_shexp = create_tensor(
  4235. tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_shexp, n_embd }, flags);
  4236. layer.ffn_up_shexp = create_tensor(
  4237. tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), { n_embd, n_ff_shexp }, flags);
  4238. }
  4239. } else {
  4240. // Dense layers (first k layers) - GLM uses separate gate/up projections
  4241. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), { n_embd, n_ff }, flags);
  4242. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd }, flags);
  4243. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, n_ff }, flags);
  4244. }
  4245. // NextN/MTP tensors (preserved but unused) - conditionally load for last nextn_predict_layers
  4246. if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
  4247. layer.nextn.eh_proj = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "weight", i), { 2 * n_embd, n_embd }, flags);
  4248. layer.nextn.enorm = create_tensor(tn(LLM_TENSOR_NEXTN_ENORM, "weight", i), { n_embd }, flags);
  4249. layer.nextn.hnorm = create_tensor(tn(LLM_TENSOR_NEXTN_HNORM, "weight", i), { n_embd }, flags);
  4250. // Optional tensors
  4251. layer.nextn.embed_tokens = create_tensor(tn(LLM_TENSOR_NEXTN_EMBED_TOKENS, "weight", i), { n_embd, n_vocab }, flags | TENSOR_NOT_REQUIRED);
  4252. layer.nextn.shared_head_head = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "weight", i), { n_embd, n_vocab }, flags | TENSOR_NOT_REQUIRED);
  4253. layer.nextn.shared_head_norm = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "weight", i), { n_embd }, flags | TENSOR_NOT_REQUIRED);
  4254. }
  4255. }
  4256. }
  4257. break;
  4258. case LLM_ARCH_NEMOTRON:
  4259. {
  4260. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  4261. // output
  4262. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  4263. output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
  4264. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  4265. for (int i = 0; i < n_layer; ++i) {
  4266. auto & layer = layers[i];
  4267. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  4268. layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
  4269. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  4270. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  4271. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  4272. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  4273. // optional bias tensors
  4274. layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  4275. layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
  4276. layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
  4277. layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  4278. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  4279. layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
  4280. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  4281. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  4282. // optional MLP bias
  4283. layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  4284. layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
  4285. }
  4286. } break;
  4287. case LLM_ARCH_NEMOTRON_H:
  4288. {
  4289. // mamba2 Mixer SSM params
  4290. // NOTE: int64_t for tensor dimensions
  4291. const int64_t d_conv = hparams.ssm_d_conv;
  4292. const int64_t d_inner = hparams.ssm_d_inner;
  4293. const int64_t d_state = hparams.ssm_d_state;
  4294. const int64_t n_ssm_head = hparams.ssm_dt_rank;
  4295. const int64_t n_group = hparams.ssm_n_group;
  4296. const int64_t d_in_proj = 2*d_inner + 2*n_group*d_state + n_ssm_head;
  4297. // embeddings
  4298. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  4299. // output
  4300. {
  4301. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  4302. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  4303. // if output is NULL, init from the input tok embed, duplicated to allow offloading
  4304. if (output == NULL) {
  4305. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  4306. }
  4307. }
  4308. for (int i = 0; i < n_layer; ++i) {
  4309. auto & layer = layers[i];
  4310. // all blocks use the attn norm
  4311. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  4312. if (hparams.is_recurrent(i)) {
  4313. // ssm layers
  4314. layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), {n_embd, d_in_proj}, 0);
  4315. layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {d_conv, d_inner + 2*n_group*d_state}, 0);
  4316. layer.ssm_conv1d_b = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "bias", i), {d_inner + 2*n_group*d_state}, TENSOR_NOT_REQUIRED);
  4317. layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {n_ssm_head}, 0);
  4318. // no "weight" suffix for these
  4319. layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {1, n_ssm_head}, 0);
  4320. layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {1, n_ssm_head}, 0);
  4321. layer.ssm_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", i), {d_inner / n_group, n_group}, 0);
  4322. // out_proj
  4323. layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {d_inner, n_embd}, 0);
  4324. } else if (hparams.n_ff(i) == 0) {
  4325. // attention layers (with optional bias)
  4326. const int64_t n_head_i = hparams.n_head(i);
  4327. const int64_t n_embd_k_gqa_i = hparams.n_embd_k_gqa(i);
  4328. const int64_t n_embd_v_gqa_i = hparams.n_embd_v_gqa(i);
  4329. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head_i}, 0);
  4330. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa_i}, 0);
  4331. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa_i}, 0);
  4332. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head_i, n_embd}, 0);
  4333. layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  4334. layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_k_gqa_i}, TENSOR_NOT_REQUIRED);
  4335. layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_v_gqa_i}, TENSOR_NOT_REQUIRED);
  4336. layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  4337. } else {
  4338. // mlp layers
  4339. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { hparams.n_ff(i), n_embd}, 0);
  4340. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, hparams.n_ff(i)}, 0);
  4341. layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  4342. layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {hparams.n_ff(i)}, TENSOR_NOT_REQUIRED);
  4343. }
  4344. }
  4345. } break;
  4346. case LLM_ARCH_EXAONE:
  4347. {
  4348. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  4349. // output
  4350. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  4351. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  4352. // if output is NULL, init from the input tok embed
  4353. if (output == NULL) {
  4354. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  4355. }
  4356. for (int i = 0; i < n_layer; ++i) {
  4357. auto & layer = layers[i];
  4358. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  4359. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
  4360. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  4361. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
  4362. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
  4363. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  4364. layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
  4365. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  4366. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  4367. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  4368. }
  4369. } break;
  4370. case LLM_ARCH_EXAONE4:
  4371. {
  4372. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  4373. // output
  4374. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  4375. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  4376. // if output is NULL, init from the input tok embed
  4377. if (output == NULL) {
  4378. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  4379. }
  4380. for (int i = 0; i < n_layer; ++i) {
  4381. auto & layer = layers[i];
  4382. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
  4383. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  4384. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
  4385. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  4386. layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
  4387. layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
  4388. layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
  4389. layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
  4390. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  4391. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  4392. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  4393. layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
  4394. }
  4395. } break;
  4396. case LLM_ARCH_RWKV6:
  4397. {
  4398. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  4399. // Block 0, LN0
  4400. tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0);
  4401. tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}, 0);
  4402. // output
  4403. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  4404. output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
  4405. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  4406. const int time_mix_extra_dim = hparams.time_mix_extra_dim;
  4407. const int time_decay_extra_dim = hparams.time_decay_extra_dim;
  4408. const int head_size = hparams.wkv_head_size;
  4409. const int attn_hidden_size = n_embd;
  4410. const int ffn_size = hparams.n_ff_arr[0];
  4411. for (int i = 0; i < n_layer; ++i) {
  4412. auto & layer = layers[i];
  4413. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  4414. layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
  4415. layer.attn_norm_2 = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, 0);
  4416. layer.attn_norm_2_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, 0);
  4417. layer.time_mix_w1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W1, "weight", i), {n_embd, time_mix_extra_dim * 5}, 0);
  4418. layer.time_mix_w2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W2, "weight", i), {time_mix_extra_dim, n_embd, 5}, 0);
  4419. layer.time_mix_lerp_x = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_X, "weight", i), {n_embd, 1, 1}, 0);
  4420. layer.time_mix_lerp_w = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_W, "weight", i), {n_embd, 1, 1}, TENSOR_NOT_REQUIRED);
  4421. layer.time_mix_lerp_k = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_K, "weight", i), {n_embd, 1, 1}, TENSOR_NOT_REQUIRED);
  4422. layer.time_mix_lerp_v = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_V, "weight", i), {n_embd, 1, 1}, TENSOR_NOT_REQUIRED);
  4423. layer.time_mix_lerp_r = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_R, "weight", i), {n_embd, 1, 1}, TENSOR_NOT_REQUIRED);
  4424. layer.time_mix_lerp_g = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_G, "weight", i), {n_embd, 1, 1}, TENSOR_NOT_REQUIRED);
  4425. layer.time_mix_lerp_fused = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_FUSED, "weight", i), {n_embd, 1, 1, 5}, TENSOR_NOT_REQUIRED);
  4426. GGML_ASSERT(!(layer.time_mix_lerp_fused == NULL && layer.time_mix_lerp_w == NULL));
  4427. layer.time_mix_first = create_tensor(tn(LLM_TENSOR_TIME_MIX_FIRST, "weight", i), {head_size, n_embd / head_size}, 0);
  4428. layer.time_mix_decay = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY, "weight", i), {n_embd}, 0);
  4429. layer.time_mix_decay_w1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY_W1, "weight", i), {n_embd, time_decay_extra_dim}, 0);
  4430. layer.time_mix_decay_w2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY_W2, "weight", i), {time_decay_extra_dim, attn_hidden_size}, 0);
  4431. layer.time_mix_key = create_tensor(tn(LLM_TENSOR_TIME_MIX_KEY, "weight", i), {attn_hidden_size, n_embd}, 0);
  4432. layer.time_mix_value = create_tensor(tn(LLM_TENSOR_TIME_MIX_VALUE, "weight", i), {attn_hidden_size, n_embd}, 0);
  4433. layer.time_mix_receptance = create_tensor(tn(LLM_TENSOR_TIME_MIX_RECEPTANCE, "weight", i), {attn_hidden_size, n_embd}, 0);
  4434. layer.time_mix_gate = create_tensor(tn(LLM_TENSOR_TIME_MIX_GATE, "weight", i), {attn_hidden_size, n_embd}, 0);
  4435. layer.time_mix_ln = create_tensor(tn(LLM_TENSOR_TIME_MIX_LN, "weight", i), {n_embd}, 0);
  4436. layer.time_mix_ln_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_LN, "bias", i), {n_embd}, 0);
  4437. layer.time_mix_output = create_tensor(tn(LLM_TENSOR_TIME_MIX_OUTPUT, "weight", i), {n_embd, attn_hidden_size}, 0);
  4438. layer.channel_mix_lerp_k = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_LERP_K, "weight", i), {n_embd, 1, 1}, 0);
  4439. layer.channel_mix_lerp_r = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_LERP_R, "weight", i), {n_embd, 1, 1}, 0);
  4440. layer.channel_mix_key = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_KEY, "weight", i), {n_embd, ffn_size}, 0);
  4441. layer.channel_mix_value = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_VALUE, "weight", i), {ffn_size, n_embd}, 0);
  4442. layer.channel_mix_receptance = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_RECEPTANCE, "weight", i), {n_embd, n_embd}, 0);
  4443. }
  4444. } break;
  4445. case LLM_ARCH_RWKV6QWEN2:
  4446. {
  4447. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  4448. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  4449. output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, TENSOR_NOT_REQUIRED);
  4450. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  4451. const int time_mix_extra_dim = hparams.time_mix_extra_dim;
  4452. const int time_decay_extra_dim = hparams.time_decay_extra_dim;
  4453. const int head_size = hparams.wkv_head_size;
  4454. const int attn_hidden_size = n_embd;
  4455. const int n_head_kv = hparams.n_head_kv();
            int attn_key_value_size;
            if (n_head_kv == 0 || attn_hidden_size / head_size == n_head_kv) {
                attn_key_value_size = attn_hidden_size;
            } else {
                attn_key_value_size = n_head_kv * head_size;
            }

            for (int i = 0; i < n_layer; ++i) {
                auto & layer = layers[i];

                layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);

                layer.time_mix_w1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W1, "weight", i), {n_embd, time_mix_extra_dim * 5}, 0);
                layer.time_mix_w2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W2, "weight", i), {time_mix_extra_dim, n_embd, 5}, 0);
                layer.time_mix_lerp_x = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_X, "weight", i), {n_embd, 1, 1}, 0);
                layer.time_mix_lerp_fused = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_FUSED, "weight", i), {n_embd, 1, 1, 5}, 0);
                layer.time_mix_first = create_tensor(tn(LLM_TENSOR_TIME_MIX_FIRST, "weight", i), {head_size, n_embd / head_size}, TENSOR_NOT_REQUIRED);
                layer.time_mix_decay = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY, "weight", i), {n_embd}, 0);
                layer.time_mix_decay_w1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY_W1, "weight", i), {n_embd, time_decay_extra_dim}, 0);
                layer.time_mix_decay_w2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY_W2, "weight", i), {time_decay_extra_dim, attn_hidden_size}, 0);
                layer.time_mix_key = create_tensor(tn(LLM_TENSOR_TIME_MIX_KEY, "weight", i), {n_embd, attn_key_value_size}, 0);
                layer.time_mix_value = create_tensor(tn(LLM_TENSOR_TIME_MIX_VALUE, "weight", i), {n_embd, attn_key_value_size}, 0);
                layer.time_mix_receptance = create_tensor(tn(LLM_TENSOR_TIME_MIX_RECEPTANCE, "weight", i), {attn_hidden_size, n_embd}, 0);
                layer.time_mix_gate = create_tensor(tn(LLM_TENSOR_TIME_MIX_GATE, "weight", i), {attn_hidden_size, n_embd}, 0);

                // optional bias tensors
                layer.time_mix_key_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_KEY, "bias", i), {attn_key_value_size}, TENSOR_NOT_REQUIRED);
                layer.time_mix_value_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_VALUE, "bias", i), {attn_key_value_size}, TENSOR_NOT_REQUIRED);
                layer.time_mix_receptance_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_RECEPTANCE, "bias", i), {attn_hidden_size}, TENSOR_NOT_REQUIRED);

                layer.time_mix_output = create_tensor(tn(LLM_TENSOR_TIME_MIX_OUTPUT, "weight", i), {n_embd, attn_hidden_size}, 0);

                layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
                layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
                layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
                layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
            }
        } break;
    case LLM_ARCH_RWKV7:
        {
            tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

            // Block 0, LN0
            tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0);
            tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}, 0);

            // output
            output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
            output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
            output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
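            // low-rank ("lora") projection widths for the RWKV7 decay (w), in-context
            // learning rate (a), value-residual mix (v) and gate (g) branches; each is
            // realized below as a down-projection (w1/a1/v1/g1) followed by an up-projection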
            const int n_lora_decay = hparams.n_lora_decay;
            const int n_lora_iclr = hparams.n_lora_iclr;
            const int n_lora_value_res_mix = hparams.n_lora_value_res_mix;
            const int n_lora_gate = hparams.n_lora_gate;
            const int attn_hidden_size = n_embd;
            const int ffn_size = hparams.n_ff_arr[0];

            for (int i = 0; i < n_layer; ++i) {
                auto & layer = layers[i];

                layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
                layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
                layer.attn_norm_2 = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, 0);
                layer.attn_norm_2_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, 0);

                layer.time_mix_w0 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W0, "weight", i), {n_embd}, 0);
                layer.time_mix_w1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W1, "weight", i), {n_embd, n_lora_decay}, 0);
                layer.time_mix_w2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W2, "weight", i), {n_lora_decay, n_embd}, 0);

                layer.time_mix_a0 = create_tensor(tn(LLM_TENSOR_TIME_MIX_A0, "weight", i), {n_embd}, 0);
                layer.time_mix_a1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_A1, "weight", i), {n_embd, n_lora_iclr}, 0);
                layer.time_mix_a2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_A2, "weight", i), {n_lora_iclr, n_embd}, 0);

                if (i == 0) {
                    // actually not used
                    layer.time_mix_v0 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V0, "weight", i), {n_embd}, 0);
                    layer.time_mix_v1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V1, "weight", i), {n_embd, n_lora_iclr}, 0);
                    layer.time_mix_v2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V2, "weight", i), {n_lora_iclr, n_embd}, 0);
                } else {
                    layer.time_mix_v0 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V0, "weight", i), {n_embd}, 0);
                    layer.time_mix_v1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V1, "weight", i), {n_embd, n_lora_value_res_mix}, 0);
                    layer.time_mix_v2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V2, "weight", i), {n_lora_value_res_mix, n_embd}, 0);
                }

                layer.time_mix_g1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_G1, "weight", i), {n_embd, n_lora_gate}, 0);
                layer.time_mix_g2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_G2, "weight", i), {n_lora_gate, n_embd}, 0);

                layer.time_mix_lerp_fused = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_FUSED, "weight", i), {n_embd, 1, 1, 6}, 0);

                layer.time_mix_k_k = create_tensor(tn(LLM_TENSOR_TIME_MIX_K_K, "weight", i), {attn_hidden_size}, 0);
                layer.time_mix_k_a = create_tensor(tn(LLM_TENSOR_TIME_MIX_K_A, "weight", i), {attn_hidden_size}, 0);
                layer.time_mix_r_k = create_tensor(tn(LLM_TENSOR_TIME_MIX_R_K, "weight", i), {attn_hidden_size}, 0);

                layer.time_mix_key = create_tensor(tn(LLM_TENSOR_TIME_MIX_KEY, "weight", i), {attn_hidden_size, n_embd}, 0);
                layer.time_mix_value = create_tensor(tn(LLM_TENSOR_TIME_MIX_VALUE, "weight", i), {attn_hidden_size, n_embd}, 0);
                layer.time_mix_receptance = create_tensor(tn(LLM_TENSOR_TIME_MIX_RECEPTANCE, "weight", i), {attn_hidden_size, n_embd}, 0);

                layer.time_mix_ln = create_tensor(tn(LLM_TENSOR_TIME_MIX_LN, "weight", i), {n_embd}, 0);
                layer.time_mix_ln_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_LN, "bias", i), {n_embd}, 0);
                layer.time_mix_output = create_tensor(tn(LLM_TENSOR_TIME_MIX_OUTPUT, "weight", i), {n_embd, attn_hidden_size}, 0);

                layer.channel_mix_lerp_k = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_LERP_K, "weight", i), {n_embd, 1, 1}, 0);
                layer.channel_mix_key = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_KEY, "weight", i), {n_embd, ffn_size}, 0);
                layer.channel_mix_value = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_VALUE, "weight", i), {ffn_size, n_embd}, 0);
            }
        } break;
    case LLM_ARCH_ARWKV7:
        {
            tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

            // output
            output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
            output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);

            const int n_lora_decay = hparams.n_lora_decay;
            const int n_lora_iclr = hparams.n_lora_iclr;
            const int n_lora_value_res_mix = hparams.n_lora_value_res_mix;
            const int n_lora_gate = hparams.n_lora_gate;
            const int attn_hidden_size = n_embd;

            for (int i = 0; i < n_layer; ++i) {
                auto & layer = layers[i];

                layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);

                layer.time_mix_w0 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W0, "weight", i), {n_embd}, 0);
                layer.time_mix_w1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W1, "weight", i), {n_embd, n_lora_decay}, 0);
                layer.time_mix_w2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W2, "weight", i), {n_lora_decay, n_embd}, 0);

                layer.time_mix_a0 = create_tensor(tn(LLM_TENSOR_TIME_MIX_A0, "weight", i), {n_embd}, 0);
                layer.time_mix_a1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_A1, "weight", i), {n_embd, n_lora_iclr}, 0);
                layer.time_mix_a2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_A2, "weight", i), {n_lora_iclr, n_embd}, 0);

                if (i == 0) {
                    // actually not used
                    layer.time_mix_v0 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V0, "weight", i), {n_embd}, 0);
                    layer.time_mix_v1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V1, "weight", i), {n_embd, n_lora_iclr}, 0);
                    layer.time_mix_v2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V2, "weight", i), {n_lora_iclr, n_embd}, 0);
                } else {
                    layer.time_mix_v0 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V0, "weight", i), {n_embd}, 0);
                    layer.time_mix_v1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V1, "weight", i), {n_embd, n_lora_value_res_mix}, 0);
                    layer.time_mix_v2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V2, "weight", i), {n_lora_value_res_mix, n_embd}, 0);
                }

                layer.time_mix_g1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_G1, "weight", i), {n_embd, n_lora_gate}, TENSOR_NOT_REQUIRED);
                layer.time_mix_g2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_G2, "weight", i), {n_lora_gate, n_embd}, TENSOR_NOT_REQUIRED);
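                // probe for the 6-component fused token-shift lerp first; conversions
                // without a gate branch only store 5 components, so fall back to that shape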
                try {
                    layer.time_mix_lerp_fused = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_FUSED, "weight", i), {n_embd, 1, 1, 6}, 0);
                } catch (std::runtime_error & e) {
                    // ARWKV models may not have gate tensors
                    layer.time_mix_lerp_fused = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_FUSED, "weight", i), {n_embd, 1, 1, 5}, 0);
                }

                layer.time_mix_k_k = create_tensor(tn(LLM_TENSOR_TIME_MIX_K_K, "weight", i), {attn_hidden_size}, 0);
                layer.time_mix_k_a = create_tensor(tn(LLM_TENSOR_TIME_MIX_K_A, "weight", i), {attn_hidden_size}, 0);
                layer.time_mix_r_k = create_tensor(tn(LLM_TENSOR_TIME_MIX_R_K, "weight", i), {attn_hidden_size}, 0);

                layer.time_mix_key = create_tensor(tn(LLM_TENSOR_TIME_MIX_KEY, "weight", i), {attn_hidden_size, n_embd}, 0);
                layer.time_mix_value = create_tensor(tn(LLM_TENSOR_TIME_MIX_VALUE, "weight", i), {attn_hidden_size, n_embd}, 0);
                layer.time_mix_receptance = create_tensor(tn(LLM_TENSOR_TIME_MIX_RECEPTANCE, "weight", i), {attn_hidden_size, n_embd}, 0);

                layer.time_mix_ln = create_tensor(tn(LLM_TENSOR_TIME_MIX_LN, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
                layer.time_mix_ln_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_LN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
                layer.time_mix_output = create_tensor(tn(LLM_TENSOR_TIME_MIX_OUTPUT, "weight", i), {n_embd, attn_hidden_size}, 0);

                layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
                layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
                layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
                layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
            }
        } break;
    case LLM_ARCH_CHAMELEON:
        {
            tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

            // output
            output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
            output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);

            // if output is NULL, init from the input tok embed
            if (output == NULL) {
                output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
            }

            for (int i = 0; i < n_layer; ++i) {
                auto & layer = layers[i];

                layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
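                // Chameleon applies Q/K normalization per attention head, so the norm
                // weights (and optional biases) are 2-D: {head dim, number of heads}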
                layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k, n_head}, 0);
                layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k, n_head_kv}, 0);
                layer.attn_q_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {n_embd_head_k, n_head}, TENSOR_NOT_REQUIRED);
                layer.attn_k_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {n_embd_head_k, n_head_kv}, TENSOR_NOT_REQUIRED);

                layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
                layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
                layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
                layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);

                layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
                layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
                layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
                layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
            }
        } break;
    case LLM_ARCH_WAVTOKENIZER_DEC:
        {
            tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {hparams.n_embd_features, n_vocab}, 0);

            conv1d = create_tensor(tn(LLM_TENSOR_CONV1D, "weight"), {7, hparams.n_embd_features, hparams.posnet.n_embd}, 0);
            conv1d_b = create_tensor(tn(LLM_TENSOR_CONV1D, "bias"), {1, hparams.posnet.n_embd}, 0);

            // posnet
            {
                const int64_t n_embd = hparams.posnet.n_embd;

                for (uint32_t i = 0; i < hparams.posnet.n_layer; ++i) {
                    auto & layer = layers[i].posnet;

                    // posnet:
                    //
                    // - resnet
                    // - resnet
                    // - attn
                    // - resnet
                    // - resnet
                    // - norm
                    //
                    switch (i) {
                        case 0:
                        case 1:
                        case 3:
                        case 4:
                            {
                                layer.norm1 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "weight", i), {1, n_embd}, 0);
                                layer.norm1_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "bias", i), {1, n_embd}, 0);
                                layer.conv1 = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "weight", i), {3, n_embd, n_embd}, 0);
                                layer.conv1_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "bias", i), {1, n_embd}, 0);
                                layer.norm2 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "weight", i), {1, n_embd}, 0);
                                layer.norm2_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "bias", i), {1, n_embd}, 0);
                                layer.conv2 = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "weight", i), {3, n_embd, n_embd}, 0);
                                layer.conv2_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "bias", i), {1, n_embd}, 0);
                            } break;
                        case 2:
                            {
                                layer.attn_norm = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "weight", i), {1, n_embd}, 0);
                                layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "bias", i), {1, n_embd}, 0);
                                layer.attn_q = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_Q, "weight", i), {1, n_embd, n_embd}, 0);
                                layer.attn_q_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_Q, "bias", i), {1, n_embd}, 0);
                                layer.attn_k = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_K, "weight", i), {1, n_embd, n_embd}, 0);
                                layer.attn_k_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_K, "bias", i), {1, n_embd}, 0);
                                layer.attn_v = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_V, "weight", i), {1, n_embd, n_embd}, 0);
                                layer.attn_v_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_V, "bias", i), {1, n_embd}, 0);
                                layer.attn_o = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_OUT, "weight", i), {1, n_embd, n_embd}, 0);
                                layer.attn_o_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_OUT, "bias", i), {1, n_embd}, 0);
                            } break;
                        case 5:
                            {
                                layer.norm = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "weight", i), {1, n_embd}, 0);
                                layer.norm_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "bias", i), {1, n_embd}, 0);
                            } break;
                        default: GGML_ABORT("unknown posnet layer");
                    };
                }
            }

            GGML_ASSERT(hparams.posnet.n_embd == hparams.convnext.n_embd);

            tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {hparams.posnet.n_embd}, 0);
            tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {hparams.posnet.n_embd}, 0);

            // convnext
            {
                const int64_t n_embd = hparams.convnext.n_embd;
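                // each ConvNeXt block: 7-tap depthwise conv (dw), layer norm, pointwise
                // MLP (pw1 expands to n_ff, pw2 projects back), and a learnable gamma scale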
                for (uint32_t i = 0; i < hparams.convnext.n_layer; ++i) {
                    auto & layer = layers[i].convnext;

                    layer.dw = create_tensor(tn(LLM_TENSOR_CONVNEXT_DW, "weight", i), {7, 1, n_embd}, 0);
                    layer.dw_b = create_tensor(tn(LLM_TENSOR_CONVNEXT_DW, "bias", i), {1, n_embd}, 0);
                    layer.norm = create_tensor(tn(LLM_TENSOR_CONVNEXT_NORM, "weight", i), {n_embd}, 0);
                    layer.norm_b = create_tensor(tn(LLM_TENSOR_CONVNEXT_NORM, "bias", i), {n_embd}, 0);
                    layer.pw1 = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW1, "weight", i), {n_embd, n_ff}, 0);
                    layer.pw1_b = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW1, "bias", i), {n_ff}, 0);
                    layer.pw2 = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW2, "weight", i), {n_ff, n_embd}, 0);
                    layer.pw2_b = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW2, "bias", i), {n_embd}, 0);
                    layer.gamma = create_tensor(tn(LLM_TENSOR_CONVNEXT_GAMMA, "weight", i), {n_embd}, 0);
                }

                // output
                output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
                output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
            }

            output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {hparams.convnext.n_embd, n_embd}, 0);
            output_b = create_tensor(tn(LLM_TENSOR_OUTPUT, "bias"), {n_embd}, 0);
        } break;
    case LLM_ARCH_BAILINGMOE:
        {
            const int64_t n_ff_exp = hparams.n_ff_exp;
            const int64_t n_expert_shared = hparams.n_expert_shared;

            tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

            // output
            output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
            output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);

            for (int i = 0; i < n_layer; ++i) {
                auto & layer = layers[i];

                layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);

                layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_head * n_rot}, 0);
                layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_head_kv * n_rot}, 0);
                layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_head_kv * n_rot}, 0);
                layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_head * n_rot, n_embd}, 0);

                layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
                layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);

                if (n_expert == 0) {
                    throw std::runtime_error("n_expert must be > 0");
                }
                if (n_expert_used == 0) {
                    throw std::runtime_error("n_expert_used must be > 0");
                }

                layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
                layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
                layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);

                layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
                layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_exp * n_expert_shared, n_embd}, 0);
                layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
            }
        } break;
    case LLM_ARCH_BAILINGMOE2:
        {
            const int64_t n_ff_exp = hparams.n_ff_exp;
            const int64_t n_expert_shared = hparams.n_expert_shared;

            tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

            // output
            output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
            output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);

            GGML_ASSERT(n_expert > 0 && "n_expert must be > 0 for bailingmoe2");
            GGML_ASSERT(n_expert_used > 0 && "n_expert_used must be > 0 for bailingmoe2");

            for (int i = 0; i < n_layer; ++i) {
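                // the last hparams.nextn_predict_layers layers are NextN/MTP prediction
                // layers: their tensors are registered so the weights are accounted for,
                // but marked TENSOR_SKIP since they are not used in the forward graph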
                int flags = 0;
                if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
                    // skip all tensors in the NextN layers
                    flags |= TENSOR_SKIP;
                }

                auto & layer = layers[i];

                layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, flags);

                layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, flags);
                layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, flags);

                layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, flags);
                layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, flags);

                layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, flags);

                if (static_cast<uint32_t>(i) >= hparams.n_layer_dense_lead) { // MoE layers
                    const int64_t n_ff_shexp = (hparams.n_ff_shexp ? hparams.n_ff_shexp : n_ff_exp) * n_expert_shared;

                    layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, flags);
                    layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, TENSOR_NOT_REQUIRED | flags);

                    layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, flags);
                    layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, flags);
                    layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, flags);

                    layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_shexp}, flags);
                    layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp, n_embd}, flags);
                    layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_shexp}, flags);
                } else { // Dense layers
                    layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, flags);
                    layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, flags);
                    layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, flags);
                }

                // NextN/MTP tensors (preserved but unused) - conditionally load for last nextn_predict_layers
                if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
                    layer.nextn.eh_proj = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "weight", i), { 2 * n_embd, n_embd }, flags);
                    layer.nextn.embed_tokens = create_tensor(tn(LLM_TENSOR_NEXTN_EMBED_TOKENS, "weight", i), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED | flags);
                    layer.nextn.enorm = create_tensor(tn(LLM_TENSOR_NEXTN_ENORM, "weight", i), { n_embd }, flags);
                    layer.nextn.hnorm = create_tensor(tn(LLM_TENSOR_NEXTN_HNORM, "weight", i), { n_embd }, flags);
                    layer.nextn.shared_head_head = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "weight", i), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED | flags);
                    layer.nextn.shared_head_norm = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "weight", i), { n_embd }, TENSOR_NOT_REQUIRED | flags);
                    layer.layer_out_norm = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd}, flags);
                }
            }
        } break;
    case LLM_ARCH_DOTS1:
        {
            const int64_t n_ff_exp = hparams.n_ff_exp;
            const int64_t n_expert_shared = hparams.n_expert_shared;

            tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

            output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
            output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);

            for (int i = 0; i < n_layer; ++i) {
                auto & layer = layers[i];

                layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);

                layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
                layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
                layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
                layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);

                layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
                layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);

                layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);

                if (i < (int) hparams.n_layer_dense_lead) {
                    layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
                    layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
                    layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
                } else {
                    layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
                    layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, TENSOR_NOT_REQUIRED);

                    if (n_expert == 0) {
                        throw std::runtime_error("n_expert must be > 0");
                    }
                    if (n_expert_used == 0) {
                        throw std::runtime_error("n_expert_used must be > 0");
                    }

                    // MoE branch
                    layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
                    layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
                    layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);

                    // Shared expert branch
                    layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
                    layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_exp * n_expert_shared, n_embd}, 0);
                    layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
                }
            }
        } break;
    case LLM_ARCH_ARCEE:
        {
            tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

            // output
            output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
            output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);

            // if output is NULL, init from the input tok embed
            if (output == NULL) {
                output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
            }

            for (int i = 0; i < n_layer; ++i) {
                auto & layer = layers[i];

                layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);

                layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
                layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
                layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
                layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);

                layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
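                // rope_freqs is a single shared tensor; layers after the first mark it
                // TENSOR_DUPLICATED (and not-required) so the same data is referenced
                // without being counted or loaded twice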
                layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));

                layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
                layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
            }
        } break;
    case LLM_ARCH_AFMOE:
        {
            tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

            // output
            output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
            output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);

            // if output is NULL, init from the input tok embed
            if (output == NULL) {
                output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
            }

            const int64_t n_ff_exp = hparams.n_ff_exp;
            const int64_t n_expert_shared = hparams.n_expert_shared;

            for (int i = 0; i < n_layer; ++i) {
                auto & layer = layers[i];

                // dual attention normalization
                layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
                layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);

                // attention projections
                layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
                layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
                layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
                layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);

                // Q/K normalization
                layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
                layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);

                // attention gating
                layer.wqkv_gate = create_tensor(tn(LLM_TENSOR_ATTN_GATE, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);

                // dual ffn normalization
                layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
                layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);

                if (static_cast<uint32_t>(i) >= hparams.n_layer_dense_lead) {
                    // MoE layers
                    layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
                    layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, 0);

                    // grouped expert weights
                    layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, 0);
                    layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
                    layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, 0);

                    // shared expert
                    if (n_expert_shared > 0) {
                        const int64_t n_ff_shexp = n_ff_exp * n_expert_shared;
                        layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_shexp}, 0);
                        layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp, n_embd}, 0);
                        layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_shexp}, 0);
                    }
                } else {
                    // Dense layers
                    layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
                    layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
                    layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
                }
            }
        } break;
    case LLM_ARCH_ERNIE4_5:
    case LLM_ARCH_ERNIE4_5_MOE:
        {
            tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

            // output
            output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
            output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);

            // if output is NULL, init from the input tok embed
            if (output == NULL) {
                output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
            }

            for (int i = 0; i < n_layer; ++i) {
                auto & layer = layers[i];

                layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);

                layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
                layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
                layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
                layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);

                // optional bias tensors
                layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
                layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
                layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
                layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);

                layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);

                if (arch == LLM_ARCH_ERNIE4_5_MOE && static_cast<uint32_t>(i) >= hparams.n_layer_dense_lead) { // MoE layers
                    int n_ff_exp = hparams.n_ff_exp;

                    layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
                    layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, TENSOR_NOT_REQUIRED);
                    layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, TENSOR_NOT_REQUIRED);
                    layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff_exp, n_embd, n_expert}, 0);
                    layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, 0);

                    // Shared expert (if present)
                    if (hparams.n_ff_shexp > 0) {
                        layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), { n_embd, hparams.n_ff_shexp}, 0);
                        layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {hparams.n_ff_shexp, n_embd }, 0);
                        layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), { n_embd, hparams.n_ff_shexp}, 0);
                    }
                } else { // Dense layers
                    layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
                    layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
                    layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
                }
            }
        } break;
    case LLM_ARCH_FALCON_H1:
        {
            // Common
            const int64_t hidden_size = hparams.n_embd; // hidden_size

            // mamba2 Mixer SSM params
            const int64_t ssm_conv_kernel_size = hparams.ssm_d_conv; // ssm_conv_kernel_size
            const int64_t ssm_n_groups = hparams.ssm_n_group; // ssm_n_groups
            const int64_t ssm_state_size = hparams.ssm_d_state; // ssm_state_size
            const int64_t ssm_intermediate_size = hparams.ssm_d_inner; // TODO expand
            const int64_t ssm_num_heads = hparams.ssm_dt_rank; // ssm_num_heads
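            // following the mamba2 layout, the input projection packs {gate, x, B, C, dt}:
            // the conv runs over the concatenated x/B/C streams (d_inner + 2*n_groups*d_state),
            // while the full projection additionally carries the gate (d_inner) and per-head dt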
            const int64_t ssm_conv_dim = ssm_intermediate_size + 2 * ssm_n_groups * ssm_state_size;
            const int64_t ssm_projection_size = ssm_intermediate_size + ssm_conv_dim + ssm_num_heads;

            // attn params
            const int64_t attn_num_attention_head = hparams.n_head(0); // rename to: attn_num_attention_head
            const int64_t attn_num_key_value_head = hparams.n_head_kv(0);

            // ffn params
            const int64_t ffn_intermediate_size = hparams.n_ff(0);

            // embeddings
            tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {hidden_size, n_vocab}, 0);

            // output
            output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {hidden_size, n_vocab}, TENSOR_NOT_REQUIRED);
            output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {hidden_size}, 0);

            // if output is NULL, init from the input tok embed
            if (output == NULL) {
                output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {hidden_size, n_vocab}, TENSOR_DUPLICATED);
            }

            for (int i = 0; i < n_layer; ++i) {
                auto & layer = layers[i];

                /*SSM LAYERS*/
                // ssm in
                layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), {hidden_size, ssm_projection_size}, 0);
                // ssm 1d conv
                layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {ssm_conv_kernel_size, ssm_conv_dim}, 0);
                layer.ssm_conv1d_b = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "bias", i), {ssm_conv_dim}, TENSOR_NOT_REQUIRED);
                // ssm_dt
                layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {ssm_num_heads}, 0);
                // no "weight" suffix for these
                layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {1, ssm_num_heads}, 0);
                layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {1, ssm_num_heads}, 0);
                // ssm_norm
                layer.ssm_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", i), {ssm_intermediate_size / ssm_n_groups, ssm_n_groups}, TENSOR_NOT_REQUIRED);
                // out_proj
                layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {ssm_intermediate_size, hidden_size}, 0);

                /*ATTENTION LAYERS*/
                // attention layers (with optional bias)
                layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {hidden_size, n_embd_head_k * attn_num_attention_head}, 0);
                layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {hidden_size, attn_num_key_value_head * n_embd_head_k}, 0);
                layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {hidden_size, attn_num_key_value_head * n_embd_head_v}, 0);
                layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * attn_num_attention_head, hidden_size}, 0);
                layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {hidden_size}, TENSOR_NOT_REQUIRED);
                layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {attn_num_key_value_head * n_embd_head_k}, TENSOR_NOT_REQUIRED);
                layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {attn_num_key_value_head * n_embd_head_v}, TENSOR_NOT_REQUIRED);
                layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {hidden_size}, TENSOR_NOT_REQUIRED);
                layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {hidden_size}, 0);

                // feed forward (w/ optional biases)
                layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, i), {hidden_size}, 0);
                layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
                layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {hidden_size, ffn_intermediate_size}, 0);
                layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { ffn_intermediate_size, hidden_size}, 0);
                layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {hidden_size, ffn_intermediate_size}, 0);
                layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {ffn_intermediate_size}, TENSOR_NOT_REQUIRED);
                layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {hidden_size}, TENSOR_NOT_REQUIRED);
                layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {ffn_intermediate_size}, TENSOR_NOT_REQUIRED);
            }
        } break;
    case LLM_ARCH_HUNYUAN_MOE:
        {
            tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

            // output
            output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
            output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);

            // if output is NULL, init from the input tok embed
            if (output == NULL) {
                output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
            }

            for (int i = 0; i < n_layer; ++i) {
                auto & layer = layers[i];

                layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);

                layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
                layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
                layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
                layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);

                layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
                layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);

                layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);

                layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
                layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
                layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert}, 0);
                layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);

                layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, 0);
                layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, 0);
                layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {hparams.n_ff_shexp, n_embd}, 0);
            }
        } break;
    case LLM_ARCH_HUNYUAN_DENSE:
        {
            tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

            // output
            output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
            output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);

            // if output is NULL, init from the input tok embed
            if (output == NULL) {
                output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
            }

            for (int i = 0; i < n_layer; ++i) {
                auto & layer = layers[i];

                layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);

                layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
                layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
                layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
                layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);

                layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
                layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);

                layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
                layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
                layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
                layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
            }
        } break;
    case LLM_ARCH_SMOLLM3:
        {
            tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

            // output
            output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
            output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);

            // if output is NULL, init from the input tok embed
            if (output == NULL) {
                output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
            }

            for (int i = 0; i < n_layer; ++i) {
                auto & layer = layers[i];

                layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);

                layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
                layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
                layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
                layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);

                layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
                layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
                layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
                layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
            }
        } break;
    case LLM_ARCH_OPENAI_MOE:
        {
            const int64_t n_ff_exp = hparams.n_ff_exp;

            tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

            // output
            output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
            output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);

            for (int i = 0; i < n_layer; ++i) {
                auto & layer = layers[i];

                layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
                layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);

                layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_head * n_rot}, 0);
                layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_head_kv * n_rot}, 0);
                layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_head_kv * n_rot}, 0);
                layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_head * n_rot, n_embd}, 0);
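                // learned per-head attention-sink logits (gpt-oss style): one scalar per head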
                layer.attn_sinks = create_tensor(tn(LLM_TENSOR_ATTN_SINKS, "weight", i), {n_head}, 0);

                layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), { n_embd, n_expert}, 0);
                layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
                layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
                layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);

                // bias
                layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_head * n_rot}, 0);
                layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_head_kv * n_rot}, 0);
                layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_head_kv * n_rot}, 0);
                layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);

                layer.ffn_gate_inp_b = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "bias", i), {n_expert}, 0);
                layer.ffn_gate_exps_b = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "bias", i), {n_ff_exp, n_expert}, 0);
                layer.ffn_down_exps_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "bias", i), { n_embd, n_expert}, 0);
                layer.ffn_up_exps_b = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "bias", i), {n_ff_exp, n_expert}, 0);
            }
        } break;
    case LLM_ARCH_LFM2:
    case LLM_ARCH_LFM2MOE:
        {
            tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
            tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0);

            output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
            if (output == NULL) {
                output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
            }

            for (int i = 0; i < n_layer; ++i) {
                auto & layer = layers[i];

                const bool is_moe_layer = i >= static_cast<int>(hparams.n_layer_dense_lead);

                // ffn/moe is same for transformer and conv layers
                layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
                if (is_moe_layer) {
                    GGML_ASSERT(n_expert && n_expert_used);
                    layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
                    layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, hparams.n_ff_exp, n_expert}, 0);
                    layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {hparams.n_ff_exp, n_embd, n_expert}, 0);
                    layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, hparams.n_ff_exp, n_expert}, 0);
                    layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, 0);
                } else { // dense
                    layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
                    layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
                    layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
                }

                // for operator_norm
                layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
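                // LFM2 is a hybrid: hparams.is_recurrent(i) selects between a short-conv
                // (recurrent) block and a full grouped-query attention block for this layer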
                if (!hparams.is_recurrent(i)) {
                    layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
                    layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
                    GGML_ASSERT(n_embd_v_gqa == n_embd_k_gqa);

                    layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
                    layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, hparams.n_embd_k_gqa(i)}, 0);
                    layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, hparams.n_embd_v_gqa(i)}, 0);
                    layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
                } else {
                    layer.shortconv.conv = create_tensor(tn(LLM_TENSOR_SHORTCONV_CONV, "weight", i), {hparams.n_shortconv_l_cache, n_embd}, 0);
                    layer.shortconv.in_proj = create_tensor(tn(LLM_TENSOR_SHORTCONV_INPROJ, "weight", i), {n_embd, 3 * n_embd}, 0);
                    layer.shortconv.out_proj = create_tensor(tn(LLM_TENSOR_SHORTCONV_OUTPROJ, "weight", i), {n_embd, n_embd}, 0);
                }
            }
        } break;
    case LLM_ARCH_SMALLTHINKER:
        {
            tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);

            // output
            output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
            output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);

            // if output is NULL, init from the input tok embed
            if (output == NULL) {
                output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
            }

            for (int i = 0; i < n_layer; ++i) {
                auto & layer = layers[i];

                layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);

                layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd_head_k * n_head }, 0);
                layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_gqa }, 0);
                layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_gqa }, 0);
                layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, 0);

                layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd }, 0);

                GGML_ASSERT(n_expert > 0 && "n_expert must be > 0 for SMALLTHINKER");
                GGML_ASSERT(n_expert_used > 0 && "n_expert_used must be > 0 for SMALLTHINKER");

                // MoE branch
                const int64_t n_ff_exp = hparams.n_ff_exp;
                layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), { n_embd, n_expert }, 0);
                layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, 0);
                layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff_exp, n_embd, n_expert }, 0);
                layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, 0);
            }
        } break;
    case LLM_ARCH_GROVEMOE:
        {
            tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

            // output
            output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
            output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);

            // if output is NULL, init from the input tok embed
            if (output == NULL) {
                output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
            }

            GGML_ASSERT(n_expert > 0 && "n_expert must be > 0 for GROVEMOE");
            GGML_ASSERT(n_expert_used > 0 && "n_expert_used must be > 0 for GROVEMOE");
            GGML_ASSERT(hparams.n_group_experts > 0 && "n_group_experts must be > 0 for GROVEMOE");

            for (int i = 0; i < n_layer; ++i) {
                auto & layer = layers[i];

                layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);

                layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
                layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
                layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
                layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);

                layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
                layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);

                layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
                layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);

                // MoE branch
                const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
                const int64_t n_ff_chexp = hparams.n_ff_chexp ? hparams.n_ff_chexp : n_embd_head_k;
                const int64_t n_chunk_expert = n_expert / hparams.n_group_experts;
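                // besides the n_expert routed experts, GroveMoE also loads smaller "chunk"
                // experts (chexps): one per group of n_group_experts routed experts, hence
                // n_expert / n_group_experts of them, each with width n_ff_chexp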
                layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
                layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
                layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
                layer.ffn_gate_chexps = create_tensor(tn(LLM_TENSOR_FFN_GATE_CHEXPS, "weight", i), { n_embd, n_ff_chexp, n_chunk_expert}, 0);
                layer.ffn_down_chexps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_CHEXPS, "weight", i), {n_ff_chexp, n_embd, n_chunk_expert}, 0);
                layer.ffn_up_chexps = create_tensor(tn(LLM_TENSOR_FFN_UP_CHEXPS, "weight", i), { n_embd, n_ff_chexp, n_chunk_expert}, 0);
            }
        } break;
    case LLM_ARCH_APERTUS:
        {
            tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);

            // output
            output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
            output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab }, 0);

            for (int i = 0; i < n_layer; ++i) {
                auto & layer = layers[i];

                layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);

                if (hparams.rope_scaling_type_train == LLAMA_ROPE_SCALING_TYPE_LONGROPE) {
                    layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), { n_rot/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
                    layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), { n_rot/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
                } else {
                    layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), { n_rot/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
                }

                layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd_head_k * n_head }, 0);
                layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_gqa }, 0);
                layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_gqa }, 0);
                layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, 0);

                // optional bias tensors
                layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), { n_embd }, TENSOR_NOT_REQUIRED);
                layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), { n_embd_gqa }, TENSOR_NOT_REQUIRED);
                layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), { n_embd_gqa }, TENSOR_NOT_REQUIRED);
                layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), { n_embd }, TENSOR_NOT_REQUIRED);

                layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd }, 0);
                layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd }, 0);
                layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, n_ff }, 0);

                // Q and K layernorms for Apertus
                layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), { n_embd_head_k }, 0);
                layer.attn_q_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), { n_embd_head_k }, TENSOR_NOT_REQUIRED);
                layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), { n_embd_head_k }, 0);
                layer.attn_k_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), { n_embd_head_k }, TENSOR_NOT_REQUIRED);
            }
        } break;
  5260. case LLM_ARCH_MINIMAX_M2:
  5261. {
  5262. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  5263. // output
  5264. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  5265. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  5266. for (int i = 0; i < n_layer; ++i) {
  5267. auto & layer = layers[i];
  5268. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd_head_k * n_head }, 0);
  5269. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_gqa }, 0);
  5270. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_gqa }, 0);
  5271. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, 0);
  5272. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  5273. layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k * n_head}, 0);
  5274. layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_k_gqa}, 0);
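// note: the Q/K norm weights above span the full projected Q and K widths (all heads), not a single head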
  5275. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  5276. layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
  5277. layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
  5278. layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff, n_embd, n_expert}, 0);
  5279. layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
  5280. layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, 0);
  5281. }
  5282. } break;
  5283. case LLM_ARCH_COGVLM:
  5284. {
  5285. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  5286. // output
  5287. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  5288. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  5289. // if output is NULL, init from the input tok embed
  5290. if (output == NULL) {
  5291. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  5292. }
  5293. for (int i = 0; i < n_layer; ++i) {
  5294. auto & layer = layers[i];
  5295. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  5296. layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd_head_k * n_head * 3}, 0);
  5297. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
  5298. layer.visexp_attn_wqkv = create_tensor(tn(LLM_TENSOR_VISEXP_ATTN_QKV, "weight", i), {n_embd, n_embd_head_k * n_head * 3}, 0);
  5299. layer.visexp_attn_wo = create_tensor(tn(LLM_TENSOR_VISEXP_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
  5300. layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
  5301. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  5302. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  5303. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  5304. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  5305. layer.visexp_ffn_gate = create_tensor(tn(LLM_TENSOR_VISEXP_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  5306. layer.visexp_ffn_down = create_tensor(tn(LLM_TENSOR_VISEXP_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  5307. layer.visexp_ffn_up = create_tensor(tn(LLM_TENSOR_VISEXP_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  5308. }
  5309. } break;
  5310. case LLM_ARCH_PANGU_EMBED:
  5311. {
  5312. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  5313. // output
  5314. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  5315. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  5316. // if output is NULL, init from the input tok embed
  5317. if (output == NULL) {
  5318. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  5319. }
  5320. for (int i = 0; i < n_layer; ++i) {
  5321. auto & layer = layers[i];
  5322. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  5323. // weight tensors
  5324. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
  5325. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  5326. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
  5327. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
  5328. // bias tensors
  5329. layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd_head_k * n_head}, 0);
  5330. layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, 0);
  5331. layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, 0);
  5332. layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
  5333. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  5334. if (hparams.rope_scaling_type_train == LLAMA_ROPE_SCALING_TYPE_LONGROPE) {
  5335. layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
  5336. layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
  5337. } else {
  5338. layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
  5339. }
  5340. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  5341. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  5342. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  5343. }
  5344. } break;
  5345. default:
  5346. throw std::runtime_error("unknown architecture");
  5347. }
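// report tensors that could not be placed in their preferred buffer type and were moved to a fallback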
  5348. if (n_moved_tensors > 0) {
  5349. LLAMA_LOG_DEBUG("%s: tensor '%s' (%s) (and %d others) cannot be used with preferred buffer type %s, using %s instead\n",
  5350. __func__, first_moved_tensor->name, ggml_type_name(first_moved_tensor->type), n_moved_tensors - 1,
  5351. ggml_backend_buft_name(first_moved_from_buft), ggml_backend_buft_name(first_moved_to_buft));
  5352. }
  5353. }
  5354. ml.done_getting_tensors();
  5355. ml.init_mappings(true, use_mlock ? &pimpl->mlock_mmaps : nullptr);
  5356. pimpl->mappings.reserve(ml.mappings.size());
  5357. // create the backend buffers
  5358. std::vector<std::pair<ggml_context *, llama_buf_map>> ctx_buf_maps;
  5359. ctx_buf_maps.reserve(ctx_map.size());
5360. // Ensure we have enough capacity for the maximum number of backend buffers we might create
  5361. const size_t n_max_backend_buffer = ctx_map.size() * ml.files.size();
  5362. pimpl->ctxs_bufs.reserve(n_max_backend_buffer);
  5363. for (auto & [buft, ctx_ptr] : ctx_map) {
  5364. ggml_context * ctx = ctx_ptr.get();
  5365. // skip contexts without tensors
  5366. if (ggml_get_first_tensor(ctx) == nullptr) {
  5367. continue;
  5368. }
  5369. llama_buf_map buf_map;
  5370. buf_map.reserve(n_max_backend_buffer);
  5371. // check if it is possible to use buffer_from_host_ptr with this buffer type
  5372. ggml_backend_dev_t dev = ggml_backend_buft_get_device(buft);
  5373. if (!dev) {
  5374. // FIXME: workaround for CPU backend buft having a NULL device
  5375. dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
  5376. if (!dev) {
  5377. throw std::runtime_error(format("%s: no CPU backend found", __func__));
  5378. }
  5379. }
  5380. ggml_backend_dev_props props;
  5381. ggml_backend_dev_get_props(dev, &props);
  5382. bool buffer_from_host_ptr_supported = props.caps.buffer_from_host_ptr;
  5383. bool is_default_buft = buft == ggml_backend_dev_buffer_type(dev);
  5384. std::vector<ggml_backend_buffer_ptr> bufs;
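// fast path: expose the mmap-ed file region directly as a backend buffer; otherwise allocate a buffer now and fill it with tensor data later (see load_all_data below)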
  5385. if (ml.use_mmap && use_mmap_buffer && buffer_from_host_ptr_supported && is_default_buft) {
  5386. for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
  5387. // only the mmap region containing the tensors in the model is mapped to the backend buffer
  5388. // this is important for metal with apple silicon: if the entire model could be mapped to a metal buffer, then we could just use metal for all layers
  5389. // this allows using partial offloading when the model size exceeds the metal buffer size, but not the RAM size
  5390. void * addr = nullptr;
  5391. size_t first, last; // NOLINT
  5392. ml.get_mapping_range(&first, &last, &addr, idx, ctx);
  5393. if (first >= last) {
  5394. continue;
  5395. }
  5396. const size_t max_size = ggml_get_max_tensor_size(ctx);
  5397. ggml_backend_buffer_t buf = ggml_backend_dev_buffer_from_host_ptr(dev, (char *) addr + first, last - first, max_size);
  5398. if (buf == nullptr) {
  5399. throw std::runtime_error(format("unable to allocate %s buffer", ggml_backend_buft_name(buft)));
  5400. }
  5401. bufs.emplace_back(buf);
  5402. buf_map.emplace(idx, buf);
  5403. }
  5404. }
  5405. else {
  5406. ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
  5407. if (buf == nullptr) {
  5408. throw std::runtime_error(format("unable to allocate %s buffer", ggml_backend_buft_name(buft)));
  5409. }
  5410. if (use_mlock && ggml_backend_buffer_is_host(buf)) {
  5411. pimpl->mlock_bufs.emplace_back(new llama_mlock);
  5412. auto & mlock_buf = pimpl->mlock_bufs.back();
  5413. mlock_buf->init (ggml_backend_buffer_get_base(buf));
  5414. mlock_buf->grow_to(ggml_backend_buffer_get_size(buf));
  5415. }
  5416. bufs.emplace_back(buf);
  5417. for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
  5418. buf_map.emplace(idx, buf);
  5419. }
  5420. }
  5421. pimpl->ctxs_bufs.emplace_back(std::move(ctx_ptr), std::move(bufs));
  5422. for (auto & buf : buf_map) {
  5423. // indicate that this buffer contains weights
  5424. // this is used by ggml_backend_sched to improve op scheduling: ops that use a weight are preferably scheduled to the backend that contains the weight
  5425. ggml_backend_buffer_set_usage(buf.second, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
  5426. }
  5427. ctx_buf_maps.emplace_back(ctx, buf_map);
  5428. }
  5429. if (llama_supports_gpu_offload()) {
  5430. const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
  5431. LLAMA_LOG_INFO("%s: offloading %d repeating layers to GPU\n", __func__, n_gpu);
  5432. if (n_gpu_layers > (int) hparams.n_layer) {
  5433. LLAMA_LOG_INFO("%s: offloading output layer to GPU\n", __func__);
  5434. }
  5435. const int max_backend_supported_layers = hparams.n_layer + 1;
  5436. const int max_offloadable_layers = hparams.n_layer + 1;
  5437. LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n", __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers);
  5438. }
  5439. // print memory requirements per buffer type
  5440. for (auto & [_, bufs] : pimpl->ctxs_bufs) {
  5441. for (auto & buf: bufs) {
  5442. LLAMA_LOG_INFO("%s: %12s model buffer size = %8.2f MiB\n",
  5443. __func__, ggml_backend_buffer_name(buf.get()), ggml_backend_buffer_get_size(buf.get()) / 1024.0 / 1024.0);
  5444. }
  5445. }
  5446. // populate tensors_by_name
  5447. for (auto & [ctx, _] : pimpl->ctxs_bufs) {
  5448. for (auto * cur = ggml_get_first_tensor(ctx.get()); cur != NULL; cur = ggml_get_next_tensor(ctx.get(), cur)) {
  5449. tensors_by_name.emplace_back(ggml_get_name(cur), cur);
  5450. }
  5451. }
  5452. // load tensor data
  5453. for (auto & [ctx, buf_map] : ctx_buf_maps) {
  5454. if (!ml.load_all_data(ctx, buf_map, use_mlock ? &pimpl->mlock_mmaps : NULL, params.progress_callback, params.progress_callback_user_data)) {
  5455. return false;
  5456. }
  5457. }
  5458. if (use_mmap_buffer) {
  5459. for (auto & mapping : ml.mappings) {
  5460. pimpl->mappings.emplace_back(std::move(mapping));
  5461. }
  5462. }
  5463. return true;
  5464. }
  5465. std::string llama_model::arch_name() const {
  5466. return llm_arch_name(arch);
  5467. }
  5468. std::string llama_model::type_name() const {
  5469. return llm_type_name(type);
  5470. }
  5471. std::string llama_model::desc() const {
  5472. return pimpl->desc_str;
  5473. }
  5474. size_t llama_model::size() const {
  5475. return pimpl->n_bytes;
  5476. }
  5477. size_t llama_model::n_tensors() const {
  5478. return tensors_by_name.size();
  5479. }
  5480. size_t llama_model::n_devices() const {
  5481. return devices.size();
  5482. }
  5483. std::map<ggml_backend_buffer_type_t, size_t> llama_model::memory_breakdown() const {
  5484. std::map<ggml_backend_buffer_type_t, size_t> ret;
  5485. for (const auto & [_, bufs] : pimpl->ctxs_bufs) {
  5486. for (const auto & buf : bufs) {
  5487. ret[ggml_backend_buffer_get_type(buf.get())] += ggml_backend_buffer_get_size(buf.get());
  5488. }
  5489. }
  5490. return ret;
  5491. }
  5492. uint64_t llama_model::n_elements() const {
  5493. return pimpl->n_elements;
  5494. }
  5495. void llama_model::print_info() const {
  5496. const std::string rope_scaling_type = llama_rope_scaling_type_name(hparams.rope_scaling_type_train);
  5497. auto print_f = [](const std::function<uint32_t(uint32_t)> & f, uint32_t n) {
  5498. bool is_var = false;
  5499. std::vector<uint32_t> v;
  5500. for (uint32_t i = 0; i < n; ++i) {
  5501. v.push_back(f(i));
  5502. if (v[i] != v[0]) {
  5503. is_var = true;
  5504. }
  5505. }
  5506. std::stringstream ss;
  5507. if (is_var) {
  5508. ss << "[";
  5509. for (uint32_t i = 0; i < n; ++i) {
  5510. ss << v[i];
  5511. if (i < n - 1) {
  5512. ss << ", ";
  5513. }
  5514. }
  5515. ss << "]";
  5516. } else {
  5517. ss << v[0];
  5518. }
  5519. return ss.str();
  5520. };
  5521. // hparams
  5522. LLAMA_LOG_INFO("%s: arch = %s\n", __func__, arch_name().c_str());
  5523. LLAMA_LOG_INFO("%s: vocab_only = %d\n", __func__, hparams.vocab_only);
  5524. if (!hparams.vocab_only) {
  5525. LLAMA_LOG_INFO("%s: n_ctx_train = %u\n", __func__, hparams.n_ctx_train);
  5526. LLAMA_LOG_INFO("%s: n_embd = %u\n", __func__, hparams.n_embd);
  5527. LLAMA_LOG_INFO("%s: n_embd_inp = %u\n", __func__, hparams.n_embd_inp());
  5528. LLAMA_LOG_INFO("%s: n_layer = %u\n", __func__, hparams.n_layer);
  5529. LLAMA_LOG_INFO("%s: n_head = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_head(il); }, hparams.n_layer).c_str());
  5530. LLAMA_LOG_INFO("%s: n_head_kv = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_head_kv(il); }, hparams.n_layer).c_str());
  5531. LLAMA_LOG_INFO("%s: n_rot = %u\n", __func__, hparams.n_rot);
  5532. LLAMA_LOG_INFO("%s: n_swa = %u\n", __func__, hparams.n_swa);
  5533. LLAMA_LOG_INFO("%s: is_swa_any = %u\n", __func__, hparams.is_swa_any());
  5534. LLAMA_LOG_INFO("%s: n_embd_head_k = %u\n", __func__, hparams.n_embd_head_k);
  5535. LLAMA_LOG_INFO("%s: n_embd_head_v = %u\n", __func__, hparams.n_embd_head_v);
  5536. LLAMA_LOG_INFO("%s: n_gqa = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_gqa(il); }, hparams.n_layer).c_str());
  5537. LLAMA_LOG_INFO("%s: n_embd_k_gqa = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_embd_k_gqa(il); }, hparams.n_layer).c_str());
  5538. LLAMA_LOG_INFO("%s: n_embd_v_gqa = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_embd_v_gqa(il); }, hparams.n_layer).c_str());
  5539. LLAMA_LOG_INFO("%s: f_norm_eps = %.1e\n", __func__, hparams.f_norm_eps);
  5540. LLAMA_LOG_INFO("%s: f_norm_rms_eps = %.1e\n", __func__, hparams.f_norm_rms_eps);
  5541. LLAMA_LOG_INFO("%s: f_clamp_kqv = %.1e\n", __func__, hparams.f_clamp_kqv);
  5542. LLAMA_LOG_INFO("%s: f_max_alibi_bias = %.1e\n", __func__, hparams.f_max_alibi_bias);
  5543. LLAMA_LOG_INFO("%s: f_logit_scale = %.1e\n", __func__, hparams.f_logit_scale);
  5544. LLAMA_LOG_INFO("%s: f_attn_scale = %.1e\n", __func__, hparams.f_attention_scale);
  5545. LLAMA_LOG_INFO("%s: n_ff = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_ff(il); }, hparams.n_layer).c_str());
  5546. LLAMA_LOG_INFO("%s: n_expert = %u\n", __func__, hparams.n_expert);
  5547. LLAMA_LOG_INFO("%s: n_expert_used = %u\n", __func__, hparams.n_expert_used);
  5548. LLAMA_LOG_INFO("%s: n_expert_groups = %d\n", __func__, hparams.n_expert_groups);
  5549. LLAMA_LOG_INFO("%s: n_group_used = %d\n", __func__, hparams.n_group_used);
  5550. LLAMA_LOG_INFO("%s: causal attn = %d\n", __func__, hparams.causal_attn);
  5551. LLAMA_LOG_INFO("%s: pooling type = %d\n", __func__, hparams.pooling_type);
  5552. LLAMA_LOG_INFO("%s: rope type = %d\n", __func__, hparams.rope_type);
  5553. LLAMA_LOG_INFO("%s: rope scaling = %s\n", __func__, rope_scaling_type.c_str());
  5554. LLAMA_LOG_INFO("%s: freq_base_train = %.1f\n", __func__, hparams.rope_freq_base_train);
  5555. LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train);
  5556. LLAMA_LOG_INFO("%s: n_ctx_orig_yarn = %u\n", __func__, hparams.n_ctx_orig_yarn);
  5557. LLAMA_LOG_INFO("%s: rope_finetuned = %s\n", __func__, hparams.rope_finetuned ? "yes" : "unknown");
5558. // MRoPE (Multimodal Rotary Position Embedding) sections
  5559. if (const auto & s = hparams.rope_sections; s[0] || s[1] || s[2] || s[3]) {
  5560. LLAMA_LOG_INFO("%s: mrope sections = [%d, %d, %d, %d]\n", __func__, s[0], s[1], s[2], s[3]);
  5561. }
  5562. if (!classifier_labels.empty()) {
  5563. LLAMA_LOG_INFO("%s: n_cls_out = %u\n", __func__, hparams.n_cls_out);
  5564. size_t i = 0;
  5565. for (auto label : classifier_labels) {
  5566. LLAMA_LOG_INFO("%s: cls_label[%2zu] = %s\n", __func__, i++, label.c_str());
  5567. }
  5568. }
  5569. }
  5570. if (arch == LLM_ARCH_MAMBA ||
  5571. arch == LLM_ARCH_MAMBA2 ||
  5572. arch == LLM_ARCH_JAMBA ||
  5573. arch == LLM_ARCH_FALCON_H1 ||
  5574. arch == LLM_ARCH_PLAMO2 ||
  5575. arch == LLM_ARCH_GRANITE_HYBRID ||
  5576. arch == LLM_ARCH_NEMOTRON_H) {
  5577. LLAMA_LOG_INFO("%s: ssm_d_conv = %u\n", __func__, hparams.ssm_d_conv);
  5578. LLAMA_LOG_INFO("%s: ssm_d_inner = %u\n", __func__, hparams.ssm_d_inner);
  5579. LLAMA_LOG_INFO("%s: ssm_d_state = %u\n", __func__, hparams.ssm_d_state);
  5580. LLAMA_LOG_INFO("%s: ssm_dt_rank = %u\n", __func__, hparams.ssm_dt_rank);
  5581. LLAMA_LOG_INFO("%s: ssm_n_group = %u\n", __func__, hparams.ssm_n_group);
  5582. LLAMA_LOG_INFO("%s: ssm_dt_b_c_rms = %d\n", __func__, hparams.ssm_dt_b_c_rms);
  5583. }
  5584. LLAMA_LOG_INFO("%s: model type = %s\n", __func__, type_name().c_str());
  5585. if (pimpl->n_elements >= 1e12) {
  5586. LLAMA_LOG_INFO("%s: model params = %.2f T\n", __func__, pimpl->n_elements*1e-12);
  5587. } else if (pimpl->n_elements >= 1e9) {
  5588. LLAMA_LOG_INFO("%s: model params = %.2f B\n", __func__, pimpl->n_elements*1e-9);
  5589. } else if (pimpl->n_elements >= 1e6) {
  5590. LLAMA_LOG_INFO("%s: model params = %.2f M\n", __func__, pimpl->n_elements*1e-6);
  5591. } else {
  5592. LLAMA_LOG_INFO("%s: model params = %.2f K\n", __func__, pimpl->n_elements*1e-3);
  5593. }
  5594. // general kv
  5595. LLAMA_LOG_INFO("%s: general.name = %s\n", __func__, name.c_str());
  5596. if (arch == LLM_ARCH_DEEPSEEK) {
  5597. LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
  5598. LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
  5599. LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
  5600. LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
  5601. }
  5602. if (arch == LLM_ARCH_DEEPSEEK2) {
  5603. LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
  5604. LLAMA_LOG_INFO("%s: n_lora_q = %d\n", __func__, hparams.n_lora_q);
  5605. LLAMA_LOG_INFO("%s: n_lora_kv = %d\n", __func__, hparams.n_lora_kv);
  5606. LLAMA_LOG_INFO("%s: n_embd_head_k_mla = %d\n", __func__, hparams.n_embd_head_k_mla);
  5607. LLAMA_LOG_INFO("%s: n_embd_head_v_mla = %d\n", __func__, hparams.n_embd_head_v_mla);
  5608. LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
  5609. LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
  5610. LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
  5611. LLAMA_LOG_INFO("%s: expert_weights_norm = %d\n", __func__, hparams.expert_weights_norm);
  5612. LLAMA_LOG_INFO("%s: expert_gating_func = %s\n", __func__, llama_expert_gating_func_name((llama_expert_gating_func_type) hparams.expert_gating_func));
  5613. LLAMA_LOG_INFO("%s: rope_yarn_log_mul = %.4f\n", __func__, hparams.rope_yarn_log_mul);
  5614. }
  5615. if (arch == LLM_ARCH_QWEN2MOE) {
  5616. LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
  5617. LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
  5618. }
  5619. if (arch == LLM_ARCH_QWEN3MOE || arch == LLM_ARCH_OPENAI_MOE || arch == LLM_ARCH_QWEN3VLMOE || arch == LLM_ARCH_RND1) {
  5620. LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
  5621. }
  5622. if (arch == LLM_ARCH_MINICPM ||
  5623. arch == LLM_ARCH_GRANITE ||
  5624. arch == LLM_ARCH_GRANITE_MOE ||
  5625. arch == LLM_ARCH_GRANITE_HYBRID) {
  5626. LLAMA_LOG_INFO("%s: f_embedding_scale = %f\n", __func__, hparams.f_embedding_scale);
  5627. LLAMA_LOG_INFO("%s: f_residual_scale = %f\n", __func__, hparams.f_residual_scale);
  5628. LLAMA_LOG_INFO("%s: f_attention_scale = %f\n", __func__, hparams.f_attention_scale);
  5629. LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
  5630. }
  5631. if (arch == LLM_ARCH_BAILINGMOE) {
  5632. LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
  5633. LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
  5634. LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
  5635. LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
  5636. LLAMA_LOG_INFO("%s: expert_weights_norm = %d\n", __func__, hparams.expert_weights_norm);
  5637. }
  5638. if (arch == LLM_ARCH_BAILINGMOE2) {
  5639. LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
  5640. LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
  5641. LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
  5642. LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
  5643. LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
  5644. LLAMA_LOG_INFO("%s: expert_weights_norm = %d\n", __func__, hparams.expert_weights_norm);
  5645. LLAMA_LOG_INFO("%s: expert_gating_func = %s\n", __func__, llama_expert_gating_func_name((llama_expert_gating_func_type) hparams.expert_gating_func));
  5646. LLAMA_LOG_INFO("%s: nextn_predict_layers = %d\n", __func__, hparams.nextn_predict_layers);
  5647. }
  5648. if (arch == LLM_ARCH_SMALLTHINKER || arch == LLM_ARCH_LFM2MOE) {
  5649. LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
  5650. LLAMA_LOG_INFO("%s: expert_gating_func = %s\n", __func__, llama_expert_gating_func_name((llama_expert_gating_func_type) hparams.expert_gating_func));
  5651. }
  5652. if (arch == LLM_ARCH_GROVEMOE) {
  5653. LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
  5654. LLAMA_LOG_INFO("%s: n_ff_chexp = %d\n", __func__, hparams.n_ff_chexp);
  5655. LLAMA_LOG_INFO("%s: n_group_experts = %d\n", __func__, hparams.n_group_experts);
  5656. LLAMA_LOG_INFO("%s: expert_group_scale = %.2f\n", __func__, hparams.expert_group_scale);
  5657. }
  5658. vocab.print_info();
  5659. }
  5660. ggml_backend_dev_t llama_model::dev_layer(int il) const {
  5661. return pimpl->dev_layer.at(il).dev;
  5662. }
  5663. ggml_backend_dev_t llama_model::dev_output() const {
  5664. return pimpl->dev_output.dev;
  5665. }
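// check whether the device behind buft supports the op produced by fn; tensors are created with no_alloc and given a dummy zero-size buffer so the backend can inspect them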
  5666. template<typename F>
  5667. static bool buft_supported(ggml_backend_buffer_type_t buft, ggml_backend_dev_t dev, F & fn) {
  5668. ggml_init_params params = {
  5669. /*.mem_size =*/ ggml_tensor_overhead()*8,
  5670. /*.mem_buffer =*/ NULL,
  5671. /*.no_alloc =*/ true,
  5672. };
  5673. ggml_context_ptr ctx { ggml_init(params) };
  5674. if (!ctx) {
  5675. throw std::runtime_error(format("failed to create ggml context"));
  5676. }
  5677. ggml_backend_buffer_ptr buf { ggml_backend_buft_alloc_buffer(buft, 0) };
  5678. ggml_tensor * op_tensor = fn(ctx.get());
  5679. for (int i = 0; i < GGML_MAX_SRC; i++) {
  5680. if (op_tensor->src[i] != nullptr) {
  5681. assert(op_tensor->src[i]->buffer == nullptr);
  5682. op_tensor->src[i]->buffer = buf.get();
  5683. }
  5684. }
  5685. bool op_supported = ggml_backend_dev_supports_op(dev, op_tensor);
  5686. return op_supported;
  5687. }
  5688. template<typename F>
  5689. static ggml_backend_buffer_type_t select_buft(const buft_list_t & buft_list, const F & fn) {
  5690. for (const auto & cur : buft_list) {
  5691. ggml_backend_dev_t cur_dev = cur.first;
  5692. ggml_backend_buffer_type_t cur_buft = cur.second;
  5693. if (buft_supported(cur_buft, cur_dev, fn)) {
  5694. return cur_buft;
  5695. }
  5696. }
  5697. throw std::runtime_error(format("no suitable buffer type found"));
  5698. }
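// pick a buffer type for layer il by testing a representative f32 add of size n_embd against each candidate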
  5699. ggml_backend_buffer_type_t llama_model::select_buft(int il) const {
  5700. return ::select_buft(
  5701. *pimpl->dev_layer.at(il).buft_list,
  5702. [&](ggml_context * ctx) {
  5703. ggml_tensor * cur = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hparams.n_embd);
  5704. ggml_tensor * layer_dir = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hparams.n_embd);
  5705. return ggml_add(ctx, cur, layer_dir);
  5706. });
  5707. }
  5708. bool llama_model::has_tensor_overrides() const {
  5709. return pimpl->has_tensor_overrides;
  5710. }
  5711. const ggml_tensor * llama_model::get_tensor(const char * name) const {
  5712. auto it = std::find_if(tensors_by_name.begin(), tensors_by_name.end(),
  5713. [name](const std::pair<std::string, ggml_tensor *> & it) {
  5714. return it.first == name;
  5715. });
  5716. if (it == tensors_by_name.end()) {
  5717. return nullptr;
  5718. }
  5719. return it->second;
  5720. }
  5721. float llama_model::get_rope_freq_base (const llama_cparams & cparams, int il) const {
  5722. return hparams.is_swa(il) ? hparams.rope_freq_base_train_swa : cparams.rope_freq_base;
  5723. }
  5724. float llama_model::get_rope_freq_scale(const llama_cparams & cparams, int il) const {
  5725. return hparams.is_swa(il) ? hparams.rope_freq_scale_train_swa : cparams.rope_freq_scale;
  5726. }
  5727. ggml_tensor * llama_model::get_rope_factors(const llama_cparams & cparams, int il) const {
  5728. const uint32_t n_ctx_seq = cparams.n_ctx_seq;
  5729. // choose long/short freq factors based on the context size
  5730. if (layers[il].rope_freqs != nullptr) {
  5731. return layers[il].rope_freqs;
  5732. }
  5733. if (n_ctx_seq > hparams.n_ctx_orig_yarn) {
  5734. return layers[il].rope_long;
  5735. }
  5736. return layers[il].rope_short;
  5737. }
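// pick the memory implementation for this architecture: none (e.g. encoder/embedding and diffusion models), recurrent state, hybrid (attention + recurrent), iSWA KV cache, or a plain KV cache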
  5738. llama_memory_i * llama_model::create_memory(const llama_memory_params & params, const llama_cparams & cparams) const {
  5739. llama_memory_i * res;
  5740. switch (arch) {
  5741. // Models that need specific instantiation should be handled in the
  5742. // switch statement
  5743. case LLM_ARCH_BERT:
  5744. case LLM_ARCH_JINA_BERT_V2:
  5745. case LLM_ARCH_JINA_BERT_V3:
  5746. case LLM_ARCH_NOMIC_BERT:
  5747. case LLM_ARCH_NOMIC_BERT_MOE:
  5748. case LLM_ARCH_NEO_BERT:
  5749. case LLM_ARCH_WAVTOKENIZER_DEC:
  5750. case LLM_ARCH_GEMMA_EMBEDDING:
  5751. case LLM_ARCH_DREAM:
  5752. case LLM_ARCH_LLADA:
  5753. case LLM_ARCH_LLADA_MOE:
  5754. case LLM_ARCH_RND1:
  5755. {
  5756. res = nullptr;
  5757. } break;
  5758. // Models that need standard caching should rely on recurrent/hybrid
  5759. // checks
  5760. default:
  5761. {
  5762. if (llm_arch_is_recurrent(arch)) {
  5763. res = new llama_memory_recurrent(
  5764. *this,
  5765. GGML_TYPE_F32,
  5766. GGML_TYPE_F32,
  5767. cparams.offload_kqv,
  5768. std::max((uint32_t) 1, cparams.n_seq_max),
  5769. cparams.n_seq_max,
  5770. nullptr);
  5771. } else if (llm_arch_is_hybrid(arch)) {
  5772. // The main difference between hybrid architectures is the
  5773. // layer filters, so pick the right one here
  5774. llama_memory_hybrid::layer_filter_cb filter_attn = nullptr;
  5775. llama_memory_hybrid::layer_filter_cb filter_recr = nullptr;
  5776. if (arch == LLM_ARCH_FALCON_H1) {
  5777. filter_attn = [&](int32_t) { return true; };
  5778. filter_recr = [&](int32_t) { return true; };
  5779. } else if (arch == LLM_ARCH_NEMOTRON_H) {
  5780. filter_attn = [&](int32_t il) {
  5781. return !hparams.is_recurrent(il) && hparams.n_ff(il) == 0;
  5782. };
  5783. filter_recr = [&](int32_t il) {
  5784. return hparams.is_recurrent(il) && hparams.n_ff(il) == 0;
  5785. };
  5786. }
  5787. res = new llama_memory_hybrid(
  5788. /* model */ *this,
  5789. /* attn_type_k */ params.type_k,
  5790. /* attn_type_v */ params.type_v,
  5791. /* attn_v_trans */ !cparams.flash_attn,
  5792. /* attn_kv_size */ cparams.n_ctx,
  5793. /* attn_n_pad */ 1,
  5794. /* attn_n_swa */ hparams.n_swa,
  5795. /* attn_swa_type */ hparams.swa_type,
  5796. /* recurrent_type_k */ GGML_TYPE_F32,
  5797. /* recurrent_type_v */ GGML_TYPE_F32,
  5798. /* recurrent_kv_size */ std::max((uint32_t) 1, cparams.n_seq_max),
  5799. /* n_seq_max */ cparams.n_seq_max,
  5800. /* offload */ cparams.offload_kqv,
  5801. /* unified */ cparams.kv_unified,
  5802. /* filter_attn */ std::move(filter_attn),
  5803. /* filter_recr */ std::move(filter_recr));
  5804. } else {
  5805. llama_memory_i::layer_reuse_cb reuse = nullptr;
  5806. if (arch == LLM_ARCH_GEMMA3N) {
  5807. reuse = [&](int32_t il) {
  5808. if (il >= (int32_t) hparams.n_layer_kv_from_start) {
  5809. return (int32_t) hparams.n_layer_kv_from_start - (hparams.is_swa(il) ? 2 : 1);
  5810. }
  5811. return -1;
  5812. };
  5813. }
  5814. if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
  5815. GGML_ASSERT(hparams.is_swa_any());
  5816. res = new llama_kv_cache_iswa(
  5817. *this,
  5818. params.type_k,
  5819. params.type_v,
  5820. !cparams.flash_attn,
  5821. cparams.offload_kqv,
  5822. params.swa_full,
  5823. cparams.kv_unified,
  5824. cparams.n_ctx_seq,
  5825. cparams.n_seq_max,
  5826. cparams.n_ubatch,
  5827. 1,
  5828. nullptr,
  5829. reuse);
  5830. } else {
  5831. GGML_ASSERT(!hparams.is_swa_any());
  5832. res = new llama_kv_cache(
  5833. *this,
  5834. params.type_k,
  5835. params.type_v,
  5836. !cparams.flash_attn,
  5837. cparams.offload_kqv,
  5838. cparams.kv_unified,
  5839. cparams.n_ctx_seq,
  5840. cparams.n_seq_max,
  5841. 1,
  5842. hparams.n_swa,
  5843. hparams.swa_type,
  5844. nullptr,
  5845. nullptr);
  5846. }
  5847. }
  5848. }
  5849. }
  5850. return res;
  5851. }
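// construct the compute graph by dispatching to the llm_build_* implementation that matches the architecture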
  5852. ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
  5853. std::unique_ptr<llm_graph_context> llm;
  5854. switch (arch) {
  5855. case LLM_ARCH_LLAMA:
  5856. {
  5857. llm = std::make_unique<llm_build_llama>(*this, params);
  5858. } break;
  5859. case LLM_ARCH_LLAMA4:
  5860. {
  5861. if (hparams.swa_type == LLAMA_SWA_TYPE_NONE) {
  5862. llm = std::make_unique<llm_build_llama>(*this, params);
  5863. } else {
  5864. llm = std::make_unique<llm_build_llama_iswa>(*this, params);
  5865. }
  5866. } break;
  5867. case LLM_ARCH_DECI:
  5868. {
  5869. llm = std::make_unique<llm_build_deci>(*this, params);
  5870. } break;
  5871. case LLM_ARCH_BAICHUAN:
  5872. {
  5873. llm = std::make_unique<llm_build_baichuan>(*this, params);
  5874. } break;
  5875. case LLM_ARCH_FALCON:
  5876. {
  5877. llm = std::make_unique<llm_build_falcon>(*this, params);
  5878. } break;
  5879. case LLM_ARCH_GROK:
  5880. {
  5881. llm = std::make_unique<llm_build_grok>(*this, params);
  5882. } break;
  5883. case LLM_ARCH_STARCODER:
  5884. {
  5885. llm = std::make_unique<llm_build_starcoder>(*this, params);
  5886. } break;
  5887. case LLM_ARCH_REFACT:
  5888. {
  5889. llm = std::make_unique<llm_build_refact>(*this, params);
  5890. } break;
  5891. case LLM_ARCH_BERT:
  5892. case LLM_ARCH_JINA_BERT_V2:
  5893. case LLM_ARCH_JINA_BERT_V3:
  5894. case LLM_ARCH_NOMIC_BERT:
  5895. case LLM_ARCH_NOMIC_BERT_MOE:
  5896. {
  5897. llm = std::make_unique<llm_build_bert>(*this, params);
  5898. } break;
  5899. case LLM_ARCH_NEO_BERT:
  5900. {
  5901. llm = std::make_unique<llm_build_neo_bert>(*this, params);
  5902. } break;
  5903. case LLM_ARCH_BLOOM:
  5904. {
  5905. llm = std::make_unique<llm_build_bloom>(*this, params);
  5906. } break;
  5907. case LLM_ARCH_MPT:
  5908. {
  5909. llm = std::make_unique<llm_build_mpt>(*this, params);
  5910. } break;
  5911. case LLM_ARCH_STABLELM:
  5912. {
  5913. llm = std::make_unique<llm_build_stablelm>(*this, params);
  5914. } break;
  5915. case LLM_ARCH_QWEN:
  5916. {
  5917. llm = std::make_unique<llm_build_qwen>(*this, params);
  5918. } break;
  5919. case LLM_ARCH_QWEN2:
  5920. {
  5921. llm = std::make_unique<llm_build_qwen2>(*this, params);
  5922. } break;
  5923. case LLM_ARCH_DREAM:
  5924. {
  5925. llm = std::make_unique<llm_build_dream>(*this, params);
  5926. }
  5927. break;
  5928. case LLM_ARCH_LLADA:
  5929. {
  5930. llm = std::make_unique<llm_build_llada>(*this, params);
  5931. }
  5932. break;
  5933. case LLM_ARCH_LLADA_MOE:
  5934. {
  5935. llm = std::make_unique<llm_build_llada_moe>(*this, params);
  5936. }
  5937. break;
  5938. case LLM_ARCH_RND1:
  5939. {
  5940. llm = std::make_unique<llm_build_rnd1>(*this, params);
  5941. }
  5942. break;
  5943. case LLM_ARCH_QWEN2VL:
  5944. {
  5945. llm = std::make_unique<llm_build_qwen2vl>(*this, params);
  5946. } break;
  5947. case LLM_ARCH_QWEN2MOE:
  5948. {
  5949. llm = std::make_unique<llm_build_qwen2moe>(*this, params);
  5950. } break;
  5951. case LLM_ARCH_QWEN3:
  5952. {
  5953. llm = std::make_unique<llm_build_qwen3>(*this, params);
  5954. } break;
  5955. case LLM_ARCH_QWEN3MOE:
  5956. {
  5957. llm = std::make_unique<llm_build_qwen3moe>(*this, params);
  5958. } break;
  5959. case LLM_ARCH_QWEN3VL:
  5960. {
  5961. llm = std::make_unique<llm_build_qwen3vl>(*this, params);
  5962. } break;
  5963. case LLM_ARCH_QWEN3VLMOE:
  5964. {
  5965. llm = std::make_unique<llm_build_qwen3vlmoe>(*this, params);
  5966. } break;
  5967. case LLM_ARCH_PHI2:
  5968. {
  5969. llm = std::make_unique<llm_build_phi2>(*this, params);
  5970. } break;
  5971. case LLM_ARCH_PHI3:
  5972. case LLM_ARCH_PHIMOE:
  5973. {
  5974. if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
  5975. llm = std::make_unique<llm_build_phi3<true>> (*this, params);
  5976. } else {
  5977. llm = std::make_unique<llm_build_phi3<false>>(*this, params);
  5978. }
  5979. } break;
  5980. case LLM_ARCH_PLAMO:
  5981. {
  5982. llm = std::make_unique<llm_build_plamo>(*this, params);
  5983. } break;
  5984. case LLM_ARCH_PLAMO2:
  5985. {
  5986. llm = std::make_unique<llm_build_plamo2>(*this, params);
  5987. } break;
  5988. case LLM_ARCH_GPT2:
  5989. {
  5990. llm = std::make_unique<llm_build_gpt2>(*this, params);
  5991. } break;
  5992. case LLM_ARCH_CODESHELL:
  5993. {
  5994. llm = std::make_unique<llm_build_codeshell>(*this, params);
  5995. } break;
  5996. case LLM_ARCH_ORION:
  5997. {
  5998. llm = std::make_unique<llm_build_orion>(*this, params);
  5999. } break;
  6000. case LLM_ARCH_INTERNLM2:
  6001. {
  6002. llm = std::make_unique<llm_build_internlm2>(*this, params);
  6003. } break;
  6004. case LLM_ARCH_MINICPM3:
  6005. {
  6006. llm = std::make_unique<llm_build_minicpm3>(*this, params);
  6007. } break;
  6008. case LLM_ARCH_GEMMA:
  6009. {
  6010. llm = std::make_unique<llm_build_gemma>(*this, params);
  6011. } break;
  6012. case LLM_ARCH_GEMMA2:
  6013. {
  6014. llm = std::make_unique<llm_build_gemma2_iswa>(*this, params);
  6015. } break;
  6016. case LLM_ARCH_GEMMA3:
  6017. {
  6018. llm = std::make_unique<llm_build_gemma3_iswa>(*this, params);
  6019. } break;
  6020. case LLM_ARCH_GEMMA3N:
  6021. {
  6022. llm = std::make_unique<llm_build_gemma3n_iswa>(*this, params);
  6023. } break;
  6024. case LLM_ARCH_GEMMA_EMBEDDING:
  6025. {
  6026. llm = std::make_unique<llm_build_gemma_embedding>(*this, params);
  6027. } break;
  6028. case LLM_ARCH_STARCODER2:
  6029. {
  6030. llm = std::make_unique<llm_build_starcoder2>(*this, params);
  6031. } break;
  6032. case LLM_ARCH_MAMBA:
  6033. case LLM_ARCH_MAMBA2:
  6034. {
  6035. llm = std::make_unique<llm_build_mamba>(*this, params);
  6036. } break;
  6037. case LLM_ARCH_JAMBA:
  6038. {
  6039. llm = std::make_unique<llm_build_jamba>(*this, params);
  6040. } break;
  6041. case LLM_ARCH_XVERSE:
  6042. {
  6043. llm = std::make_unique<llm_build_xverse>(*this, params);
  6044. } break;
  6045. case LLM_ARCH_COMMAND_R:
  6046. {
  6047. llm = std::make_unique<llm_build_command_r>(*this, params);
  6048. } break;
  6049. case LLM_ARCH_COHERE2:
  6050. {
  6051. llm = std::make_unique<llm_build_cohere2_iswa>(*this, params);
  6052. } break;
  6053. case LLM_ARCH_DBRX:
  6054. {
  6055. llm = std::make_unique<llm_build_dbrx>(*this, params);
  6056. } break;
  6057. case LLM_ARCH_OLMO:
  6058. {
  6059. llm = std::make_unique<llm_build_olmo>(*this, params);
  6060. } break;
  6061. case LLM_ARCH_OLMO2:
  6062. {
  6063. if (hparams.swa_type == LLAMA_SWA_TYPE_STANDARD) {
  6064. llm = std::make_unique<llm_build_olmo2<true>>(*this, params);
  6065. } else {
  6066. llm = std::make_unique<llm_build_olmo2<false>>(*this, params);
  6067. }
  6068. } break;
  6069. case LLM_ARCH_OLMOE:
  6070. {
  6071. llm = std::make_unique<llm_build_olmoe>(*this, params);
  6072. } break;
  6073. case LLM_ARCH_OPENELM:
  6074. {
  6075. llm = std::make_unique<llm_build_openelm>(*this, params);
  6076. } break;
  6077. case LLM_ARCH_GPTNEOX:
  6078. {
  6079. llm = std::make_unique<llm_build_gptneox>(*this, params);
  6080. } break;
  6081. case LLM_ARCH_ARCTIC:
  6082. {
  6083. llm = std::make_unique<llm_build_arctic>(*this, params);
  6084. } break;
  6085. case LLM_ARCH_DEEPSEEK:
  6086. {
  6087. llm = std::make_unique<llm_build_deepseek>(*this, params);
  6088. } break;
  6089. case LLM_ARCH_DEEPSEEK2:
  6090. {
  6091. llm = std::make_unique<llm_build_deepseek2>(*this, params);
  6092. } break;
  6093. case LLM_ARCH_CHATGLM:
  6094. {
  6095. llm = std::make_unique<llm_build_chatglm>(*this, params);
  6096. } break;
  6097. case LLM_ARCH_GLM4:
  6098. {
  6099. llm = std::make_unique<llm_build_glm4>(*this, params);
  6100. } break;
  6101. case LLM_ARCH_GLM4_MOE:
  6102. {
  6103. llm = std::make_unique<llm_build_glm4_moe>(*this, params);
  6104. } break;
  6105. case LLM_ARCH_BITNET:
  6106. {
  6107. llm = std::make_unique<llm_build_bitnet>(*this, params);
  6108. } break;
  6109. case LLM_ARCH_T5:
  6110. {
  6111. switch (params.gtype) {
  6112. case LLM_GRAPH_TYPE_ENCODER:
  6113. llm = std::make_unique<llm_build_t5_enc>(*this, params);
  6114. break;
  6115. case LLM_GRAPH_TYPE_DEFAULT:
  6116. case LLM_GRAPH_TYPE_DECODER:
  6117. llm = std::make_unique<llm_build_t5_dec>(*this, params);
  6118. break;
  6119. default:
  6120. GGML_ABORT("invalid graph type");
  6121. };
  6122. } break;
  6123. case LLM_ARCH_T5ENCODER:
  6124. {
  6125. llm = std::make_unique<llm_build_t5_enc>(*this, params);
  6126. }
  6127. break;
  6128. case LLM_ARCH_JAIS:
  6129. {
  6130. llm = std::make_unique<llm_build_jais>(*this, params);
  6131. } break;
  6132. case LLM_ARCH_NEMOTRON:
  6133. {
  6134. llm = std::make_unique<llm_build_nemotron>(*this, params);
  6135. } break;
  6136. case LLM_ARCH_NEMOTRON_H:
  6137. {
  6138. llm = std::make_unique<llm_build_nemotron_h>(*this, params);
  6139. } break;
  6140. case LLM_ARCH_EXAONE:
  6141. {
  6142. llm = std::make_unique<llm_build_exaone>(*this, params);
  6143. } break;
  6144. case LLM_ARCH_EXAONE4:
  6145. {
  6146. if (hparams.swa_type == LLAMA_SWA_TYPE_STANDARD) {
  6147. llm = std::make_unique<llm_build_exaone4<true>>(*this, params);
  6148. } else {
  6149. llm = std::make_unique<llm_build_exaone4<false>>(*this, params);
  6150. }
  6151. } break;
  6152. case LLM_ARCH_RWKV6:
  6153. {
  6154. llm = std::make_unique<llm_build_rwkv6>(*this, params);
  6155. } break;
  6156. case LLM_ARCH_RWKV6QWEN2:
  6157. {
  6158. llm = std::make_unique<llm_build_rwkv6qwen2>(*this, params);
  6159. } break;
  6160. case LLM_ARCH_RWKV7:
  6161. {
  6162. llm = std::make_unique<llm_build_rwkv7>(*this, params);
  6163. } break;
  6164. case LLM_ARCH_ARWKV7:
  6165. {
  6166. llm = std::make_unique<llm_build_arwkv7>(*this, params);
  6167. } break;
  6168. case LLM_ARCH_GRANITE:
  6169. case LLM_ARCH_GRANITE_MOE:
  6170. case LLM_ARCH_MINICPM:
  6171. {
  6172. llm = std::make_unique<llm_build_granite>(*this, params);
  6173. } break;
  6174. case LLM_ARCH_GRANITE_HYBRID:
  6175. {
  6176. llm = std::make_unique<llm_build_granite_hybrid>(*this, params);
  6177. } break;
  6178. case LLM_ARCH_CHAMELEON:
  6179. {
  6180. llm = std::make_unique<llm_build_chameleon>(*this, params);
  6181. } break;
  6182. case LLM_ARCH_WAVTOKENIZER_DEC:
  6183. {
  6184. llm = std::make_unique<llm_build_wavtokenizer_dec>(*this, params);
  6185. } break;
  6186. case LLM_ARCH_PLM:
  6187. {
  6188. llm = std::make_unique<llm_build_plm>(*this, params);
  6189. } break;
  6190. case LLM_ARCH_BAILINGMOE:
  6191. {
  6192. llm = std::make_unique<llm_build_bailingmoe>(*this, params);
  6193. } break;
  6194. case LLM_ARCH_BAILINGMOE2:
  6195. {
  6196. llm = std::make_unique<llm_build_bailingmoe2>(*this, params);
  6197. } break;
  6198. case LLM_ARCH_SEED_OSS:
  6199. {
  6200. llm = std::make_unique<llm_build_seed_oss>(*this, params);
  6201. } break;
  6202. case LLM_ARCH_DOTS1:
  6203. {
  6204. llm = std::make_unique<llm_build_dots1>(*this, params);
  6205. } break;
  6206. case LLM_ARCH_ARCEE:
  6207. {
  6208. llm = std::make_unique<llm_build_arcee>(*this, params);
  6209. } break;
  6210. case LLM_ARCH_AFMOE:
  6211. {
  6212. llm = std::make_unique<llm_build_afmoe>(*this, params);
  6213. } break;
  6214. case LLM_ARCH_ERNIE4_5:
  6215. {
  6216. llm = std::make_unique<llm_build_ernie4_5>(*this, params);
  6217. } break;
  6218. case LLM_ARCH_ERNIE4_5_MOE:
  6219. {
  6220. llm = std::make_unique<llm_build_ernie4_5_moe>(*this, params);
  6221. } break;
  6222. case LLM_ARCH_HUNYUAN_MOE:
  6223. {
  6224. llm = std::make_unique<llm_build_hunyuan_moe>(*this, params);
  6225. } break;
  6226. case LLM_ARCH_HUNYUAN_DENSE:
  6227. {
  6228. llm = std::make_unique<llm_build_hunyuan_dense>(*this, params);
  6229. } break;
  6230. case LLM_ARCH_SMOLLM3:
  6231. {
  6232. llm = std::make_unique<llm_build_smollm3>(*this, params);
  6233. } break;
  6234. case LLM_ARCH_OPENAI_MOE:
  6235. {
  6236. llm = std::make_unique<llm_build_openai_moe_iswa>(*this, params);
  6237. } break;
  6238. case LLM_ARCH_FALCON_H1:
  6239. {
  6240. llm = std::make_unique<llm_build_falcon_h1>(*this, params);
  6241. } break;
  6242. case LLM_ARCH_LFM2:
  6243. case LLM_ARCH_LFM2MOE:
  6244. {
  6245. llm = std::make_unique<llm_build_lfm2>(*this, params);
  6246. } break;
  6247. case LLM_ARCH_SMALLTHINKER:
  6248. {
  6249. if (hparams.swa_type == LLAMA_SWA_TYPE_STANDARD) {
  6250. llm = std::make_unique<llm_build_smallthinker<true>> (*this, params);
  6251. } else {
  6252. llm = std::make_unique<llm_build_smallthinker<false>>(*this, params);
  6253. }
  6254. } break;
  6255. case LLM_ARCH_GROVEMOE:
  6256. {
  6257. llm = std::make_unique<llm_build_grovemoe>(*this, params);
  6258. } break;
  6259. case LLM_ARCH_APERTUS:
  6260. {
  6261. llm = std::make_unique<llm_build_apertus>(*this, params);
  6262. } break;
  6263. case LLM_ARCH_MINIMAX_M2:
  6264. {
  6265. llm = std::make_unique<llm_build_minimax_m2>(*this, params);
  6266. } break;
  6267. case LLM_ARCH_COGVLM:
  6268. {
  6269. llm = std::make_unique<llm_build_cogvlm>(*this, params);
  6270. } break;
  6271. case LLM_ARCH_PANGU_EMBED:
  6272. {
  6273. llm = std::make_unique<llm_build_pangu_embedded>(*this, params);
6274. } break;
  6275. default:
  6276. GGML_ABORT("fatal error");
  6277. }
  6278. // add on pooling layer
  6279. llm->build_pooling(cls, cls_b, cls_out, cls_out_b);
  6280. // if the gguf model was converted with --sentence-transformers-dense-modules
  6281. // there will be two additional dense projection layers
  6282. // dense linear projections are applied after pooling
  6283. // TODO: move reranking logic here and generalize
  6284. llm->build_dense_out(dense_2_out_layers, dense_3_out_layers);
  6285. return llm->res->get_gf();
  6286. }
  6287. //
  6288. // interface implementation
  6289. //
  6290. llama_model_params llama_model_default_params() {
  6291. llama_model_params result = {
  6292. /*.devices =*/ nullptr,
  6293. /*.tensor_buft_overrides =*/ nullptr,
  6294. /*.n_gpu_layers =*/ 999,
  6295. /*.split_mode =*/ LLAMA_SPLIT_MODE_LAYER,
  6296. /*.main_gpu =*/ 0,
  6297. /*.tensor_split =*/ nullptr,
  6298. /*.progress_callback =*/ nullptr,
  6299. /*.progress_callback_user_data =*/ nullptr,
  6300. /*.kv_overrides =*/ nullptr,
  6301. /*.vocab_only =*/ false,
  6302. /*.use_mmap =*/ true,
  6303. /*.use_mlock =*/ false,
  6304. /*.check_tensors =*/ false,
  6305. /*.use_extra_bufts =*/ true,
  6306. /*.no_host =*/ false,
  6307. };
  6308. return result;
  6309. }
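// Usage sketch (illustrative only, not called from this file; the model path is a placeholder):
//
//   llama_model_params mparams = llama_model_default_params();
//   mparams.n_gpu_layers = 0; // e.g. force CPU-only
//   llama_model * model = llama_model_load_from_file("/path/to/model.gguf", mparams);
//   if (model != NULL) {
//       // ... use the model ...
//       llama_model_free(model);
//   }
//
// llama_model_load_from_file() is the loader declared in llama.h.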
  6310. const llama_vocab * llama_model_get_vocab(const llama_model * model) {
  6311. return &model->vocab;
  6312. }
  6313. void llama_free_model(llama_model * model) {
  6314. llama_model_free(model);
  6315. }
  6316. void llama_model_free(llama_model * model) {
  6317. delete model;
  6318. }
  6319. int32_t llama_model_n_ctx_train(const llama_model * model) {
  6320. return model->hparams.n_ctx_train;
  6321. }
  6322. int32_t llama_model_n_embd(const llama_model * model) {
  6323. return model->hparams.n_embd;
  6324. }
  6325. int32_t llama_model_n_embd_inp(const llama_model * model) {
  6326. return model->hparams.n_embd_inp();
  6327. }
  6328. int32_t llama_model_n_layer(const llama_model * model) {
  6329. return model->hparams.n_layer;
  6330. }
  6331. int32_t llama_model_n_head(const llama_model * model) {
  6332. return model->hparams.n_head();
  6333. }
  6334. int32_t llama_model_n_head_kv(const llama_model * model) {
  6335. return model->hparams.n_head_kv();
  6336. }
  6337. int32_t llama_model_n_swa(const llama_model * model) {
  6338. return model->hparams.n_swa;
  6339. }
  6340. uint32_t llama_model_n_cls_out(const struct llama_model * model) {
  6341. return model->hparams.n_cls_out;
  6342. }
  6343. const char * llama_model_cls_label(const struct llama_model * model, uint32_t i) {
  6344. if (i < model->classifier_labels.size()) {
  6345. return model->classifier_labels[i].c_str();
  6346. }
  6347. return nullptr;
  6348. }
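// Usage sketch (illustrative only): enumerate the classifier labels of a loaded model, if any:
//
//   const uint32_t n_cls = llama_model_n_cls_out(model);
//   for (uint32_t i = 0; i < n_cls; i++) {
//       const char * label = llama_model_cls_label(model, i); // NULL if i is out of range
//   }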
  6349. // deprecated
  6350. int32_t llama_n_ctx_train(const llama_model * model) {
  6351. return llama_model_n_ctx_train(model);
  6352. }
  6353. // deprecated
  6354. int32_t llama_n_embd(const llama_model * model) {
  6355. return llama_model_n_embd(model);
  6356. }
  6357. // deprecated
  6358. int32_t llama_n_layer(const llama_model * model) {
  6359. return llama_model_n_layer(model);
  6360. }
  6361. // deprecated
  6362. int32_t llama_n_head(const llama_model * model) {
  6363. return llama_model_n_head(model);
  6364. }
  6365. llama_rope_type llama_model_rope_type(const llama_model * model) {
  6366. switch (model->arch) {
  6367. // these models do not use RoPE
  6368. case LLM_ARCH_CLIP:
  6369. case LLM_ARCH_GPT2:
  6370. case LLM_ARCH_GPTJ:
  6371. case LLM_ARCH_MPT:
  6372. case LLM_ARCH_REFACT:
  6373. case LLM_ARCH_BLOOM:
  6374. case LLM_ARCH_MAMBA:
  6375. case LLM_ARCH_MAMBA2:
  6376. case LLM_ARCH_JAMBA:
  6377. case LLM_ARCH_JINA_BERT_V2:
  6378. case LLM_ARCH_T5:
  6379. case LLM_ARCH_T5ENCODER:
  6380. case LLM_ARCH_JAIS:
  6381. case LLM_ARCH_RWKV6:
  6382. case LLM_ARCH_RWKV6QWEN2:
  6383. case LLM_ARCH_RWKV7:
  6384. case LLM_ARCH_ARWKV7:
  6385. case LLM_ARCH_WAVTOKENIZER_DEC:
  6386. case LLM_ARCH_NEMOTRON_H:
  6387. return LLAMA_ROPE_TYPE_NONE;
  6388. // use what we call a normal RoPE, operating on pairs of consecutive head values
  6389. case LLM_ARCH_LLAMA:
  6390. case LLM_ARCH_LLADA:
  6391. case LLM_ARCH_LLAMA4:
  6392. case LLM_ARCH_DECI:
  6393. case LLM_ARCH_BAICHUAN:
  6394. case LLM_ARCH_STARCODER:
  6395. case LLM_ARCH_INTERNLM2:
  6396. case LLM_ARCH_MINICPM:
  6397. case LLM_ARCH_XVERSE:
  6398. case LLM_ARCH_COMMAND_R:
  6399. case LLM_ARCH_COHERE2:
  6400. case LLM_ARCH_OLMO:
  6401. case LLM_ARCH_ARCTIC:
  6402. case LLM_ARCH_DEEPSEEK:
  6403. case LLM_ARCH_DEEPSEEK2:
  6404. case LLM_ARCH_PLM:
  6405. case LLM_ARCH_CHATGLM:
  6406. case LLM_ARCH_GLM4:
  6407. case LLM_ARCH_GRANITE:
  6408. case LLM_ARCH_GRANITE_MOE:
  6409. case LLM_ARCH_GRANITE_HYBRID:
  6410. case LLM_ARCH_CHAMELEON:
  6411. case LLM_ARCH_BAILINGMOE:
  6412. case LLM_ARCH_NEO_BERT:
  6413. case LLM_ARCH_SMOLLM3:
  6414. case LLM_ARCH_ARCEE:
  6415. case LLM_ARCH_ERNIE4_5:
  6416. case LLM_ARCH_ERNIE4_5_MOE:
  6417. return LLAMA_ROPE_TYPE_NORM;
  6418. // the pairs of head values are offset by n_rot/2
  6419. case LLM_ARCH_FALCON:
  6420. case LLM_ARCH_FALCON_H1:
  6421. case LLM_ARCH_GROK:
  6422. case LLM_ARCH_DBRX:
  6423. case LLM_ARCH_BERT:
  6424. case LLM_ARCH_JINA_BERT_V3:
  6425. case LLM_ARCH_NOMIC_BERT:
  6426. case LLM_ARCH_NOMIC_BERT_MOE:
  6427. case LLM_ARCH_STABLELM:
  6428. case LLM_ARCH_BITNET:
  6429. case LLM_ARCH_QWEN:
  6430. case LLM_ARCH_QWEN2:
  6431. case LLM_ARCH_DREAM:
  6432. case LLM_ARCH_QWEN2MOE:
  6433. case LLM_ARCH_QWEN3:
  6434. case LLM_ARCH_QWEN3MOE:
  6435. case LLM_ARCH_LLADA_MOE:
  6436. case LLM_ARCH_RND1:
  6437. case LLM_ARCH_OLMO2:
  6438. case LLM_ARCH_OLMOE:
  6439. case LLM_ARCH_PHI2:
  6440. case LLM_ARCH_PHI3:
  6441. case LLM_ARCH_PHIMOE:
  6442. case LLM_ARCH_PLAMO:
  6443. case LLM_ARCH_PLAMO2:
  6444. case LLM_ARCH_GEMMA:
  6445. case LLM_ARCH_GEMMA2:
  6446. case LLM_ARCH_GEMMA3:
  6447. case LLM_ARCH_GEMMA3N:
  6448. case LLM_ARCH_GEMMA_EMBEDDING:
  6449. case LLM_ARCH_STARCODER2:
  6450. case LLM_ARCH_OPENELM:
  6451. case LLM_ARCH_GPTNEOX:
  6452. case LLM_ARCH_CODESHELL:
  6453. case LLM_ARCH_ORION:
  6454. case LLM_ARCH_NEMOTRON:
  6455. case LLM_ARCH_EXAONE:
  6456. case LLM_ARCH_EXAONE4:
  6457. case LLM_ARCH_MINICPM3:
  6458. case LLM_ARCH_BAILINGMOE2:
  6459. case LLM_ARCH_DOTS1:
  6460. case LLM_ARCH_HUNYUAN_MOE:
  6461. case LLM_ARCH_OPENAI_MOE:
  6462. case LLM_ARCH_HUNYUAN_DENSE:
  6463. case LLM_ARCH_LFM2:
  6464. case LLM_ARCH_LFM2MOE:
  6465. case LLM_ARCH_SMALLTHINKER:
  6466. case LLM_ARCH_GLM4_MOE:
  6467. case LLM_ARCH_SEED_OSS:
  6468. case LLM_ARCH_GROVEMOE:
  6469. case LLM_ARCH_APERTUS:
  6470. case LLM_ARCH_MINIMAX_M2:
  6471. case LLM_ARCH_COGVLM:
  6472. case LLM_ARCH_PANGU_EMBED:
  6473. case LLM_ARCH_AFMOE:
  6474. return LLAMA_ROPE_TYPE_NEOX;
  6475. case LLM_ARCH_QWEN2VL:
  6476. return LLAMA_ROPE_TYPE_MROPE;
  6477. case LLM_ARCH_QWEN3VL:
  6478. case LLM_ARCH_QWEN3VLMOE:
  6479. return LLAMA_ROPE_TYPE_IMROPE;
  6480. // all model arches should be listed explicitly here
  6481. case LLM_ARCH_UNKNOWN:
  6482. GGML_ABORT("unknown architecture");
  6483. }
  6484. return LLAMA_ROPE_TYPE_NONE;
  6485. }
  6486. float llama_model_rope_freq_scale_train(const llama_model * model) {
  6487. return model->hparams.rope_freq_scale_train;
  6488. }
  6489. int32_t llama_model_meta_val_str(const llama_model * model, const char * key, char * buf, size_t buf_size) {
  6490. const auto & it = model->gguf_kv.find(key);
  6491. if (it == model->gguf_kv.end()) {
  6492. if (buf_size > 0) {
  6493. buf[0] = '\0';
  6494. }
  6495. return -1;
  6496. }
  6497. return snprintf(buf, buf_size, "%s", it->second.c_str());
  6498. }
  6499. int32_t llama_model_meta_count(const llama_model * model) {
  6500. return (int)model->gguf_kv.size();
  6501. }
  6502. const char * llama_model_meta_key_str(llama_model_meta_key key) {
  6503. switch (key) {
  6504. case LLAMA_MODEL_META_KEY_SAMPLING_SEQUENCE: return "general.sampling.sequence";
  6505. case LLAMA_MODEL_META_KEY_SAMPLING_TOP_K: return "general.sampling.top_k";
  6506. case LLAMA_MODEL_META_KEY_SAMPLING_TOP_P: return "general.sampling.top_p";
  6507. case LLAMA_MODEL_META_KEY_SAMPLING_MIN_P: return "general.sampling.min_p";
  6508. case LLAMA_MODEL_META_KEY_SAMPLING_XTC_PROBABILITY: return "general.sampling.xtc_probability";
  6509. case LLAMA_MODEL_META_KEY_SAMPLING_XTC_THRESHOLD: return "general.sampling.xtc_threshold";
  6510. case LLAMA_MODEL_META_KEY_SAMPLING_TEMP: return "general.sampling.temp";
  6511. case LLAMA_MODEL_META_KEY_SAMPLING_PENALTY_LAST_N: return "general.sampling.penalty_last_n";
  6512. case LLAMA_MODEL_META_KEY_SAMPLING_PENALTY_REPEAT: return "general.sampling.penalty_repeat";
  6513. case LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT: return "general.sampling.mirostat";
  6514. case LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT_TAU: return "general.sampling.mirostat_tau";
  6515. case LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT_ETA: return "general.sampling.mirostat_eta";
  6516. default: return nullptr;
  6517. }
  6518. }
  6519. int32_t llama_model_meta_key_by_index(const llama_model * model, int i, char * buf, size_t buf_size) {
  6520. if (i < 0 || i >= (int)model->gguf_kv.size()) {
  6521. if (buf_size > 0) {
  6522. buf[0] = '\0';
  6523. }
  6524. return -1;
  6525. }
  6526. auto it = model->gguf_kv.begin();
  6527. std::advance(it, i);
  6528. return snprintf(buf, buf_size, "%s", it->first.c_str());
  6529. }
  6530. int32_t llama_model_meta_val_str_by_index(const llama_model * model, int32_t i, char * buf, size_t buf_size) {
  6531. if (i < 0 || i >= (int)model->gguf_kv.size()) {
  6532. if (buf_size > 0) {
  6533. buf[0] = '\0';
  6534. }
  6535. return -1;
  6536. }
  6537. auto it = model->gguf_kv.begin();
  6538. std::advance(it, i);
  6539. return snprintf(buf, buf_size, "%s", it->second.c_str());
  6540. }
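// Usage sketch (illustrative only): enumerate all GGUF metadata key/value pairs of a loaded model:
//
//   char key[256], val[256];
//   const int32_t n_meta = llama_model_meta_count(model);
//   for (int32_t i = 0; i < n_meta; i++) {
//       llama_model_meta_key_by_index    (model, i, key, sizeof(key));
//       llama_model_meta_val_str_by_index(model, i, val, sizeof(val));
//       // key/val now hold the i-th pair; values longer than the buffer are truncated
//   }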
  6541. int32_t llama_model_desc(const llama_model * model, char * buf, size_t buf_size) {
  6542. return snprintf(buf, buf_size, "%s", model->desc().c_str());
  6543. }
  6544. uint64_t llama_model_size(const llama_model * model) {
  6545. return model->size();
  6546. }
  6547. const char * llama_model_chat_template(const llama_model * model, const char * name) {
  6548. const auto key = name ? LLM_KV(model->arch, name)(LLM_KV_TOKENIZER_CHAT_TEMPLATE)
  6549. : LLM_KV(model->arch)(LLM_KV_TOKENIZER_CHAT_TEMPLATE);
  6550. const auto & it = model->gguf_kv.find(key);
  6551. if (it == model->gguf_kv.end()) {
  6552. // one-off fix for very popular models (so we are not flooded with issues)
  6553. // do not extend this list unless absolutely necessary
  6554. // Mistral-Small-2503 does not have built-in chat template
  6555. llama_vocab_pre_type pre_type = model->vocab.get_pre_type();
  6556. if (!name && pre_type == LLAMA_VOCAB_PRE_TYPE_TEKKEN && model->layers.size() == 40) {
  6557. return "mistral-v7-tekken";
  6558. }
  6559. return nullptr;
  6560. }
  6561. return it->second.c_str();
  6562. }
  6563. uint64_t llama_model_n_params(const llama_model * model) {
  6564. return model->n_elements();
  6565. }
  6566. bool llama_model_has_encoder(const llama_model * model) {
  6567. switch (model->arch) {
  6568. case LLM_ARCH_T5: return true;
  6569. case LLM_ARCH_T5ENCODER: return true;
  6570. default: return false;
  6571. }
  6572. }
  6573. bool llama_model_has_decoder(const llama_model * model) {
  6574. switch (model->arch) {
  6575. case LLM_ARCH_T5ENCODER: return false;
  6576. default: return true;
  6577. }
  6578. }
  6579. llama_token llama_model_decoder_start_token(const llama_model * model) {
  6580. return model->hparams.dec_start_token_id;
  6581. }
  6582. bool llama_model_is_recurrent(const llama_model * model) {
  6583. return llm_arch_is_recurrent(model->arch);
  6584. }
  6585. bool llama_model_is_hybrid(const llama_model * model) {
  6586. return llm_arch_is_hybrid(model->arch);
  6587. }
  6588. bool llama_model_is_diffusion(const llama_model * model) {
  6589. return llm_arch_is_diffusion(model->arch);
  6590. }
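// internal helper: expose the raw (name, tensor) pairs collected while loading the model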
  6591. const std::vector<std::pair<std::string, ggml_tensor *>> & llama_internal_get_tensor_map(const llama_model * model) {
  6592. return model->tensors_by_name;
  6593. }