llama-model.cpp 399 KB

#include "llama-model.h"

#include "llama-impl.h"
#include "llama-mmap.h"
#include "llama-batch.h"
#include "llama-cparams.h"
#include "llama-model-loader.h"

#include "llama-kv-cache.h"
#include "llama-kv-cache-iswa.h"
#include "llama-memory-hybrid.h"
#include "llama-memory-recurrent.h"

#include "ggml-cpp.h"

#include "models/llm_graph_context_mamba.h"
#include "models/llm_build_arcee.h"
#include "models/llm_build_arctic.h"
#include "models/llm_build_baichuan.h"
#include "models/llm_build_bailingmoe.h"
#include "models/llm_build_bert.h"
#include "models/llm_build_bitnet.h"
#include "models/llm_build_bloom.h"
#include "models/llm_build_chameleon.h"
#include "models/llm_build_chatglm.h"
#include "models/llm_build_codeshell.h"
#include "models/llm_build_cohere2_iswa.h"
#include "models/llm_build_command_r.h"
#include "models/llm_build_dbrx.h"
#include "models/llm_build_deci.h"
#include "models/llm_build_deepseek.h"
#include "models/llm_build_deepseek2.h"
#include "models/llm_build_dots1.h"
#include "models/llm_build_dream.h"
#include "models/llm_build_ernie4_5.h"
#include "models/llm_build_ernie4_5_moe.h"
#include "models/llm_build_exaone.h"
#include "models/llm_build_falcon.h"
#include "models/llm_build_falcon_h1.h"
#include "models/llm_build_gemma.h"
#include "models/llm_build_gemma2_iswa.h"
#include "models/llm_build_gemma3_iswa.h"
#include "models/llm_build_gemma3n_iswa.h"
#include "models/llm_build_gemma_embedding_iswa.h"
#include "models/llm_build_glm4.h"
#include "models/llm_build_glm4_moe.h"
#include "models/llm_build_gpt2.h"
#include "models/llm_build_gptneox.h"
#include "models/llm_build_granite.h"
#include "models/llm_build_granite_hybrid.h"
#include "models/llm_build_grok.h"
#include "models/llm_build_hunyuan_dense.h"
#include "models/llm_build_hunyuan_moe.h"
#include "models/llm_build_internlm2.h"
#include "models/llm_build_jais.h"
#include "models/llm_build_jamba.h"
#include "models/llm_build_lfm2.h"
#include "models/llm_build_llada.h"
#include "models/llm_build_llada_moe.h"
#include "models/llm_build_llama.h"
#include "models/llm_build_llama_iswa.h"
#include "models/llm_build_mamba.h"
#include "models/llm_build_minicpm3.h"
#include "models/llm_build_mpt.h"
#include "models/llm_build_nemotron.h"
#include "models/llm_build_nemotron_h.h"
#include "models/llm_build_neo_bert.h"
#include "models/llm_build_olmo.h"
#include "models/llm_build_olmoe.h"
#include "models/llm_build_openai_moe_iswa.h"
#include "models/llm_build_openelm.h"
#include "models/llm_build_orion.h"
#include "models/llm_build_phi2.h"
#include "models/llm_build_plamo.h"
#include "models/llm_build_plamo2.h"
#include "models/llm_build_plm.h"
#include "models/llm_build_qwen.h"
#include "models/llm_build_qwen2.h"
#include "models/llm_build_qwen2moe.h"
#include "models/llm_build_qwen2vl.h"
#include "models/llm_build_qwen3.h"
#include "models/llm_build_qwen3moe.h"
#include "models/llm_build_qwen3next.h"
#include "models/llm_build_refact.h"
#include "models/llm_build_rwkv_base.h"
#include "models/llm_build_rwkv6.h"
#include "models/llm_build_rwkv6qwen2.h"
#include "models/llm_build_rwkv7.h"
#include "models/llm_build_arwkv7.h"
#include "models/llm_build_seed_oss.h"
#include "models/llm_build_smollm3.h"
#include "models/llm_build_stablelm.h"
#include "models/llm_build_starcoder.h"
#include "models/llm_build_starcoder2.h"
#include "models/llm_build_t5_dec.h"
#include "models/llm_build_t5_enc.h"
#include "models/llm_build_wavtokenizer_dec.h"
#include "models/llm_build_xverse.h"
#include "models/llm_build_exaone4.h"
#include "models/llm_build_olmo2.h"
#include "models/llm_build_smallthinker.h"
#include "models/llm_build_phi3.h"

#include <algorithm>
#include <cassert>
#include <cmath>
#include <cfloat>
#include <cstring>
#include <functional>
#include <map>
#include <regex>
#include <sstream>
#include <stdexcept>

const char * llm_type_name(llm_type type) {
    switch (type) {
        case LLM_TYPE_14M: return "14M";
        case LLM_TYPE_17M: return "17M";
        case LLM_TYPE_22M: return "22M";
        case LLM_TYPE_33M: return "33M";
        case LLM_TYPE_60M: return "60M";
        case LLM_TYPE_70M: return "70M";
        case LLM_TYPE_80M: return "80M";
        case LLM_TYPE_109M: return "109M";
        case LLM_TYPE_137M: return "137M";
        case LLM_TYPE_140M: return "140M";
        case LLM_TYPE_160M: return "160M";
        case LLM_TYPE_190M: return "190M";
        case LLM_TYPE_220M: return "220M";
        case LLM_TYPE_250M: return "250M";
        case LLM_TYPE_256M: return "256M";
        case LLM_TYPE_270M: return "270M";
        case LLM_TYPE_335M: return "335M";
        case LLM_TYPE_350M: return "350M";
        case LLM_TYPE_360M: return "360M";
        case LLM_TYPE_410M: return "410M";
        case LLM_TYPE_450M: return "450M";
        case LLM_TYPE_475M: return "475M";
        case LLM_TYPE_558M: return "558M";
        case LLM_TYPE_700M: return "700M";
        case LLM_TYPE_770M: return "770M";
        case LLM_TYPE_780M: return "780M";
        case LLM_TYPE_950M: return "950M";
        case LLM_TYPE_0_3B: return "0.3B";
        case LLM_TYPE_0_5B: return "0.5B";
        case LLM_TYPE_0_6B: return "0.6B";
        case LLM_TYPE_1B: return "1B";
        case LLM_TYPE_1_2B: return "1.2B";
        case LLM_TYPE_1_3B: return "1.3B";
        case LLM_TYPE_1_4B: return "1.4B";
        case LLM_TYPE_1_5B: return "1.5B";
        case LLM_TYPE_1_6B: return "1.6B";
        case LLM_TYPE_1_7B: return "1.7B";
        case LLM_TYPE_1_8B: return "1.8B";
        case LLM_TYPE_2B: return "2B";
        case LLM_TYPE_2_8B: return "2.8B";
        case LLM_TYPE_2_9B: return "2.9B";
        case LLM_TYPE_3B: return "3B";
        case LLM_TYPE_4B: return "4B";
        case LLM_TYPE_6B: return "6B";
        case LLM_TYPE_6_9B: return "6.9B";
        case LLM_TYPE_7B: return "7B";
        case LLM_TYPE_8B: return "8B";
        case LLM_TYPE_9B: return "9B";
        case LLM_TYPE_11B: return "11B";
        case LLM_TYPE_12B: return "12B";
        case LLM_TYPE_13B: return "13B";
        case LLM_TYPE_14B: return "14B";
        case LLM_TYPE_15B: return "15B";
        case LLM_TYPE_16B: return "16B";
        case LLM_TYPE_20B: return "20B";
        case LLM_TYPE_27B: return "27B";
        case LLM_TYPE_30B: return "30B";
        case LLM_TYPE_32B: return "32B";
        case LLM_TYPE_34B: return "34B";
        case LLM_TYPE_35B: return "35B";
        case LLM_TYPE_36B: return "36B";
        case LLM_TYPE_40B: return "40B";
        case LLM_TYPE_65B: return "65B";
        case LLM_TYPE_70B: return "70B";
        case LLM_TYPE_120B: return "120B";
        case LLM_TYPE_142B: return "142B";
        case LLM_TYPE_236B: return "236B";
        case LLM_TYPE_290B: return "290B";
        case LLM_TYPE_314B: return "314B";
        case LLM_TYPE_405B: return "405B";
        case LLM_TYPE_671B: return "671B";
        case LLM_TYPE_SMALL: return "0.1B";
        case LLM_TYPE_MEDIUM: return "0.4B";
        case LLM_TYPE_LARGE: return "0.8B";
        case LLM_TYPE_XL: return "1.5B";
        case LLM_TYPE_A1_7B: return "A1.7B";
        case LLM_TYPE_A2_7B: return "A2.7B";
        case LLM_TYPE_8x7B: return "8x7B";
        case LLM_TYPE_8x22B: return "8x22B";
        case LLM_TYPE_16x12B: return "16x12B";
        case LLM_TYPE_16x3_8B: return "16x3.8B";
        case LLM_TYPE_10B_128x3_66B: return "10B+128x3.66B";
        case LLM_TYPE_57B_A14B: return "57B.A14B";
        case LLM_TYPE_17B_16E: return "17Bx16E (Scout)";
        case LLM_TYPE_17B_128E: return "17Bx128E (Maverick)";
        case LLM_TYPE_A13B: return "A13B";
        case LLM_TYPE_21B_A3B: return "21B.A3B";
        case LLM_TYPE_30B_A3B: return "30B.A3B";
        case LLM_TYPE_80B_A3B: return "80B.A3B";
        case LLM_TYPE_106B_A12B: return "106B.A12B";
        case LLM_TYPE_235B_A22B: return "235B.A22B";
        case LLM_TYPE_300B_A47B: return "300B.A47B";
        case LLM_TYPE_355B_A32B: return "355B.A32B";
        case LLM_TYPE_E2B: return "E2B";
        case LLM_TYPE_E4B: return "E4B";
        default: return "?B";
    }
}

static const char * llama_expert_gating_func_name(llama_expert_gating_func_type type) {
    switch (type) {
        case LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX: return "softmax";
        case LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID: return "sigmoid";
        default: return "unknown";
    }
}

static const std::map<llama_rope_scaling_type, const char *> LLAMA_ROPE_SCALING_TYPES = {
    { LLAMA_ROPE_SCALING_TYPE_NONE, "none" },
    { LLAMA_ROPE_SCALING_TYPE_LINEAR, "linear" },
    { LLAMA_ROPE_SCALING_TYPE_YARN, "yarn" },
    { LLAMA_ROPE_SCALING_TYPE_LONGROPE, "longrope" },
};

std::string llama_rope_scaling_type_name(llama_rope_scaling_type rope_scaling_type) {
    return LLAMA_ROPE_SCALING_TYPES.at(rope_scaling_type);
}

static llama_rope_scaling_type llama_rope_scaling_type_from_string(const std::string & name) {
    for (const auto & kv : LLAMA_ROPE_SCALING_TYPES) {
        if (kv.second == name) {
            return (llama_rope_scaling_type) kv.first;
        }
    }

    return LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
}
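
// usage sketch (illustrative, not part of the original file): round-tripping a scaling
// type through its string name, as happens when the rope scaling type is read from GGUF
// metadata further below:
//
//   llama_rope_scaling_type t = llama_rope_scaling_type_from_string("yarn");
//   // t == LLAMA_ROPE_SCALING_TYPE_YARN
//   std::string s = llama_rope_scaling_type_name(t); // "yarn"
//
// unknown names map to LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED instead of throwing.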

// checks if the weight tensor can be used with the specified buffer type and device
static bool weight_buft_supported(const llama_hparams & hparams, ggml_tensor * w, ggml_op op, ggml_backend_buffer_type_t buft, ggml_backend_dev_t dev) {
    GGML_ASSERT(w != nullptr);

    if (op == GGML_OP_NONE) {
        return true;
    }

    ggml_init_params params = {
        /*.mem_size   =*/ ggml_tensor_overhead()*8,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ true,
    };

    ggml_context_ptr ctx_ptr { ggml_init(params) };
    if (!ctx_ptr) {
        throw std::runtime_error(format("failed to create ggml context"));
    }
    ggml_context * ctx = ctx_ptr.get();

    ggml_tensor * op_tensor = nullptr;

    switch (op) {
        case GGML_OP_GET_ROWS:
            {
                ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 512);
                op_tensor = ggml_get_rows(ctx, w, b);
            } break;
        case GGML_OP_MUL_MAT:
            {
                ggml_tensor * b = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], 512, w->ne[2], w->ne[3]);
                op_tensor = ggml_mul_mat(ctx, w, b);
            } break;
        case GGML_OP_MUL_MAT_ID:
            {
                int n_expert_used = hparams.n_expert_used;
                ggml_tensor * b = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, w->ne[0], n_expert_used, 512);
                ggml_tensor * ids = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, n_expert_used, 512);
                op_tensor = ggml_mul_mat_id(ctx, w, b, ids);
            } break;
        case GGML_OP_ADD:
            {
                ggml_tensor * a = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], w->ne[1], w->ne[2], w->ne[3]);
                op_tensor = ggml_add(ctx, a, w);
            } break;
        case GGML_OP_ADD_ID:
            {
                int n_expert_used = hparams.n_expert_used;
                ggml_tensor * a = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, w->ne[0], n_expert_used, 512);
                ggml_tensor * c = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, n_expert_used, 512);
                op_tensor = ggml_add_id(ctx, a, w, c);
            } break;
        case GGML_OP_MUL:
            {
                ggml_tensor * a = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], w->ne[1], w->ne[2], w->ne[3]);
                op_tensor = ggml_mul(ctx, a, w);
            } break;
        case GGML_OP_DIV:
            {
                ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, w->ne[0]);
                op_tensor = ggml_div(ctx, a, w);
            } break;
        case GGML_OP_ROPE:
            {
                int n_embd_head = hparams.n_embd_head_v;
                int n_head = hparams.n_head();
                ggml_tensor * a = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_embd_head, n_head, 512);
                ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 512);
                op_tensor = ggml_rope_ext(
                    ctx, a, b, w,
                    0, 0, 0, 0, 0,
                    0, 0, 0, 0
                );
            } break;
        case GGML_OP_SSM_CONV:
            {
                const int64_t n_seq_tokens = 512;
                const int64_t n_seqs = 3;
                ggml_tensor * conv_x = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, w->ne[0] - 1 + n_seq_tokens, w->ne[1], n_seqs);
                op_tensor = ggml_ssm_conv(ctx, conv_x, w);
            } break;
        case GGML_OP_SSM_SCAN:
            {
                // w is ssm_a, which is used to distinguish Mamba-1 and Mamba-2
                const int64_t d_state = w->ne[0] == 1 ? hparams.ssm_d_state : w->ne[0];
                const int64_t n_head = w->ne[1];
                const int64_t head_dim = hparams.ssm_d_inner / n_head;
                const int64_t n_group = hparams.ssm_n_group ? hparams.ssm_n_group : 1;
                const int64_t n_seq_tokens = 512;
                const int64_t n_seqs = 3;
                ggml_tensor * s = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, d_state, head_dim, n_head, n_seqs);
                ggml_tensor * x = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, head_dim, n_head, n_seq_tokens, n_seqs);
                ggml_tensor * dt = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_head, n_seq_tokens, n_seqs);
                ggml_tensor * B = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, d_state, n_group, n_seq_tokens, n_seqs);
                ggml_tensor * C = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, d_state, n_group, n_seq_tokens, n_seqs);
                ggml_tensor * ids = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_seqs);
                op_tensor = ggml_ssm_scan(ctx, s, x, dt, w, B, C, ids);
            } break;
        case GGML_OP_RWKV_WKV6:
            {
                // FIXME
                const int64_t S = 123;
                const int64_t H = 123;
                const int64_t n_tokens = 123;
                const int64_t n_seqs = 123;
                ggml_tensor * k = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens);
                ggml_tensor * v = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens);
                ggml_tensor * r = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens);
                ggml_tensor * tf = w;
                ggml_tensor * td = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens);
                ggml_tensor * state = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, S, n_seqs, S, H);
                op_tensor = ggml_rwkv_wkv6(ctx, k, v, r, tf, td, state);
            } break;
        case GGML_OP_IM2COL:
            {
                const int n_embd = hparams.n_embd;
                ggml_tensor * b = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, n_embd, w->ne[1], 1, 1);
                op_tensor = ggml_im2col(ctx, w, b, 1, 0, 0, 0, 1, 0, false, GGML_TYPE_F16);
            } break;
        case GGML_OP_SCALE:
            {
                op_tensor = ggml_scale(ctx, w, 1.0f);
            } break;
        default:
            GGML_ABORT("%s: missing test for op %s for tensor %s", __func__, ggml_op_name(op), w->name);
    }

    // create a temporary dummy buffer for the weight so that supports_op can check the buffer type
    GGML_ASSERT(w->buffer == nullptr);
    w->buffer = ggml_backend_buft_alloc_buffer(buft, 0);
    bool op_supported = ggml_backend_dev_supports_op(dev, op_tensor);
    ggml_backend_buffer_free(w->buffer);
    w->buffer = nullptr;

    return op_supported;
}

// lists of buffer types used for each layer
using buft_list_t = std::vector<std::pair<ggml_backend_dev_t, ggml_backend_buffer_type_t>>;

// find the first buffer type in the list that can use the tensor
static ggml_backend_buffer_type_t select_weight_buft(const llama_hparams & hparams, ggml_tensor * tensor, ggml_op op, const buft_list_t & buft_list) {
    GGML_ASSERT(!buft_list.empty());
    for (const auto & cur : buft_list) {
        ggml_backend_dev_t cur_dev = cur.first;
        ggml_backend_buffer_type_t cur_buft = cur.second;
        if (weight_buft_supported(hparams, tensor, op, cur_buft, cur_dev)) {
            return cur_buft;
        }
    }

    return nullptr;
}
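
// usage sketch (illustrative, not upstream code): the tensor-creation code picks a buffer
// type for each weight by probing the op the weight participates in, roughly like:
//
//   ggml_backend_buffer_type_t buft = select_weight_buft(hparams, w, GGML_OP_MUL_MAT, buft_list);
//   if (buft == nullptr) {
//       // no device in the list can run GGML_OP_MUL_MAT with this weight on its buffer types
//   }
//
// weight_buft_supported() attaches a zero-sized dummy buffer to the weight before calling
// ggml_backend_dev_supports_op(), so the probe never allocates real tensor data.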

// CPU: ACCEL -> GPU host -> CPU extra -> CPU
static buft_list_t make_cpu_buft_list(const std::vector<ggml_backend_dev_t> & devices, bool use_extra_bufts) {
    buft_list_t buft_list;

    // add ACCEL buffer types
    for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
        ggml_backend_dev_t dev = ggml_backend_dev_get(i);
        if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_ACCEL) {
            auto * buft = ggml_backend_dev_buffer_type(dev);
            // skip
            if (buft != ggml_backend_cpu_buffer_type()) {
                buft_list.emplace_back(dev, buft);
            }
        }
    }

    // add a host buffer type
    // storing the tensors in a host buffer is useful when the processing of large batches
    // is offloaded to a GPU device, since it reduces the time spent on data transfers
    // generally, this will be done using the first device in the list
    // a better approach would be to handle this on a weight-by-weight basis using the offload_op
    // function of the device to determine if it would benefit from being stored in a host buffer
    for (auto * dev : devices) {
        ggml_backend_buffer_type_t buft = ggml_backend_dev_host_buffer_type(dev);
        if (buft) {
            buft_list.emplace_back(dev, buft);
            break;
        }
    }

    // add extra buffer types
    if (use_extra_bufts) {
        auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
        if (cpu_dev == nullptr) {
            throw std::runtime_error(format("%s: no CPU backend found", __func__));
        }

        auto * cpu_reg = ggml_backend_dev_backend_reg(cpu_dev);
        auto ggml_backend_dev_get_extra_bufts_fn = (ggml_backend_dev_get_extra_bufts_t)
            ggml_backend_reg_get_proc_address(cpu_reg, "ggml_backend_dev_get_extra_bufts");
        if (ggml_backend_dev_get_extra_bufts_fn) {
            ggml_backend_buffer_type_t * extra_bufts = ggml_backend_dev_get_extra_bufts_fn(cpu_dev);
            while (extra_bufts && *extra_bufts) {
                buft_list.emplace_back(cpu_dev, *extra_bufts);
                ++extra_bufts;
            }
        }
    }

    // add the CPU buffer type
    for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
        ggml_backend_dev_t dev = ggml_backend_dev_get(i);
        if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_CPU) {
            buft_list.emplace_back(dev, ggml_backend_dev_buffer_type(dev));
        }
    }

    return buft_list;
}
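
// for example (hypothetical single-GPU machine), the resulting CPU-side priority list
// could look like:
//
//   [ (accel_dev, accel_buft),     // non-CPU accelerator buffer, if such a device exists
//     (gpu_dev,   gpu_host_buft),  // pinned host memory of the first GPU in `devices`
//     (cpu_dev,   extra_buft),     // e.g. a repacked CPU buffer type, if enabled
//     (cpu_dev,   cpu_buft) ]      // plain CPU memory as the final fallback
//
// select_weight_buft() walks this list in order and stops at the first supported entry.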

// GPU: split if LLAMA_SPLIT_MODE_ROW -> GPU
static buft_list_t make_gpu_buft_list(ggml_backend_dev_t dev, llama_split_mode split_mode, const float * tensor_split) {
    buft_list_t buft_list;

    // add the device split buffer type if requested and available
    if (split_mode == LLAMA_SPLIT_MODE_ROW) {
        ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev);
        auto ggml_backend_split_buffer_type_fn = (ggml_backend_split_buffer_type_t)
            ggml_backend_reg_get_proc_address(reg, "ggml_backend_split_buffer_type");
        if (ggml_backend_split_buffer_type_fn) {
            size_t dev_index = [&]() {
                auto * reg = ggml_backend_dev_backend_reg(dev);
                for (size_t i = 0; i < ggml_backend_reg_dev_count(reg); ++i) {
                    if (ggml_backend_reg_dev_get(reg, i) == dev) {
                        return i;
                    }
                }
                throw std::runtime_error(format("device %s not found in its backend reg", ggml_backend_dev_name(dev)));
            }();
            auto * buft = ggml_backend_split_buffer_type_fn(dev_index, tensor_split);
            if (buft != nullptr) {
                buft_list.emplace_back(dev, buft);
            }
        }
    }

    // add the device default buffer type
    buft_list.emplace_back(dev, ggml_backend_dev_buffer_type(dev));

    return buft_list;
}

struct llama_model::impl {
    impl() {}
    ~impl() {}

    uint64_t n_elements = 0;

    size_t n_bytes = 0;

    std::string desc_str;

    // model memory mapped files
    llama_mmaps mappings;

    // objects representing data potentially being locked in memory
    llama_mlocks mlock_bufs;
    llama_mlocks mlock_mmaps;

    // contexts where the model tensors metadata is stored
    std::vector<ggml_context_ptr> ctxs;

    // the model memory buffers for the tensor data
    std::vector<ggml_backend_buffer_ptr> bufs;

    buft_list_t cpu_buft_list;
    std::map<ggml_backend_dev_t, buft_list_t> gpu_buft_list;

    struct layer_dev {
        ggml_backend_dev_t dev;
        buft_list_t * buft_list;
    };

    layer_dev dev_input = {};
    layer_dev dev_output = {};
    std::vector<layer_dev> dev_layer;

    bool has_tensor_overrides;
};

llama_model::llama_model(const llama_model_params & params) : params(params), pimpl(std::make_unique<impl>()) {
    pimpl->has_tensor_overrides = params.tensor_buft_overrides && params.tensor_buft_overrides[0].pattern;
}
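
// note: tensor_buft_overrides is terminated by an entry with a null pattern; a hypothetical
// override that pins MoE expert weights to CPU memory could be set up like:
//
//   llama_model_tensor_buft_override overrides[] = {
//       { "ffn_.*_exps", ggml_backend_cpu_buffer_type() }, // regex over tensor names (assumed)
//       { nullptr, nullptr },
//   };
//   params.tensor_buft_overrides = overrides;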

llama_model::~llama_model() {}

void llama_model::load_stats(llama_model_loader & ml) {
    pimpl->n_elements = ml.n_elements;
    pimpl->n_bytes = ml.n_bytes;
}

void llama_model::load_arch(llama_model_loader & ml) {
    arch = ml.get_arch();
    if (arch == LLM_ARCH_UNKNOWN) {
        throw std::runtime_error("unknown model architecture: '" + ml.get_arch_name() + "'");
    }
}

void llama_model::load_hparams(llama_model_loader & ml) {
    const gguf_context * ctx = ml.meta.get();

    // get metadata as string
    for (int i = 0; i < gguf_get_n_kv(ctx); i++) {
        gguf_type type = gguf_get_kv_type(ctx, i);
        if (type == GGUF_TYPE_ARRAY) {
            continue;
        }
        const char * name = gguf_get_key(ctx, i);
        const std::string value = gguf_kv_to_str(ctx, i);
        gguf_kv.emplace(name, value);
    }

    // get general kv
    ml.get_key(LLM_KV_GENERAL_NAME, name, false);

    // everything past this point is not vocab-related
    if (hparams.vocab_only) {
        return;
    }

    ml.get_key(LLM_KV_CONTEXT_LENGTH, hparams.n_ctx_train);
    ml.get_key(LLM_KV_EMBEDDING_LENGTH, hparams.n_embd);
    ml.get_key(LLM_KV_BLOCK_COUNT, hparams.n_layer);
    ml.get_key(LLM_KV_EXPERT_COUNT, hparams.n_expert, false);
    ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used, false);

    if (arch == LLM_ARCH_WAVTOKENIZER_DEC) {
        ml.get_key(LLM_KV_FEATURES_LENGTH, hparams.n_embd_features);

        ml.get_key(LLM_KV_POSNET_EMBEDDING_LENGTH, hparams.posnet.n_embd);
        ml.get_key(LLM_KV_POSNET_BLOCK_COUNT, hparams.posnet.n_layer);

        ml.get_key(LLM_KV_CONVNEXT_EMBEDDING_LENGTH, hparams.convnext.n_embd);
        ml.get_key(LLM_KV_CONVNEXT_BLOCK_COUNT, hparams.convnext.n_layer);
    }

    GGML_ASSERT(hparams.n_expert <= LLAMA_MAX_EXPERTS);
    GGML_ASSERT(hparams.n_expert_used <= hparams.n_expert);
    if (hparams.n_expert > 0) {
        GGML_ASSERT(hparams.n_expert_used > 0);
    } else {
        GGML_ASSERT(hparams.n_expert_used == 0);
    }

    std::fill(hparams.n_head_arr.begin(), hparams.n_head_arr.end(), 0);
    std::fill(hparams.n_head_kv_arr.begin(), hparams.n_head_kv_arr.end(), 0);
    std::fill(hparams.n_ff_arr.begin(), hparams.n_ff_arr.end(), 0);
    std::fill(
        hparams.recurrent_layer_arr.begin(),
        hparams.recurrent_layer_arr.end(),
        llm_arch_is_recurrent(ml.get_arch()));

    std::fill(hparams.rope_sections.begin(), hparams.rope_sections.end(), 0);
    std::fill(hparams.swa_layers.begin(), hparams.swa_layers.end(), 0);

    ml.get_key_or_arr(LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff_arr, hparams.n_layer, false);
    ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head_arr, hparams.n_layer, false);

    // n_head_kv is optional, default to n_head
    hparams.n_head_kv_arr = hparams.n_head_arr;

    ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT_KV, hparams.n_head_kv_arr, hparams.n_layer, false);

    bool rope_finetuned = false;
    ml.get_key(LLM_KV_ROPE_SCALING_FINETUNED, rope_finetuned, false);
    hparams.rope_finetuned = rope_finetuned;

    hparams.n_ctx_orig_yarn = hparams.n_ctx_train;
    ml.get_key(LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, hparams.n_ctx_orig_yarn, false);

    // rope_freq_base (optional)
    hparams.rope_freq_base_train = 10000.0f;
    ml.get_key(LLM_KV_ROPE_FREQ_BASE, hparams.rope_freq_base_train, false);

    std::string rope_scaling("linear");
    ml.get_key(LLM_KV_ROPE_SCALING_TYPE, rope_scaling, false);
    hparams.rope_scaling_type_train = llama_rope_scaling_type_from_string(rope_scaling);
    GGML_ASSERT(hparams.rope_scaling_type_train != LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED);

    // rope_freq_scale (inverse of the kv) is optional
    float ropescale = 0.0f;
    if (!ml.get_key(LLM_KV_ROPE_SCALING_FACTOR, ropescale, false)) {
        // try the old key name
        ml.get_key(LLM_KV_ROPE_SCALE_LINEAR, ropescale, false);
    }
    hparams.rope_freq_scale_train = ropescale == 0.0f ? 1.0f : 1.0f/ropescale;
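
    // worked example: a GGUF that stores a linear/YaRN scaling factor of 4.0 (context
    // stretched 4x) yields rope_freq_scale_train = 1.0f / 4.0f = 0.25f; a missing or
    // zero factor leaves the scale at 1.0f (no scaling).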

    // by default assume that the sliding-window layers use the same scaling type as the non-sliding-window layers
    hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
    hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;

    ml.get_key(LLM_KV_ROPE_SCALING_ATTN_FACTOR, hparams.rope_attn_factor, false);

    // non-transformer models do not have attention heads
    if (hparams.n_head() > 0) {
        // gpt-neox n_rot = rotary_pct * (n_embd / n_head)
        // gpt-j n_rot = rotary_dim
        hparams.n_embd_head_k = hparams.n_embd / hparams.n_head();
        ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH, hparams.n_embd_head_k, false);

        hparams.n_embd_head_v = hparams.n_embd / hparams.n_head();
        ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH, hparams.n_embd_head_v, false);

        // sanity check for n_rot (optional)
        hparams.n_rot = hparams.n_embd_head_k;
        ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot, false);

        if (arch == LLM_ARCH_LLAMA || arch == LLM_ARCH_DECI || arch == LLM_ARCH_FALCON) {
            if (hparams.n_rot != hparams.n_embd_head_k) {
                throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd_head_k));
            }
        }
    } else {
        hparams.n_rot = 0;
        hparams.n_embd_head_k = 0;
        hparams.n_embd_head_v = 0;
    }

    // for differentiating model types
    uint32_t n_vocab = 0;
    ml.get_key(LLM_KV_VOCAB_SIZE, n_vocab, false) || ml.get_arr_n(LLM_KV_TOKENIZER_LIST, n_vocab, false);

    // for classifier models
    ml.get_arr(LLM_KV_CLASSIFIER_OUTPUT_LABELS, classifier_labels, false);
    if (!classifier_labels.empty()) {
        hparams.n_cls_out = classifier_labels.size();
    }

    // arch-specific KVs
    switch (arch) {
        case LLM_ARCH_LLAMA:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);

                if (hparams.n_expert == 8) {
                    switch (hparams.n_layer) {
                        case 32: type = LLM_TYPE_8x7B; break;
                        case 56: type = LLM_TYPE_8x22B; break;
                        default: type = LLM_TYPE_UNKNOWN;
                    }
                } else {
                    switch (hparams.n_layer) {
                        case 16: type = LLM_TYPE_1B; break; // Llama 3.2 1B
                        case 22: type = LLM_TYPE_1B; break;
                        case 26: type = LLM_TYPE_3B; break;
                        case 28: type = LLM_TYPE_3B; break; // Llama 3.2 3B
                        case 30: type = LLM_TYPE_256M; break; // smoldocling 256M
                        // granite uses a vocab with len 49152
                        case 32: type = n_vocab == 49152 ? LLM_TYPE_3B : (n_vocab < 40000 ? LLM_TYPE_7B : LLM_TYPE_8B); break;
                        case 36: type = LLM_TYPE_8B; break; // granite
                        case 40: type = LLM_TYPE_13B; break;
                        case 48: type = LLM_TYPE_34B; break;
                        case 60: type = LLM_TYPE_30B; break;
                        case 80: type = hparams.n_head() == hparams.n_head_kv() ? LLM_TYPE_65B : LLM_TYPE_70B; break;
                        default: type = LLM_TYPE_UNKNOWN;
                    }
                }
            } break;
        case LLM_ARCH_LLAMA4:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
                ml.get_key(LLM_KV_INTERLEAVE_MOE_LAYER_STEP, hparams.n_moe_layer_step);

                const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
                if (found_swa && hparams.n_swa == 0) {
                    hparams.swa_type = LLAMA_SWA_TYPE_NONE;
                    hparams.n_no_rope_layer_step = hparams.n_layer; // always use rope
                } else {
                    hparams.swa_type = LLAMA_SWA_TYPE_CHUNKED;
                    hparams.n_swa = 8192;
                    hparams.set_swa_pattern(4); // pattern: 3 chunked - 1 full
                }

                switch (hparams.n_expert) {
                    case 0: {
                        // MobileLLM (no MoE)
                        switch (hparams.n_embd) {
                            case 2048: type = LLM_TYPE_140M; break;
                            case 4096: type = LLM_TYPE_360M; break;
                            case 6144: type = LLM_TYPE_950M; break;
                            default: type = LLM_TYPE_UNKNOWN;
                        }
                    } break;
                    case 16: type = LLM_TYPE_17B_16E; break;
                    case 128: type = LLM_TYPE_17B_128E; break;
                    default: type = LLM_TYPE_UNKNOWN;
                }

                hparams.use_kq_norm = type != LLM_TYPE_17B_128E;
            } break;
        case LLM_ARCH_ARCEE:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);

                // Arcee uses the same structure as Llama
                switch (hparams.n_layer) {
                    case 36: type = LLM_TYPE_4B; break;
                    default: type = LLM_TYPE_UNKNOWN;
                }
            } break;
        case LLM_ARCH_DECI:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                switch (hparams.n_layer) {
                    case 32: type = LLM_TYPE_7B; break;
                    case 80: type = LLM_TYPE_70B; break;
                    case 162: type = LLM_TYPE_405B; break;
                    default: type = LLM_TYPE_UNKNOWN;
                }
            } break;
        case LLM_ARCH_MINICPM:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                ml.get_key(LLM_KV_EMBEDDING_SCALE, hparams.f_embedding_scale);
                ml.get_key(LLM_KV_RESIDUAL_SCALE, hparams.f_residual_scale);
                ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);

                // MiniCPM uses rope by default, unlike Granite which uses it as a switch
                hparams.rope_finetuned = true;

                switch (hparams.n_layer) {
                    case 52: type = LLM_TYPE_1B; break;
                    case 40: type = LLM_TYPE_2B; break;
                    default: type = LLM_TYPE_UNKNOWN;
                }
            } break;
        case LLM_ARCH_MINICPM3:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                ml.get_key(LLM_KV_ATTENTION_Q_LORA_RANK, hparams.n_lora_q);
                ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK, hparams.n_lora_kv);

                switch (hparams.n_layer) {
                    case 62: type = LLM_TYPE_4B; break;
                    default: type = LLM_TYPE_UNKNOWN;
                }
            } break;
        case LLM_ARCH_GROK:
            {
                // defaults for old GGUFs
                hparams.yarn_beta_fast = 8.0f;
                hparams.f_logit_scale = 0.5773502691896257f;
                hparams.f_embedding_scale = 78.38367176906169f;
                hparams.f_attn_out_scale = 0.08838834764831845f;
                hparams.f_attn_logit_softcapping = 30.0f;
                hparams.f_router_logit_softcapping = 30.0f;
                // no final_logit_softcapping in grok-1
                hparams.f_final_logit_softcapping = 0.0f;

                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
                ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale, false);
                ml.get_key(LLM_KV_EMBEDDING_SCALE, hparams.f_embedding_scale, false);
                ml.get_key(LLM_KV_ATTENTION_OUTPUT_SCALE, hparams.f_attn_out_scale, false);
                ml.get_key(LLM_KV_ATTN_LOGIT_SOFTCAPPING, hparams.f_attn_logit_softcapping, false);
                ml.get_key(LLM_KV_ROUTER_LOGIT_SOFTCAPPING, hparams.f_router_logit_softcapping, false);
                ml.get_key(LLM_KV_FINAL_LOGIT_SOFTCAPPING, hparams.f_final_logit_softcapping, false);
  722. ml.get_key(LLM_KV_ATTENTION_TEMPERATURE_LENGTH, hparams.attn_temp_length, false);
  723. ml.get_key(LLM_KV_ROPE_SCALING_YARN_EXT_FACTOR, hparams.yarn_ext_factor, false);
  724. ml.get_key(LLM_KV_ROPE_SCALING_YARN_ATTN_FACTOR, hparams.yarn_attn_factor, false);
  725. ml.get_key(LLM_KV_ROPE_SCALING_YARN_BETA_FAST, hparams.yarn_beta_fast, false);
  726. ml.get_key(LLM_KV_ROPE_SCALING_YARN_BETA_SLOW, hparams.yarn_beta_slow, false);
  727. switch (hparams.n_layer) {
  728. case 64: type = LLM_TYPE_314B; break;
  729. default: type = LLM_TYPE_UNKNOWN;
  730. }
  731. } break;
  732. case LLM_ARCH_FALCON:
  733. {
  734. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  735. switch (hparams.n_layer) {
  736. case 32: type = LLM_TYPE_7B; break;
  737. case 60: type = LLM_TYPE_40B; break;
  738. default: type = LLM_TYPE_UNKNOWN;
  739. }
  740. } break;
  741. case LLM_ARCH_BAICHUAN:
  742. {
  743. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  744. switch (hparams.n_layer) {
  745. case 32: type = LLM_TYPE_7B; break;
  746. case 40: type = LLM_TYPE_13B; break;
  747. default: type = LLM_TYPE_UNKNOWN;
  748. }
  749. if (type == LLM_TYPE_13B) {
  750. // TODO: become GGUF KV parameter
  751. hparams.f_max_alibi_bias = 8.0f;
  752. }
  753. } break;
  754. case LLM_ARCH_STARCODER:
  755. {
  756. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  757. switch (hparams.n_layer) {
  758. case 24: type = LLM_TYPE_1B; break;
  759. case 36: type = LLM_TYPE_3B; break;
  760. case 42: type = LLM_TYPE_7B; break;
  761. case 40: type = LLM_TYPE_15B; break;
  762. default: type = LLM_TYPE_UNKNOWN;
  763. }
  764. } break;
  765. case LLM_ARCH_REFACT:
  766. {
  767. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  768. switch (hparams.n_layer) {
  769. case 32: type = LLM_TYPE_1B; break;
  770. default: type = LLM_TYPE_UNKNOWN;
  771. }
  772. // TODO: become GGUF KV parameter
  773. hparams.f_max_alibi_bias = 8.0f;
  774. } break;
  775. case LLM_ARCH_BERT:
  776. {
  777. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  778. ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
  779. ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
  780. switch (hparams.n_layer) {
  781. case 3:
  782. type = LLM_TYPE_17M; break; // bge-micro
  783. case 6:
  784. type = LLM_TYPE_22M; break; // MiniLM-L6
  785. case 12:
  786. switch (hparams.n_embd) {
  787. case 384: type = LLM_TYPE_33M; break; // MiniLM-L12, bge-small
  788. case 768: type = LLM_TYPE_109M; break; // bge-base
  789. default: type = LLM_TYPE_UNKNOWN;
  790. } break;
  791. case 24:
  792. type = LLM_TYPE_335M; break; // bge-large
  793. default: type = LLM_TYPE_UNKNOWN;
  794. }
  795. } break;
  796. case LLM_ARCH_JINA_BERT_V2:
  797. {
  798. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  799. ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
  800. ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
  801. hparams.f_max_alibi_bias = 8.0f;
  802. switch (hparams.n_layer) {
  803. case 4: type = LLM_TYPE_33M; break; // jina-embeddings-small
  804. case 12: type = LLM_TYPE_137M; break; // jina-embeddings-base
  805. default: type = LLM_TYPE_UNKNOWN;
  806. }
  807. } break;
  808. case LLM_ARCH_JINA_BERT_V3:
  809. {
  810. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  811. ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
  812. ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
  813. switch (hparams.n_layer) {
  814. case 24:
  815. type = LLM_TYPE_558M; break;
  816. default: type = LLM_TYPE_UNKNOWN;
  817. }
  818. } break;
  819. case LLM_ARCH_NOMIC_BERT:
  820. case LLM_ARCH_NOMIC_BERT_MOE:
  821. {
  822. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  823. ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
  824. ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
  825. ml.get_key(LLM_KV_MOE_EVERY_N_LAYERS, hparams.moe_every_n_layers, 0);
  826. if (hparams.n_layer == 12 && hparams.n_embd == 768) {
  827. if (arch == LLM_ARCH_NOMIC_BERT) {
  828. type = LLM_TYPE_137M;
  829. } else if (arch == LLM_ARCH_NOMIC_BERT_MOE && hparams.moe_every_n_layers == 2) {
  830. type = LLM_TYPE_475M;
  831. }
  832. }
  833. } break;
  834. case LLM_ARCH_NEO_BERT:
  835. {
  836. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  837. ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
  838. ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
  839. if (hparams.n_layer == 28) {
  840. type = LLM_TYPE_250M;
  841. }
  842. } break;
  843. case LLM_ARCH_BLOOM:
  844. {
  845. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  846. switch (hparams.n_layer) {
  847. case 24: type = LLM_TYPE_1B; break;
  848. case 30:
  849. switch (hparams.n_embd) {
  850. case 2560: type = LLM_TYPE_3B; break;
  851. case 4096: type = LLM_TYPE_7B; break;
  852. default: type = LLM_TYPE_UNKNOWN;
  853. } break;
  854. default: type = LLM_TYPE_UNKNOWN;
  855. }
  856. // TODO: become GGUF KV parameter
  857. hparams.f_max_alibi_bias = 8.0f;
  858. } break;
  859. case LLM_ARCH_MPT:
  860. {
  861. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  862. ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV, hparams.f_clamp_kqv, false);
  863. ml.get_key(LLM_KV_ATTENTION_MAX_ALIBI_BIAS, hparams.f_max_alibi_bias);
  864. switch (hparams.n_layer) {
  865. case 32: type = LLM_TYPE_7B; break;
  866. case 48: type = LLM_TYPE_30B; break;
  867. default: type = LLM_TYPE_UNKNOWN;
  868. }
  869. } break;
  870. case LLM_ARCH_STABLELM:
  871. {
  872. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  873. switch (hparams.n_layer) {
  874. case 24: type = LLM_TYPE_1B; break;
  875. case 32: type = LLM_TYPE_3B; break;
  876. case 40: type = LLM_TYPE_12B; break;
  877. default: type = LLM_TYPE_UNKNOWN;
  878. }
  879. } break;
  880. case LLM_ARCH_QWEN:
  881. {
  882. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  883. switch (hparams.n_layer) {
  884. case 32: type = LLM_TYPE_7B; break;
  885. case 40: type = LLM_TYPE_13B; break;
  886. default: type = LLM_TYPE_UNKNOWN;
  887. }
  888. } break;
  889. case LLM_ARCH_QWEN2VL:
  890. {
  891. ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, true);
  892. }
  893. // fall through
  894. case LLM_ARCH_QWEN2:
  895. {
  896. ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
  897. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  898. switch (hparams.n_layer) {
  899. case 24: type = hparams.n_embd == 1024 ? LLM_TYPE_0_5B : LLM_TYPE_1B; break;
  900. case 28: type = hparams.n_embd == 1536 ? LLM_TYPE_1_5B : LLM_TYPE_7B; break;
  901. case 32: type = LLM_TYPE_7B; break;
  902. case 36: type = LLM_TYPE_3B; break;
  903. case 40: type = hparams.n_head() == 20 ? LLM_TYPE_4B : LLM_TYPE_13B; break;
  904. case 48: type = LLM_TYPE_14B; break;
  905. case 64: type = LLM_TYPE_32B; break;
  906. case 80: type = LLM_TYPE_70B; break;
  907. default: type = LLM_TYPE_UNKNOWN;
  908. }
  909. } break;
  910. case LLM_ARCH_DREAM:
  911. {
  912. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  913. // Dream models are primarily 7B with 28 layers
  914. switch (hparams.n_layer) {
  915. case 28:
  916. type = LLM_TYPE_7B;
  917. break;
  918. default:
  919. type = LLM_TYPE_UNKNOWN;
  920. }
  921. // Set non-causal attention for diffusion models
  922. hparams.causal_attn = false;
  923. }
  924. break;
  925. case LLM_ARCH_LLADA:
  926. {
  927. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  928. // LLaDA-8B has 32 layers, similar to LLaMA but for diffusion
  929. switch (hparams.n_layer) {
  930. case 32:
  931. type = LLM_TYPE_8B;
  932. break;
  933. default:
  934. type = LLM_TYPE_UNKNOWN;
  935. }
  936. // Set non-causal attention for diffusion models
  937. hparams.causal_attn = false;
  938. }
  939. break;
  940. case LLM_ARCH_LLADA_MOE:
  941. {
  942. ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
  943. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  944. // diffusion language model uses non-causal attention
  945. hparams.causal_attn = false;
  946. switch (hparams.n_layer) {
  947. case 16: type = LLM_TYPE_A1_7B; break;
  948. default: type = LLM_TYPE_UNKNOWN;
  949. }
  950. } break;
  951. case LLM_ARCH_QWEN2MOE:
  952. {
  953. ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
  954. ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false);
  955. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  956. switch (hparams.n_layer) {
  957. case 24: type = LLM_TYPE_A2_7B; break;
  958. case 28: type = LLM_TYPE_57B_A14B; break;
  959. default: type = LLM_TYPE_UNKNOWN;
  960. }
  961. } break;
  962. case LLM_ARCH_QWEN3:
  963. {
  964. ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
  965. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  966. switch (hparams.n_layer) {
  967. case 28: type = hparams.n_embd == 1024 ? LLM_TYPE_0_6B : LLM_TYPE_1_7B; break;
  968. case 36: type = hparams.n_embd == 2560 ? LLM_TYPE_4B : LLM_TYPE_8B; break;
  969. case 40: type = LLM_TYPE_14B; break;
  970. case 64: type = LLM_TYPE_32B; break;
  971. default: type = LLM_TYPE_UNKNOWN;
  972. }
  973. } break;
  974. case LLM_ARCH_QWEN3MOE:
  975. {
  976. ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
  977. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  978. switch (hparams.n_layer) {
  979. case 48: type = LLM_TYPE_30B_A3B; break;
  980. case 94: type = LLM_TYPE_235B_A22B; break;
  981. default: type = LLM_TYPE_UNKNOWN;
  982. }
  983. } break;
  984. case LLM_ARCH_PHI2:
  985. {
  986. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  987. switch (hparams.n_layer) {
  988. case 24: type = LLM_TYPE_1B; break;
  989. case 32: type = LLM_TYPE_3B; break;
  990. default: type = LLM_TYPE_UNKNOWN;
  991. }
  992. } break;
  993. case LLM_ARCH_PHI3:
  994. {
  995. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  996. switch (hparams.n_layer) {
  997. case 24: type = LLM_TYPE_1B; break;
  998. case 32: type = LLM_TYPE_3B; break;
  999. case 40: type = LLM_TYPE_14B; break;
  1000. default: type = LLM_TYPE_UNKNOWN;
  1001. }
  1002. const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
  1003. if (found_swa && hparams.n_swa > 0) {
  1004. LLAMA_LOG_WARN("%s: Phi SWA is currently disabled - results might be suboptimal for some models (see %s)\n",
  1005. __func__, "https://github.com/ggml-org/llama.cpp/pull/13676");
  1006. // TODO: fix conversion scripts to correctly populate `n_swa` and `n_swa_pattern`
  1007. hparams.swa_type = LLAMA_SWA_TYPE_NONE;
  1008. hparams.n_swa = 0;
  1009. hparams.set_swa_pattern(1);
  1010. }
  1011. } break;
  1012. case LLM_ARCH_PHIMOE:
  1013. {
  1014. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1015. switch (hparams.n_layer) {
  1016. case 32: type = LLM_TYPE_16x3_8B; break;
  1017. default: type = LLM_TYPE_UNKNOWN;
  1018. }
  1019. } break;
  1020. case LLM_ARCH_PLAMO:
  1021. {
  1022. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1023. switch (hparams.n_layer) {
  1024. case 40: type = LLM_TYPE_13B; break;
  1025. default: type = LLM_TYPE_UNKNOWN;
  1026. }
  1027. } break;
  1028. case LLM_ARCH_PLAMO2:
  1029. {
  1030. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1031. // Load Mamba SSM parameters
  1032. ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv);
  1033. ml.get_key(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner);
  1034. ml.get_key(LLM_KV_SSM_STATE_SIZE, hparams.ssm_d_state);
  1035. ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
  1036. ml.get_key(LLM_KV_SSM_GROUP_COUNT, hparams.ssm_n_group);
  1037. for (uint32_t i = 0; i < hparams.n_layer; ++i) {
  1038. hparams.recurrent_layer_arr[i] = hparams.n_head_kv(i) == 0;
  1039. }
  1040. switch (hparams.n_layer) {
  1041. case 16: type = LLM_TYPE_1B; break;
  1042. case 32:
  1043. if (hparams.n_embd == 2048) {
  1044. type = LLM_TYPE_2B;
  1045. } else if (hparams.n_embd == 4096) {
  1046. type = LLM_TYPE_8B;
  1047. }
  1048. break;
  1049. default: type = LLM_TYPE_UNKNOWN;
  1050. }
  1051. } break;
  1052. case LLM_ARCH_GPT2:
  1053. {
  1054. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  1055. switch (hparams.n_layer) {
  1056. case 12: type = LLM_TYPE_SMALL; break;
  1057. case 24: type = LLM_TYPE_MEDIUM; break;
  1058. case 36: type = LLM_TYPE_LARGE; break;
  1059. case 48: type = LLM_TYPE_XL; break;
  1060. default: type = LLM_TYPE_UNKNOWN;
  1061. }
  1062. } break;
  1063. case LLM_ARCH_CODESHELL:
  1064. {
  1065. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  1066. switch (hparams.n_layer) {
  1067. case 42: type = LLM_TYPE_7B; break;
  1068. default: type = LLM_TYPE_UNKNOWN;
  1069. }
  1070. } break;
  1071. case LLM_ARCH_ORION:
  1072. {
  1073. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  1074. switch (hparams.n_layer) {
  1075. case 40: type = LLM_TYPE_14B; break;
  1076. default: type = LLM_TYPE_UNKNOWN;
  1077. }
  1078. } break;
  1079. case LLM_ARCH_INTERNLM2:
  1080. {
  1081. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1082. switch (hparams.n_layer) {
  1083. case 32: type = LLM_TYPE_7B; break;
  1084. case 48: type = LLM_TYPE_20B; break;
  1085. default: type = LLM_TYPE_UNKNOWN;
  1086. }
  1087. } break;
  1088. case LLM_ARCH_GEMMA:
  1089. {
  1090. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1091. switch (hparams.n_layer) {
  1092. case 18: type = LLM_TYPE_2B; break;
  1093. case 28: type = LLM_TYPE_7B; break;
  1094. default: type = LLM_TYPE_UNKNOWN;
  1095. }
  1096. } break;
  1097. case LLM_ARCH_GEMMA2:
  1098. {
  1099. hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
  1100. hparams.n_swa = 4096; // default value of gemma 2
  1101. hparams.set_swa_pattern(2);
  1102. hparams.attn_soft_cap = true;
  1103. ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
  1104. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1105. ml.get_key(LLM_KV_ATTN_LOGIT_SOFTCAPPING, hparams.f_attn_logit_softcapping, false);
  1106. ml.get_key(LLM_KV_FINAL_LOGIT_SOFTCAPPING, hparams.f_final_logit_softcapping, false);
  1107. switch (hparams.n_layer) {
  1108. case 26: type = LLM_TYPE_2B; break;
  1109. case 42: type = LLM_TYPE_9B; break;
  1110. case 46: type = LLM_TYPE_27B; break;
  1111. default: type = LLM_TYPE_UNKNOWN;
  1112. }
  1113. // ref: https://github.com/google/gemma_pytorch/blob/014acb7ac4563a5f77c76d7ff98f31b568c16508/gemma/config.py#L173
  1114. hparams.f_attention_scale = type == LLM_TYPE_27B
  1115. ? 1.0f / std::sqrt(float(hparams.n_embd / hparams.n_head(0)))
  1116. : 1.0f / std::sqrt(float(hparams.n_embd_head_k));
  1117. } break;
  1118. case LLM_ARCH_GEMMA3:
  1119. {
  1120. hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
  1121. hparams.set_swa_pattern(6);
  1122. hparams.rope_freq_base_train_swa = 10000.0f;
  1123. hparams.rope_freq_scale_train_swa = 1.0f;
  1124. ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
  1125. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1126. switch (hparams.n_layer) {
  1127. case 18: type = LLM_TYPE_270M; break;
  1128. case 26: type = LLM_TYPE_1B; break;
  1129. case 34: type = LLM_TYPE_4B; break;
  1130. case 48: type = LLM_TYPE_12B; break;
  1131. case 62: type = LLM_TYPE_27B; break;
  1132. default: type = LLM_TYPE_UNKNOWN;
  1133. }
  1134. // ref: https://github.com/google/gemma_pytorch/blob/014acb7ac4563a5f77c76d7ff98f31b568c16508/gemma/config.py#L289
  1135. hparams.f_attention_scale = type == LLM_TYPE_27B
  1136. ? 1.0f / std::sqrt(float(hparams.n_embd / hparams.n_head(0)))
  1137. : 1.0f / std::sqrt(float(hparams.n_embd_head_k));
  1138. } break;
  1139. case LLM_ARCH_GEMMA3N:
  1140. {
  1141. hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
  1142. hparams.set_swa_pattern(5);
  1143. hparams.n_layer_kv_from_start = 20;
  1144. hparams.rope_freq_base_train_swa = 10000.0f;
  1145. hparams.rope_freq_scale_train_swa = 1.0f;
  1146. hparams.f_attention_scale = 1.0f;
  1147. ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
  1148. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1149. switch (hparams.n_layer) {
  1150. case 30: type = LLM_TYPE_E2B; break;
  1151. case 35: type = LLM_TYPE_E4B; break;
  1152. default: type = LLM_TYPE_UNKNOWN;
  1153. }
  1154. } break;
  1155. case LLM_ARCH_GEMMA_EMBEDDING:
  1156. {
  1157. hparams.swa_type = LLAMA_SWA_TYPE_SYMMETRIC;
  1158. hparams.set_swa_pattern(6);
  1159. hparams.causal_attn = false; // embeddings do not use causal attention
  1160. hparams.rope_freq_base_train_swa = 10000.0f;
  1161. hparams.rope_freq_scale_train_swa = 1.0f;
  1162. ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
  1163. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1164. ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
  1165. switch (hparams.n_layer) {
  1166. case 24: type = LLM_TYPE_0_3B; break;
  1167. default: type = LLM_TYPE_UNKNOWN;
  1168. }
  1169. hparams.f_attention_scale = 1.0f / std::sqrt(float(hparams.n_embd_head_k));
  1170. } break;
  1171. case LLM_ARCH_STARCODER2:
  1172. {
  1173. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  1174. switch (hparams.n_layer) {
  1175. case 30: type = LLM_TYPE_3B; break;
  1176. case 32: type = LLM_TYPE_7B; break;
  1177. case 40: type = LLM_TYPE_15B; break;
  1178. case 52: type = LLM_TYPE_20B; break; // granite
  1179. case 88: type = LLM_TYPE_34B; break; // granite
  1180. default: type = LLM_TYPE_UNKNOWN;
  1181. }
  1182. } break;
  1183. case LLM_ARCH_MAMBA:
  1184. {
  1185. ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv);
  1186. ml.get_key(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner);
  1187. ml.get_key(LLM_KV_SSM_STATE_SIZE, hparams.ssm_d_state);
  1188. ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
  1189. ml.get_key(LLM_KV_SSM_DT_B_C_RMS, hparams.ssm_dt_b_c_rms, false);
  1190. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1191. switch (hparams.n_layer) {
  1192. case 24:
  1193. switch (hparams.n_embd) {
  1194. case 768: type = LLM_TYPE_SMALL; break;
  1195. default: type = LLM_TYPE_UNKNOWN;
  1196. } break;
  1197. case 48:
  1198. switch (hparams.n_embd) {
  1199. case 1024: type = LLM_TYPE_MEDIUM; break;
  1200. case 1536: type = LLM_TYPE_LARGE; break;
  1201. case 2048: type = LLM_TYPE_XL; break;
  1202. default: type = LLM_TYPE_UNKNOWN;
  1203. } break;
  1204. case 64:
  1205. switch (hparams.n_embd) {
  1206. case 2560: type = LLM_TYPE_3B; break;
  1207. default: type = LLM_TYPE_UNKNOWN;
  1208. } break;
  1209. default: type = LLM_TYPE_UNKNOWN;
  1210. }
  1211. } break;
  1212. case LLM_ARCH_MAMBA2:
  1213. {
  1214. ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv);
  1215. ml.get_key(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner);
  1216. ml.get_key(LLM_KV_SSM_STATE_SIZE, hparams.ssm_d_state);
  1217. ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
  1218. ml.get_key(LLM_KV_SSM_GROUP_COUNT, hparams.ssm_n_group);
  1219. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1220. switch (hparams.n_layer) {
  1221. case 24:
  1222. switch (hparams.n_embd) {
  1223. case 768: type = LLM_TYPE_SMALL; break;
  1224. default: type = LLM_TYPE_UNKNOWN;
  1225. } break;
  1226. case 48:
  1227. switch (hparams.n_embd) {
  1228. case 1024: type = LLM_TYPE_MEDIUM; break;
  1229. case 1536: type = LLM_TYPE_LARGE; break;
  1230. case 2048: type = LLM_TYPE_XL; break;
  1231. default: type = LLM_TYPE_UNKNOWN;
  1232. } break;
  1233. case 64:
  1234. switch (hparams.n_embd) {
  1235. case 2560: type = LLM_TYPE_3B; break;
  1236. case 4096: type = LLM_TYPE_7B; break;
  1237. default: type = LLM_TYPE_UNKNOWN;
  1238. } break;
  1239. default: type = LLM_TYPE_UNKNOWN;
  1240. }
  1241. } break;
  1242. case LLM_ARCH_JAMBA:
  1243. {
  1244. ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv);
  1245. ml.get_key(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner);
  1246. ml.get_key(LLM_KV_SSM_STATE_SIZE, hparams.ssm_d_state);
  1247. ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
  1248. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1249. for (uint32_t i = 0; i < hparams.n_layer; ++i) {
  1250. hparams.recurrent_layer_arr[i] = hparams.n_head_kv(i) == 0;
  1251. }
  1252. switch (hparams.n_layer) {
  1253. // TODO: Jamba layers are a bit heterogenous, so naming this is hard.
  1254. case 12: // 900M 8x???M
  1255. case 32: // 51B 16x?B
  1256. default: type = LLM_TYPE_UNKNOWN;
  1257. }
  1258. } break;
  1259. case LLM_ARCH_XVERSE:
  1260. {
  1261. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1262. switch (hparams.n_layer) {
  1263. case 32: type = LLM_TYPE_7B; break;
  1264. case 40: type = LLM_TYPE_13B; break;
  1265. case 80: type = LLM_TYPE_65B; break;
  1266. default: type = LLM_TYPE_UNKNOWN;
  1267. }
  1268. } break;
  1269. case LLM_ARCH_COMMAND_R:
  1270. {
  1271. ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
  1272. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  1273. switch (hparams.n_layer) {
  1274. case 40: type = LLM_TYPE_35B; break;
  1275. default: type = LLM_TYPE_UNKNOWN;
  1276. }
  1277. } break;
  1278. case LLM_ARCH_COHERE2:
  1279. {
  1280. hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
  1281. hparams.set_swa_pattern(4);
  1282. ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
  1283. ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
  1284. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  1285. switch (hparams.n_layer) {
  1286. case 32: type = LLM_TYPE_8B; break;
  1287. default: type = LLM_TYPE_UNKNOWN;
  1288. }
  1289. } break;
  1290. case LLM_ARCH_DBRX:
  1291. {
  1292. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  1293. ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV, hparams.f_clamp_kqv);
  1294. switch (hparams.n_layer) {
  1295. case 40: type = LLM_TYPE_16x12B; break;
  1296. default: type = LLM_TYPE_UNKNOWN;
  1297. }
  1298. } break;
  1299. case LLM_ARCH_OLMO:
  1300. {
  1301. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  1302. ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV, hparams.f_clamp_kqv, false);
  1303. switch (hparams.n_layer) {
  1304. case 22: type = LLM_TYPE_1B; break;
  1305. case 32: type = LLM_TYPE_7B; break;
  1306. case 80: type = LLM_TYPE_70B; break;
  1307. default: type = LLM_TYPE_UNKNOWN;
  1308. }
  1309. } break;
  1310. case LLM_ARCH_OLMO2:
  1311. {
  1312. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1313. const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
  1314. if (found_swa && hparams.n_swa > 0) {
  1315. hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
  1316. hparams.set_swa_pattern(4);
  1317. } else {
  1318. hparams.swa_type = LLAMA_SWA_TYPE_NONE;
  1319. }
  1320. switch (hparams.n_layer) {
  1321. case 16: type = LLM_TYPE_1B; break;
  1322. case 32: type = LLM_TYPE_7B; break;
  1323. case 40: type = LLM_TYPE_13B; break;
  1324. case 64: type = LLM_TYPE_32B; break;
  1325. default: type = LLM_TYPE_UNKNOWN;
  1326. }
  1327. } break;
  1328. case LLM_ARCH_SEED_OSS:
  1329. {
  1330. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1331. switch (hparams.n_layer) {
  1332. case 64: type = LLM_TYPE_36B; break;
  1333. default: type = LLM_TYPE_UNKNOWN;
  1334. }
  1335. } break;
  1336. case LLM_ARCH_OLMOE:
  1337. {
  1338. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1339. switch (hparams.n_layer) {
  1340. case 16: type = LLM_TYPE_A1_7B; break;
  1341. default: type = LLM_TYPE_UNKNOWN;
  1342. }
  1343. } break;
  1344. case LLM_ARCH_OPENELM:
  1345. {
  1346. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1347. switch (hparams.n_layer) {
  1348. case 16: type = LLM_TYPE_270M; break;
  1349. case 20: type = LLM_TYPE_450M; break;
  1350. case 28: type = LLM_TYPE_1B; break;
  1351. case 36: type = LLM_TYPE_3B; break;
  1352. default: type = LLM_TYPE_UNKNOWN;
  1353. }
  1354. } break;
  1355. case LLM_ARCH_GPTNEOX:
  1356. {
  1357. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  1358. ml.get_key(LLM_KV_USE_PARALLEL_RESIDUAL, hparams.use_par_res);
  1359. switch (hparams.n_layer) {
  1360. case 6:
  1361. switch (hparams.n_ff()) {
  1362. case 512: type = LLM_TYPE_14M; break;
  1363. case 2048: type = LLM_TYPE_70M; break;
  1364. default: type = LLM_TYPE_UNKNOWN;
  1365. } break;
  1366. case 12:
  1367. switch (hparams.n_ff()) {
  1368. case 3072: type = LLM_TYPE_160M; break;
  1369. default: type = LLM_TYPE_UNKNOWN;
  1370. } break;
  1371. case 16:
  1372. switch (hparams.n_ff()) {
  1373. case 8192: type = LLM_TYPE_1B; break;
  1374. default: type = LLM_TYPE_UNKNOWN;
  1375. } break;
  1376. case 24:
  1377. switch (hparams.n_ff()) {
  1378. case 4096: type = LLM_TYPE_410M; break;
  1379. case 8192: type = LLM_TYPE_1_4B; break;
  1380. default: type = LLM_TYPE_UNKNOWN;
  1381. } break;
  1382. case 32:
  1383. switch (hparams.n_ff()) {
  1384. case 10240: type = LLM_TYPE_2_8B; break;
  1385. case 16384: type = LLM_TYPE_6_9B; break;
  1386. default: type = LLM_TYPE_UNKNOWN;
  1387. } break;
  1388. case 36:
  1389. switch (hparams.n_ff()) {
  1390. case 20480: type = LLM_TYPE_12B; break;
  1391. default: type = LLM_TYPE_UNKNOWN;
  1392. } break;
  1393. case 44:
  1394. switch (hparams.n_ff()) {
  1395. case 24576: type = LLM_TYPE_20B; break;
  1396. default: type = LLM_TYPE_UNKNOWN;
  1397. } break;
  1398. default: type = LLM_TYPE_UNKNOWN;
  1399. }
  1400. } break;
  1401. case LLM_ARCH_ARCTIC:
  1402. {
  1403. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1404. if (hparams.n_expert == 128) {
  1405. switch (hparams.n_layer) {
  1406. case 35: type = LLM_TYPE_10B_128x3_66B; break;
  1407. default: type = LLM_TYPE_UNKNOWN;
  1408. }
  1409. } else {
  1410. type = LLM_TYPE_UNKNOWN;
  1411. }
  1412. } break;
  1413. case LLM_ARCH_DEEPSEEK:
  1414. {
  1415. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1416. ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
  1417. ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
  1418. ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
  1419. ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
  1420. switch (hparams.n_layer) {
  1421. case 28: type = LLM_TYPE_20B; break;
  1422. default: type = LLM_TYPE_UNKNOWN;
  1423. }
  1424. } break;
  1425. case LLM_ARCH_DEEPSEEK2:
  1426. {
  1427. bool is_lite = (hparams.n_layer == 27);
  1428. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1429. ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
  1430. if (!is_lite) {
  1431. ml.get_key(LLM_KV_ATTENTION_Q_LORA_RANK, hparams.n_lora_q);
  1432. }
  1433. ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK, hparams.n_lora_kv);
  1434. ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH_MLA, hparams.n_embd_head_k_mla, false);
  1435. ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH_MLA, hparams.n_embd_head_v_mla, false);
  1436. ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
  1437. ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
  1438. ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
  1439. ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
  1440. ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false);
  1441. if (hparams.expert_gating_func == LLAMA_EXPERT_GATING_FUNC_TYPE_NONE) {
  1442. // for compatibility with existing DeepSeek V2 and V2.5 GGUFs
  1443. // that have no expert_gating_func model parameter set
  1444. hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX;
  1445. }
  1446. ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul, false);
  1447. switch (hparams.n_layer) {
  1448. case 27: type = LLM_TYPE_16B; break;
  1449. case 60: type = LLM_TYPE_236B; break;
  1450. case 61: type = LLM_TYPE_671B; break;
  1451. default: type = LLM_TYPE_UNKNOWN;
  1452. }
  1453. } break;
  1454. case LLM_ARCH_PLM:
  1455. {
  1456. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1457. ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK, hparams.n_lora_kv);
  1458. switch (hparams.n_layer) {
  1459. case 32: type = LLM_TYPE_1_8B; break;
  1460. default: type = LLM_TYPE_UNKNOWN;
  1461. }
  1462. } break;
  1463. case LLM_ARCH_CHATGLM:
  1464. {
  1465. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1466. switch (hparams.n_layer) {
  1467. case 28: {
  1468. if (hparams.n_head(0) == 16) {
  1469. type = LLM_TYPE_1_5B;
  1470. } else {
  1471. type = LLM_TYPE_6B;
  1472. }
  1473. } break;
  1474. case 40: {
  1475. if (hparams.n_head(0) == 24) {
  1476. type = LLM_TYPE_4B;
  1477. } else {
  1478. type = LLM_TYPE_9B;
  1479. }
  1480. } break;
  1481. default: type = LLM_TYPE_UNKNOWN;
  1482. }
  1483. } break;
  1484. case LLM_ARCH_GLM4:
  1485. {
  1486. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1487. switch (hparams.n_layer) {
  1488. case 40: type = LLM_TYPE_9B; break;
  1489. case 61: type = LLM_TYPE_32B; break;
  1490. default: type = LLM_TYPE_UNKNOWN;
  1491. }
  1492. } break;
  1493. case LLM_ARCH_GLM4_MOE:
  1494. {
  1495. ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
  1496. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1497. // MoE parameters
  1498. ml.get_key(LLM_KV_EXPERT_COUNT, hparams.n_expert);
  1499. ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used);
  1500. ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
  1501. ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead, false);
  1502. ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
  1503. ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
  1504. // Expert gating function (GLM-4.5 uses sigmoid)
  1505. ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false);
  1506. if (hparams.expert_gating_func == LLAMA_EXPERT_GATING_FUNC_TYPE_NONE) {
  1507. hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID;
  1508. }
  1509. // NextN/MTP parameters
  1510. ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.nextn_predict_layers, false);
  1511. // TODO: when MTP is implemented, this should probably be updated if needed
  1512. hparams.n_layer_kv_from_start = hparams.n_layer - hparams.nextn_predict_layers;
  1513. switch (hparams.n_layer) {
  1514. case 47: type = LLM_TYPE_106B_A12B; break; // GLM-4.5-Air (46 layers + 1 NextN layer)
  1515. case 93: type = LLM_TYPE_355B_A32B; break; // GLM-4.5 (92 layers + 1 NextN layer)
  1516. default: type = LLM_TYPE_UNKNOWN;
  1517. }
  1518. } break;
  1519. case LLM_ARCH_BITNET:
  1520. {
  1521. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1522. switch (hparams.n_layer) {
  1523. case 26: type = LLM_TYPE_3B; break;
  1524. default: type = LLM_TYPE_UNKNOWN;
  1525. }
  1526. } break;
  1527. case LLM_ARCH_T5:
  1528. {
  1529. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1530. ml.get_key(LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, hparams.n_rel_attn_bkts);
  1531. uint32_t dec_start_token_id;
  1532. if (ml.get_key(LLM_KV_DECODER_START_TOKEN_ID, dec_start_token_id, false)) {
  1533. hparams.dec_start_token_id = dec_start_token_id;
  1534. }
  1535. hparams.dec_n_layer = hparams.n_layer;
  1536. ml.get_key(LLM_KV_DECODER_BLOCK_COUNT, hparams.dec_n_layer, false);
  1537. switch (hparams.n_layer) {
  1538. case 6: type = LLM_TYPE_60M; break; // t5-small
  1539. case 8: type = LLM_TYPE_80M; break; // flan-t5-small
  1540. case 12:
  1541. switch (hparams.n_ff()) {
  1542. case 3072: type = LLM_TYPE_220M; break; // t5-base
  1543. case 2048: type = LLM_TYPE_250M; break; // flan-t5-base
  1544. default: type = LLM_TYPE_UNKNOWN;
  1545. } break;
  1546. case 24:
  1547. switch (hparams.n_ff()) {
  1548. case 4096: type = LLM_TYPE_770M; break; // t5-large
  1549. case 2816: type = LLM_TYPE_780M; break; // flan-t5-large
  1550. case 16384: type = LLM_TYPE_3B; break; // t5-3b
  1551. case 5120: type = LLM_TYPE_3B; break; // flan-t5-xl
  1552. case 65536: type = LLM_TYPE_11B; break; // t5-11b
  1553. case 10240: type = LLM_TYPE_11B; break; // flan-t5-xxl
  1554. default: type = LLM_TYPE_UNKNOWN;
  1555. } break;
  1556. default: type = LLM_TYPE_UNKNOWN;
  1557. }
  1558. } break;
  1559. case LLM_ARCH_T5ENCODER:
  1560. {
  1561. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1562. ml.get_key(LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, hparams.n_rel_attn_bkts);
  1563. type = LLM_TYPE_UNKNOWN;
  1564. } break;
  1565. case LLM_ARCH_JAIS:
  1566. {
  1567. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  1568. ml.get_key(LLM_KV_ATTENTION_MAX_ALIBI_BIAS, hparams.f_max_alibi_bias);
  1569. switch (hparams.n_layer) {
  1570. case 24: type = LLM_TYPE_1_3B; break;
  1571. case 40: type = LLM_TYPE_13B; break;
  1572. /* TODO: add variants */
  1573. default: type = LLM_TYPE_UNKNOWN;
  1574. }
  1575. } break;
  1576. case LLM_ARCH_NEMOTRON:
  1577. {
  1578. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  1579. switch (hparams.n_layer) {
  1580. case 32: type = LLM_TYPE_4B; break;
  1581. default: type = LLM_TYPE_UNKNOWN;
  1582. }
  1583. } break;
  1584. case LLM_ARCH_NEMOTRON_H:
  1585. {
  1586. ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv);
  1587. ml.get_key(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner);
  1588. ml.get_key(LLM_KV_SSM_STATE_SIZE, hparams.ssm_d_state);
  1589. ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
  1590. ml.get_key(LLM_KV_SSM_GROUP_COUNT, hparams.ssm_n_group);
  1591. // A layer is recurrent IFF the n_head_kv value is set to 0 and
  1592. // the n_ff value is set to 0
  1593. for (uint32_t i = 0; i < hparams.n_layer; ++i) {
  1594. hparams.recurrent_layer_arr[i] = (hparams.n_head_kv(i) == 0 && hparams.n_ff(i) == 0);
  1595. }
  1596. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1597. switch (hparams.n_layer) {
  1598. case 56: type = LLM_TYPE_9B; break;
  1599. default: type = LLM_TYPE_UNKNOWN;
  1600. }
  1601. } break;
  1602. case LLM_ARCH_EXAONE:
  1603. {
  1604. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1605. switch (hparams.n_layer) {
  1606. case 32: type = LLM_TYPE_8B; break;
  1607. default: type = LLM_TYPE_UNKNOWN;
  1608. }
  1609. } break;
  1610. case LLM_ARCH_EXAONE4:
  1611. {
  1612. if (hparams.n_layer == 64) { // 32B
  1613. hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
  1614. hparams.n_swa = 4096;
  1615. hparams.set_swa_pattern(4);
  1616. }
  1617. ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
  1618. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1619. switch (hparams.n_layer) {
  1620. case 30: type = LLM_TYPE_1_2B; break;
  1621. case 64: type = LLM_TYPE_32B; break;
  1622. default: type = LLM_TYPE_UNKNOWN;
  1623. }
  1624. } break;
  1625. case LLM_ARCH_RWKV6:
  1626. case LLM_ARCH_RWKV6QWEN2:
  1627. {
  1628. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps, false);
  1629. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps, false);
  1630. ml.get_key(LLM_KV_WKV_HEAD_SIZE, hparams.wkv_head_size);
  1631. ml.get_key(LLM_KV_TIME_MIX_EXTRA_DIM, hparams.time_mix_extra_dim);
  1632. ml.get_key(LLM_KV_TIME_DECAY_EXTRA_DIM, hparams.time_decay_extra_dim);
  1633. ml.get_key(LLM_KV_RESCALE_EVERY_N_LAYERS, hparams.rescale_every_n_layers, false);
  1634. ml.get_key(LLM_KV_TOKEN_SHIFT_COUNT, hparams.token_shift_count, false);
  1635. switch (hparams.n_layer) {
  1636. case 24: type = LLM_TYPE_1_6B; break;
  1637. case 32:
  1638. switch (hparams.n_embd) {
  1639. case 2560: type = LLM_TYPE_3B; break;
  1640. case 4096: type = LLM_TYPE_7B; break;
  1641. default: type = LLM_TYPE_UNKNOWN;
  1642. } break;
  1643. case 61: type = LLM_TYPE_14B; break;
  1644. case 64: type = LLM_TYPE_32B; break;
  1645. default: type = LLM_TYPE_UNKNOWN;
  1646. }
  1647. } break;
  1648. case LLM_ARCH_RWKV7:
  1649. case LLM_ARCH_ARWKV7:
  1650. {
  1651. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps, false);
  1652. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps, false);
  1653. ml.get_key(LLM_KV_WKV_HEAD_SIZE, hparams.wkv_head_size);
  1654. ml.get_key(LLM_KV_ATTENTION_DECAY_LORA_RANK, hparams.n_lora_decay);
  1655. ml.get_key(LLM_KV_ATTENTION_ICLR_LORA_RANK, hparams.n_lora_iclr);
  1656. ml.get_key(LLM_KV_ATTENTION_VALUE_RESIDUAL_MIX_LORA_RANK, hparams.n_lora_value_res_mix);
  1657. ml.get_key(LLM_KV_ATTENTION_GATE_LORA_RANK, hparams.n_lora_gate, false);
  1658. ml.get_key(LLM_KV_TOKEN_SHIFT_COUNT, hparams.token_shift_count, false);
  1659. switch (hparams.n_layer) {
  1660. case 12:
  1661. switch (hparams.n_embd) {
  1662. case 768: type = LLM_TYPE_190M; break;
  1663. default: type = LLM_TYPE_UNKNOWN;
  1664. } break;
  1665. case 24:
  1666. switch (hparams.n_embd) {
  1667. case 1024: type = LLM_TYPE_450M; break;
  1668. case 2048: type = LLM_TYPE_1_5B; break;
  1669. default: type = LLM_TYPE_UNKNOWN;
  1670. } break;
  1671. case 28:
  1672. switch (hparams.n_embd) {
  1673. case 1536: type = LLM_TYPE_1_5B; break;
  1674. case 3584: type = LLM_TYPE_7B; break;
  1675. default: type = LLM_TYPE_UNKNOWN;
  1676. } break;
  1677. case 32:
  1678. switch (hparams.n_embd) {
  1679. case 2560: type = LLM_TYPE_2_9B; break;
  1680. case 4096: type = LLM_TYPE_7B; break;
  1681. default: type = LLM_TYPE_UNKNOWN;
  1682. } break;
  1683. case 61:
  1684. switch (hparams.n_embd) {
  1685. case 4096: type = LLM_TYPE_14B; break;
  1686. default: type = LLM_TYPE_UNKNOWN;
  1687. } break;
  1688. default: type = LLM_TYPE_UNKNOWN;
  1689. }
  1690. } break;
  1691. case LLM_ARCH_GRANITE:
  1692. case LLM_ARCH_GRANITE_MOE:
  1693. {
  1694. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1695. ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
  1696. ml.get_key(LLM_KV_RESIDUAL_SCALE, hparams.f_residual_scale);
  1697. ml.get_key(LLM_KV_EMBEDDING_SCALE, hparams.f_embedding_scale);
  1698. ml.get_key(LLM_KV_ATTENTION_SCALE, hparams.f_attention_scale);
  1699. // Granite uses rope_finetuned as a switch for rope, so default to true
  1700. bool rope_finetuned = true;
  1701. ml.get_key(LLM_KV_ROPE_SCALING_FINETUNED, rope_finetuned, false);
  1702. hparams.rope_finetuned = rope_finetuned;
  1703. switch (hparams.n_layer) {
  1704. case 32: type = LLM_TYPE_3B; break;
  1705. case 40: type = LLM_TYPE_3B; break;
  1706. // Add additional layer/vocab/etc checks here for other model sizes
  1707. default: type = LLM_TYPE_UNKNOWN;
  1708. }
  1709. // For Granite MoE Shared
  1710. ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, /* required */ false);
  1711. } break;
  1712. case LLM_ARCH_GRANITE_HYBRID:
  1713. {
  1714. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1715. ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale, /* required */ false);
  1716. ml.get_key(LLM_KV_RESIDUAL_SCALE, hparams.f_residual_scale, /* required */ false);
  1717. ml.get_key(LLM_KV_EMBEDDING_SCALE, hparams.f_embedding_scale, /* required */ false);
  1718. ml.get_key(LLM_KV_ATTENTION_SCALE, hparams.f_attention_scale, /* required */ false);
  1719. ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv);
  1720. ml.get_key(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner);
  1721. ml.get_key(LLM_KV_SSM_STATE_SIZE, hparams.ssm_d_state);
  1722. ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
  1723. ml.get_key(LLM_KV_SSM_GROUP_COUNT, hparams.ssm_n_group);
  1724. // Granite uses rope_finetuned as a switch for rope, so default to true
  1725. bool rope_finetuned = true;
  1726. ml.get_key(LLM_KV_ROPE_SCALING_FINETUNED, rope_finetuned, false);
  1727. hparams.rope_finetuned = rope_finetuned;
  1728. // A layer is recurrent IFF the n_head_kv value is set to 0
  1729. for (uint32_t i = 0; i < hparams.n_layer; ++i) {
  1730. hparams.recurrent_layer_arr[i] = hparams.n_head_kv(i) == 0;
  1731. }
  1732. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1733. switch (hparams.n_layer) {
  1734. // TODO: Add llm type label (not sure this is useful)
  1735. default: type = LLM_TYPE_UNKNOWN;
  1736. }
  1737. // For Granite MoE Shared
  1738. ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, /* required */ false);
  1739. } break;
  1740. case LLM_ARCH_QWEN3NEXT:
  1741. {
  1742. ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
  1743. ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false);
  1744. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1745. // Load linear attention (gated delta net) parameters
  1746. ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv);
  1747. ml.get_key(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner);
  1748. ml.get_key(LLM_KV_SSM_STATE_SIZE, hparams.ssm_d_state);
  1749. ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
  1750. ml.get_key(LLM_KV_SSM_GROUP_COUNT, hparams.ssm_n_group);
  1751. // Mark recurrent layers (linear attention layers)
  1752. for (uint32_t i = 0; i < hparams.n_layer; ++i) {
  1753. hparams.recurrent_layer_arr[i] = ((i + 1) % 4 != 0); // TODO: extract the magic 4 from "full_attention_interval"
  1754. }
  1755. switch (hparams.n_layer) {
  1756. case 80: type = LLM_TYPE_80B_A3B; break;
  1757. default: type = LLM_TYPE_UNKNOWN;
  1758. }
  1759. } break;
  1760. case LLM_ARCH_CHAMELEON:
  1761. {
  1762. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1763. hparams.f_norm_eps = 1e-5; // eps for qk-norm, torch default
  1764. ml.get_key(LLM_KV_SWIN_NORM, hparams.swin_norm);
  1765. switch (hparams.n_layer) {
  1766. case 32: type = LLM_TYPE_7B; break;
  1767. case 48: type = LLM_TYPE_34B; break;
  1768. default: type = LLM_TYPE_UNKNOWN;
  1769. }
  1770. } break;
  1771. case LLM_ARCH_WAVTOKENIZER_DEC:
  1772. {
  1773. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  1774. ml.get_key(LLM_KV_ATTENTION_GROUPNORM_EPS, hparams.f_norm_group_eps);
  1775. ml.get_key(LLM_KV_ATTENTION_GROUPNORM_GROUPS, hparams.n_norm_groups);
  1776. ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
  1777. } break;
  1778. case LLM_ARCH_BAILINGMOE:
  1779. {
  1780. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1781. ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
  1782. ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
  1783. ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
  1784. ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
  1785. ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
  1786. switch (hparams.n_layer) {
  1787. case 28: type = LLM_TYPE_16B; break;
  1788. case 88: type = LLM_TYPE_290B; break;
  1789. default: type = LLM_TYPE_UNKNOWN;
  1790. }
  1791. } break;
  1792. case LLM_ARCH_DOTS1:
  1793. {
  1794. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1795. ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
  1796. ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
  1797. ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
  1798. ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
  1799. ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
  1800. ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false);
  1801. switch (hparams.n_layer) {
  1802. case 62: type = LLM_TYPE_142B; break;
  1803. default: type = LLM_TYPE_UNKNOWN;
  1804. }
  1805. } break;
  1806. case LLM_ARCH_ERNIE4_5:
  1807. case LLM_ARCH_ERNIE4_5_MOE:
  1808. {
  1809. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1810. if (arch == LLM_ARCH_ERNIE4_5_MOE) {
  1811. ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
  1812. ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false);
  1813. ml.get_key(LLM_KV_INTERLEAVE_MOE_LAYER_STEP, hparams.n_moe_layer_step);
  1814. ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
  1815. }
  1816. switch (hparams.n_layer) {
  1817. case 18: type = LLM_TYPE_0_3B; break;
  1818. case 28: type = LLM_TYPE_21B_A3B; break;
  1819. case 54: type = LLM_TYPE_300B_A47B; break;
  1820. default: type = LLM_TYPE_UNKNOWN;
  1821. }
  1822. } break;
  1823. case LLM_ARCH_FALCON_H1:
  1824. {
  1825. // Common parameters
  1826. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1827. // SSM parameters
  1828. ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv);
  1829. ml.get_key(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner);
  1830. ml.get_key(LLM_KV_SSM_STATE_SIZE, hparams.ssm_d_state);
  1831. ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
  1832. ml.get_key(LLM_KV_SSM_GROUP_COUNT, hparams.ssm_n_group);
  1833. std::fill(hparams.recurrent_layer_arr.begin(), hparams.recurrent_layer_arr.end(), true);
  1834. switch (hparams.n_layer) {
  1835. case 36:
  1836. type = LLM_TYPE_0_5B; break;
  1837. case 24:
  1838. type = LLM_TYPE_1_5B; break;
  1839. case 66:
  1840. type = LLM_TYPE_1B; break;
  1841. case 32:
  1842. type = LLM_TYPE_3B; break;
  1843. case 44:
  1844. type = LLM_TYPE_7B; break;
  1845. case 72:
  1846. type = LLM_TYPE_34B; break;
  1847. default:
  1848. type = LLM_TYPE_UNKNOWN;
  1849. }
  1850. } break;
  1851. case LLM_ARCH_HUNYUAN_MOE:
  1852. {
  1853. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1854. ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
  1855. ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp);
  1856. switch (hparams.n_layer) {
  1857. case 32: type = LLM_TYPE_A13B; break;
  1858. default: type = LLM_TYPE_UNKNOWN;
  1859. }
  1860. } break;
  1861. case LLM_ARCH_HUNYUAN_DENSE:
  1862. {
  1863. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1864. switch (hparams.n_embd) {
  1865. case 1024: type = LLM_TYPE_0_5B; break;
  1866. case 2048: type = LLM_TYPE_1_8B; break;
  1867. case 3072: type = LLM_TYPE_4B; break;
  1868. case 4096: type = LLM_TYPE_7B; break;
  1869. default: type = LLM_TYPE_UNKNOWN;
  1870. }
  1871. } break;
  1872. case LLM_ARCH_SMOLLM3:
  1873. {
  1874. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1875. hparams.n_no_rope_layer_step = 4;
  1876. switch (hparams.n_layer) {
  1877. case 36: type = LLM_TYPE_3B; break;
  1878. default: type = LLM_TYPE_UNKNOWN;
  1879. }
  1880. } break;
  1881. case LLM_ARCH_OPENAI_MOE:
  1882. {
  1883. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1884. ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
  1885. ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
  1886. hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
  1887. hparams.set_swa_pattern(2);
  1888. switch (hparams.n_layer) {
  1889. case 24: type = LLM_TYPE_20B; break;
  1890. case 36: type = LLM_TYPE_120B; break;
  1891. default: type = LLM_TYPE_UNKNOWN;
  1892. }
  1893. } break;
  1894. case LLM_ARCH_LFM2:
  1895. {
  1896. ml.get_key(LLM_KV_SHORTCONV_L_CACHE, hparams.n_shortconv_l_cache);
  1897. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1898. for (uint32_t il = 0; il < hparams.n_layer; ++il) {
  1899. hparams.recurrent_layer_arr[il] = hparams.n_head_kv(il) == 0;
  1900. }
  1901. switch (hparams.n_embd) {
  1902. case 1024: type = LLM_TYPE_350M; break;
  1903. case 1536: type = LLM_TYPE_700M; break;
  1904. case 2048: type = LLM_TYPE_1_2B; break;
  1905. default: type = LLM_TYPE_UNKNOWN;
  1906. }
  1907. } break;
  1908. case LLM_ARCH_SMALLTHINKER:
  1909. {
  1910. const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
  1911. if (found_swa && hparams.n_swa > 0) {
  1912. hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
  1913. hparams.n_swa = 4096;
  1914. hparams.set_swa_pattern(4, true);
  1915. } else {
  1916. hparams.swa_type = LLAMA_SWA_TYPE_NONE;
  1917. hparams.n_no_rope_layer_step = hparams.n_layer;
  1918. }
  1919. ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
  1920. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1921. ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false);
  1922. switch (hparams.n_layer) {
  1923. case 32: type = LLM_TYPE_4B; break;
  1924. case 52: type = LLM_TYPE_20B; break;
  1925. default: type = LLM_TYPE_UNKNOWN;
  1926. }
  1927. } break;
  1928. default: throw std::runtime_error("unsupported model architecture");
  1929. }
  1930. pimpl->n_bytes = ml.n_bytes;
  1931. pimpl->desc_str = arch_name() + " " + type_name() + " " + ml.ftype_name();
  1932. if (hparams.f_max_alibi_bias > 0.0f) {
  1933. hparams.use_alibi = true;
  1934. }
  1935. hparams.rope_type = llama_model_rope_type(this);
  1936. }
  1937. void llama_model::load_vocab(llama_model_loader & ml) {
  1938. const auto kv = LLM_KV(arch);
  1939. vocab.load(ml, kv);
  1940. }
  1941. bool llama_model::load_tensors(llama_model_loader & ml) {
  1942. const auto & split_mode = params.split_mode;
  1943. const auto & n_gpu_layers = params.n_gpu_layers;
  1944. const auto & use_mlock = params.use_mlock;
  1945. const auto & tensor_split = params.tensor_split;
  1946. const int n_layer = hparams.n_layer;
  1947. const bool use_mmap_buffer = true;
  1948. LLAMA_LOG_INFO("%s: loading model tensors, this can take a while... (mmap = %s)\n", __func__, ml.use_mmap ? "true" : "false");
  1949. // build a list of buffer types for the CPU and GPU devices
  1950. pimpl->cpu_buft_list = make_cpu_buft_list(devices, params.use_extra_bufts);
  1951. for (auto * dev : devices) {
  1952. buft_list_t buft_list = make_gpu_buft_list(dev, split_mode, tensor_split);
  1953. // add CPU buffer types as a fallback
  1954. buft_list.insert(buft_list.end(), pimpl->cpu_buft_list.begin(), pimpl->cpu_buft_list.end());
  1955. pimpl->gpu_buft_list.emplace(dev, std::move(buft_list));
  1956. }
  1957. // calculate the split points
  1958. bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + n_devices(), [](float x) { return x == 0.0f; });
  1959. std::vector<float> splits(n_devices());
  1960. if (all_zero) {
  1961. // default split, by free memory
  1962. for (size_t i = 0; i < n_devices(); ++i) {
  1963. ggml_backend_dev_t dev = devices[i];
  1964. size_t total;
  1965. size_t free;
  1966. ggml_backend_dev_memory(dev, &free, &total);
  1967. splits[i] = free;
  1968. }
  1969. } else {
  1970. std::copy(tensor_split, tensor_split + n_devices(), splits.begin());
  1971. }
  1972. // sum and normalize the splits to get the split points
  1973. float split_sum = 0.0f;
  1974. for (size_t i = 0; i < n_devices(); ++i) {
  1975. split_sum += splits[i];
  1976. splits[i] = split_sum;
  1977. }
  1978. for (size_t i = 0; i < n_devices(); ++i) {
  1979. splits[i] /= split_sum;
  1980. }
  1981. ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
  1982. if (cpu_dev == nullptr) {
  1983. throw std::runtime_error(format("%s: no CPU backend found", __func__));
  1984. }
  1985. const int i_gpu_start = std::max((int) hparams.n_layer - n_gpu_layers, (int) 0);
  1986. const int act_gpu_layers = devices.empty() ? 0 : std::min(n_gpu_layers, (int)n_layer + 1);
  1987. auto get_layer_buft_list = [&](int il) -> llama_model::impl::layer_dev {
  1988. const bool is_swa = il < (int) hparams.n_layer && hparams.is_swa(il);
  1989. if (il < i_gpu_start || (il - i_gpu_start) >= act_gpu_layers) {
  1990. LLAMA_LOG_DEBUG("load_tensors: layer %3d assigned to device %s, is_swa = %d\n", il, ggml_backend_dev_name(cpu_dev), is_swa);
  1991. return {cpu_dev, &pimpl->cpu_buft_list};
  1992. }
  1993. const int layer_gpu = std::upper_bound(splits.begin(), splits.begin() + n_devices(), float(il - i_gpu_start)/act_gpu_layers) - splits.begin();
  1994. auto * dev = devices.at(layer_gpu);
  1995. LLAMA_LOG_DEBUG("load_tensors: layer %3d assigned to device %s, is_swa = %d\n", il, ggml_backend_dev_name(dev), is_swa);
  1996. return {dev, &pimpl->gpu_buft_list.at(dev)};
  1997. };
  1998. // assign the input layer
  1999. // there is very little benefit to offloading the input layer, so always keep it on the CPU
  2000. pimpl->dev_input = { cpu_dev, &pimpl->cpu_buft_list };
  2001. // assign the repeating layers to the devices according to the splits
  2002. pimpl->dev_layer.resize(n_layer);
  2003. for (int il = 0; il < n_layer; ++il) {
  2004. pimpl->dev_layer[il] = get_layer_buft_list(il);
  2005. }
  2006. // assign the output layer
  2007. pimpl->dev_output = get_layer_buft_list(n_layer);
  2008. // one ggml context per buffer type
  2009. int max_n_tensors = ml.n_tensors;
  2010. max_n_tensors += 1; // duplicated output tensor
  2011. max_n_tensors += n_layer*2; // duplicated rope freq tensors
  2012. const size_t ctx_size = ggml_tensor_overhead()*max_n_tensors;
  2013. std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
  2014. auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * {
  2015. auto it = ctx_map.find(buft);
  2016. if (it == ctx_map.end()) {
  2017. ggml_init_params params = {
  2018. /*.mem_size =*/ ctx_size,
  2019. /*.mem_buffer =*/ NULL,
  2020. /*.no_alloc =*/ true,
  2021. };
  2022. ggml_context * ctx = ggml_init(params);
  2023. if (!ctx) {
  2024. throw std::runtime_error(format("failed to create ggml context"));
  2025. }
  2026. ctx_map[buft] = ctx;
  2027. pimpl->ctxs.emplace_back(ctx);
  2028. return ctx;
  2029. }
  2030. return it->second;
  2031. };
  2032. const auto TENSOR_DUPLICATED = llama_model_loader::TENSOR_DUPLICATED;
  2033. const auto TENSOR_NOT_REQUIRED = llama_model_loader::TENSOR_NOT_REQUIRED;
  2034. const auto TENSOR_SKIP = llama_model_loader::TENSOR_SKIP;
  2035. // create tensors for the weights
  2036. {
  2037. // note: cast to int64_t since we will use these for the tensor dimensions
  2038. const int64_t n_head = hparams.n_head();
  2039. const int64_t n_head_kv = hparams.n_head_kv();
  2040. const int64_t n_embd = hparams.n_embd;
  2041. const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
  2042. const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();
  2043. const int64_t n_embd_head_k = hparams.n_embd_head_k;
  2044. const int64_t n_embd_head_v = hparams.n_embd_head_v;
  2045. const int64_t n_ff = hparams.n_ff();
  2046. const int64_t n_embd_gqa = n_embd_v_gqa;
  2047. const int64_t n_vocab = vocab.n_tokens();
  2048. const int64_t n_token_types = vocab.n_token_types();
  2049. const int64_t n_rot = hparams.n_rot;
  2050. const int64_t n_expert = hparams.n_expert;
  2051. const int64_t n_expert_used = hparams.n_expert_used;
  2052. const int64_t n_ctx_train = hparams.n_ctx_train;
  2053. if (n_expert > 0 && hparams.n_expert_used == 0) {
  2054. throw std::runtime_error("model has expert layers but no expert layers are used");
  2055. }
  2056. int n_moved_tensors = 0;
  2057. ggml_tensor * first_moved_tensor = nullptr;
  2058. ggml_backend_buffer_type_t first_moved_from_buft = nullptr;
  2059. ggml_backend_buffer_type_t first_moved_to_buft = nullptr;
  2060. auto create_tensor = [&](const LLM_TN_IMPL & tn, const std::initializer_list<int64_t> & ne, int flags) -> ggml_tensor * {
  2061. ggml_tensor * t_meta = ml.get_tensor_meta(tn.str().c_str());
  2062. if (!t_meta) {
  2063. if (flags & TENSOR_NOT_REQUIRED) {
  2064. return nullptr;
  2065. }
  2066. throw std::runtime_error(format("missing tensor '%s'", tn.str().c_str()));
  2067. }
  2068. // some models use the token embedding tensor as the output, but since these are used in different layers and with different ops
  2069. // the tensor is duplicated
  2070. // to handle this, we check if the tensor is duplicated, and if so, we assume that it is being loaded as the output tensor
  2071. llm_tensor tn_tensor = tn.tensor;
  2072. if (tn.tensor == LLM_TENSOR_TOKEN_EMBD && flags & TENSOR_DUPLICATED) {
  2073. tn_tensor = LLM_TENSOR_OUTPUT;
  2074. }
  2075. llm_tensor_info info;
  2076. try {
  2077. info = llm_tensor_info_for(tn_tensor);
  2078. } catch (const std::out_of_range & e) {
  2079. throw std::runtime_error(format("missing tensor info mapping for %s", tn.str().c_str()));
  2080. }
  2081. // skip unused tensors
  2082. if (info.op == GGML_OP_NONE || flags & TENSOR_SKIP) {
  2083. const size_t nbytes = ggml_nbytes(t_meta);
  2084. LLAMA_LOG_WARN("model has unused tensor %s (size = %zu bytes) -- ignoring\n", tn.str().c_str(), nbytes);
  2085. ml.size_data -= nbytes;
  2086. ml.n_created++;
  2087. return nullptr;
  2088. }
  2089. // tensors with "bias" suffix are always used with GGML_OP_ADD or GGML_OP_ADD_ID
  2090. ggml_op op;
  2091. bool bias = tn.suffix != nullptr && strcmp(tn.suffix, "bias") == 0;
  2092. if (bias) {
  2093. if (info.op == GGML_OP_MUL_MAT_ID) {
  2094. op = GGML_OP_ADD_ID;
  2095. } else {
  2096. op = GGML_OP_ADD;
  2097. }
  2098. } else {
  2099. op = info.op;
  2100. }
  2101. // sanity checks
  2102. if (info.layer == LLM_TENSOR_LAYER_INPUT || info.layer == LLM_TENSOR_LAYER_OUTPUT) {
  2103. if (tn.bid != -1) {
  2104. GGML_ABORT("input/output layer tensor %s used with a layer number", tn.str().c_str());
  2105. }
  2106. } else {
  2107. if (tn.bid == -1) {
  2108. GGML_ABORT("repeating layer tensor %s used without a layer number", tn.str().c_str());
  2109. }
  2110. }
  2111. // select the buffer type for this tensor
  2112. buft_list_t * buft_list;
  2113. switch (info.layer) {
  2114. case LLM_TENSOR_LAYER_INPUT:
  2115. buft_list = pimpl->dev_input.buft_list;
  2116. break;
  2117. case LLM_TENSOR_LAYER_OUTPUT:
  2118. buft_list = pimpl->dev_output.buft_list;
  2119. break;
  2120. case LLM_TENSOR_LAYER_REPEATING:
  2121. buft_list = pimpl->dev_layer.at(tn.bid).buft_list;
  2122. break;
  2123. default:
  2124. GGML_ABORT("invalid layer %d for tensor %s", info.layer, tn.str().c_str());
  2125. }
  2126. ggml_backend_buffer_type_t buft = nullptr;
  2127. // check overrides
  2128. if (ml.tensor_buft_overrides) {
  2129. std::string tensor_name = tn.str();
  2130. for (const auto * overrides = ml.tensor_buft_overrides; overrides->pattern != nullptr; ++overrides) {
  2131. std::regex pattern(overrides->pattern);
  2132. if (std::regex_search(tensor_name, pattern)) {
  2133. if (overrides->buft == ggml_backend_cpu_buffer_type()) {
  2134. // when overriding to a CPU buffer, consider the extra buffer types
  2135. buft = select_weight_buft(hparams, t_meta, op, pimpl->cpu_buft_list);
  2136. } else {
  2137. buft = overrides->buft;
  2138. }
  2139. LLAMA_LOG_DEBUG("tensor %s (%zu MiB %s) buffer type overridden to %s\n",
  2140. tensor_name.c_str(),
  2141. ggml_nbytes(t_meta) / 1024 / 1024, ggml_type_name(t_meta->type),
  2142. ggml_backend_buft_name(buft));
  2143. break;
  2144. }
  2145. }
  2146. }
  2147. if (!buft) {
  2148. buft = select_weight_buft(hparams, t_meta, op, *buft_list);
  2149. if (!buft) {
  2150. throw std::runtime_error(format("failed to find a compatible buffer type for tensor %s", tn.str().c_str()));
  2151. }
  2152. }
  2153. // avoid using a host buffer when using mmap
  2154. auto * buft_dev = ggml_backend_buft_get_device(buft);
  2155. if (ml.use_mmap && buft_dev && buft == ggml_backend_dev_host_buffer_type(buft_dev)) {
  2156. auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
  2157. if (!cpu_dev) {
  2158. throw std::runtime_error("no CPU backend found");
  2159. }
  2160. buft = ggml_backend_dev_buffer_type(cpu_dev);
  2161. }
  2162. if (buft != buft_list->front().second) {
  2163. n_moved_tensors++;
  2164. if (!first_moved_tensor) {
  2165. first_moved_tensor = t_meta;
  2166. first_moved_from_buft = buft_list->front().second;
  2167. first_moved_to_buft = buft;
  2168. }
  2169. }
  2170. ggml_context * ctx = ctx_for_buft(buft);
  2171. // if duplicated, check if the original tensor was allocated in the same buffer type context and avoid creating a new one
  2172. if (flags & TENSOR_DUPLICATED) {
  2173. ggml_tensor * t = ggml_get_tensor(ctx, tn.str().c_str());
  2174. if (t) {
  2175. return t;
  2176. }
  2177. }
  2178. return ml.create_tensor(ctx, tn, ne, flags);
  2179. };
  2180. layers.resize(n_layer);
  2181. // TODO: move to a separate function
  2182. const auto tn = LLM_TN(arch);
  2183. switch (arch) {
  2184. case LLM_ARCH_LLAMA:
  2185. case LLM_ARCH_REFACT:
  2186. case LLM_ARCH_MINICPM:
  2187. case LLM_ARCH_GRANITE:
  2188. case LLM_ARCH_GRANITE_MOE:
  2189. {
  2190. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2191. // output
  2192. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2193. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  2194. // if output is NULL, init from the input tok embed
  2195. if (output == NULL) {
  2196. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  2197. }
  2198. for (int i = 0; i < n_layer; ++i) {
  2199. auto & layer = layers[i];
  2200. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2201. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
  2202. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  2203. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
  2204. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
  2205. // optional bias tensors
  2206. layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2207. layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
  2208. layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
  2209. layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2210. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  2211. if (hparams.rope_scaling_type_train == LLAMA_ROPE_SCALING_TYPE_LONGROPE) {
  2212. layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
  2213. layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
  2214. }
  2215. else {
  2216. layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
  2217. }
  2218. if (n_expert == 0) {
  2219. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  2220. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  2221. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  2222. // optional MLP bias
  2223. layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
  2224. layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2225. layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
  2226. } else {
  2227. layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
  2228. layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, TENSOR_NOT_REQUIRED);
  2229. layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert}, 0);
  2230. layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
  2231. // For Granite MoE Shared
  2232. if (hparams.n_ff_shexp > 0) {
  2233. layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, 0);
  2234. layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, 0);
  2235. layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {hparams.n_ff_shexp, n_embd}, 0);
  2236. }
  2237. }
  2238. }
  2239. } break;
  2240. case LLM_ARCH_QWEN3NEXT:
  2241. {
  2242. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
  2243. // output
  2244. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
  2245. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED);
  2246. // if output is NULL, init from the input tok embed
  2247. if (output == NULL) {
  2248. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, TENSOR_DUPLICATED);
  2249. }
  2250. const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
  2251. // Calculate dimensions from hyperparameters
  2252. const int64_t head_k_dim = hparams.ssm_d_state;
  2253. const int64_t head_v_dim = hparams.ssm_d_state;
  2254. const int64_t n_k_heads = hparams.ssm_n_group;
  2255. const int64_t n_v_heads = hparams.ssm_dt_rank;
  2256. const int64_t key_dim = head_k_dim * n_k_heads;
  2257. const int64_t value_dim = head_v_dim * n_v_heads;
  2258. const int64_t conv_dim = key_dim * 2 + value_dim;
  2259. // Calculate projection sizes
  2260. const int64_t qkvz_projection_size = key_dim * 2 + value_dim * 2;
  2261. const int64_t ba_projection_size = n_v_heads * 2;
  2262. for (int i = 0; i < n_layer; ++i) {
  2263. auto & layer = layers[i];
  2264. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);
  2265. layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), { n_embd }, 0);
  2266. if (!hparams.is_recurrent(i)) {
  2267. // Attention layers
  2268. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd_head_k * n_head }, 0);
  2269. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_k_gqa }, 0);
  2270. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_v_gqa }, 0);
  2271. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, 0);
  2272. // Q/K normalization for attention layers
  2273. layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), { n_embd_head_k }, 0);
  2274. layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), { n_embd_head_k }, 0);
  2275. // attn gate
  2276. layer.wq_gate = create_tensor(tn(LLM_TENSOR_ATTN_GATE, "weight", i), { n_embd, n_embd_head_k * n_head }, 0);
  2277. } else {
  2278. // Linear attention (gated delta net) specific tensors
  2279. // Create tensors with calculated dimensions
  2280. layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), { n_embd, qkvz_projection_size }, 0);
  2281. layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), { hparams.ssm_d_conv, conv_dim }, 0);
  2282. layer.ssm_dt = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), { hparams.ssm_dt_rank }, 0);
  2283. layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), { hparams.ssm_dt_rank }, 0);
  2284. layer.ssm_beta_alpha = create_tensor(tn(LLM_TENSOR_SSM_BETA_ALPHA, "weight", i), { n_embd, ba_projection_size }, 0);
  2285. layer.ssm_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", i), { head_v_dim }, 0);
  2286. layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), { value_dim, n_embd }, 0);
  2287. }
  2288. layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), { n_embd, n_expert }, 0);
  2289. layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, 0);
  2290. layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff_exp, n_embd, n_expert }, 0);
  2291. layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, 0);
  2292. // Shared experts
  2293. layer.ffn_gate_inp_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP_SHEXP, "weight", i), { n_embd }, 0);
  2294. layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), { n_embd, hparams.n_ff_shexp }, 0);
  2295. layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), { n_embd, hparams.n_ff_shexp }, 0);
  2296. layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { hparams.n_ff_shexp, n_embd }, 0);
  2297. }
  2298. }
  2299. break;
  2300. case LLM_ARCH_LLADA:
  2301. {
  2302. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
  2303. // output
  2304. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
  2305. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED);
  2306. // if output is NULL, init from the input tok embed
  2307. if (output == NULL) {
  2308. output =
  2309. create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, TENSOR_DUPLICATED);
  2310. }
  2311. for (int i = 0; i < n_layer; ++i) {
  2312. auto & layer = layers[i];
  2313. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);
  2314. // Use separate Q, K, V projections without bias, matching LLaDALlamaBlock
  2315. layer.wq =
  2316. create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd_head_k * n_head }, 0);
  2317. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_k_gqa }, 0);
  2318. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_v_gqa }, 0);
  2319. // No bias for QKV projections as per config: include_bias=false, include_qkv_bias=false
  2320. layer.wo =
  2321. create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, 0);
  2322. layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), { n_embd }, TENSOR_NOT_REQUIRED);
  2323. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd }, 0);
  2324. layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), { n_rot / 2 },
  2325. TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
  2326. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), { n_embd, n_ff }, 0);
  2327. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd }, 0);
  2328. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, n_ff }, 0);
  2329. // optional MLP bias
  2330. layer.ffn_gate_b =
  2331. create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), { n_ff }, TENSOR_NOT_REQUIRED);
  2332. layer.ffn_down_b =
  2333. create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), { n_embd }, TENSOR_NOT_REQUIRED);
  2334. layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), { n_ff }, TENSOR_NOT_REQUIRED);
  2335. }
  2336. }
  2337. break;
  2338. case LLM_ARCH_LLADA_MOE:
  2339. {
  2340. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2341. // output
  2342. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2343. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  2344. GGML_ASSERT(n_expert > 0 && "n_expert must be > 0 for llada-moe");
  2345. GGML_ASSERT(n_expert_used > 0 && "n_expert_used must be > 0 for llada-moe");
  2346. for (int i = 0; i < n_layer; ++i) {
  2347. auto & layer = layers[i];
  2348. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2349. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  2350. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  2351. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  2352. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  2353. layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
  2354. layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
  2355. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  2356. layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
  2357. const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
  2358. layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
  2359. layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
  2360. layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
  2361. }
  2362. } break;
  2363. case LLM_ARCH_LLAMA4:
  2364. {
  2365. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2366. // output
  2367. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2368. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  2369. // if output is NULL, init from the input tok embed
  2370. if (output == NULL) {
  2371. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  2372. }
  2373. for (int i = 0; i < n_layer; ++i) {
  2374. bool is_moe_layer = hparams.n_moe_layer_step > 0 && (i + 1) % hparams.n_moe_layer_step == 0;
  2375. auto & layer = layers[i];
  2376. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2377. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
  2378. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  2379. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
  2380. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
  2381. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  2382. layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
  2383. if (is_moe_layer) {
  2384. int n_ff_exp = hparams.n_ff_exp;
  2385. layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
  2386. layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, 0);
  2387. layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff_exp, n_embd, n_expert}, 0);
  2388. layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, 0);
  2389. // Shared expert
  2390. const int64_t n_ff_shexp = n_ff_exp;
  2391. layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), { n_embd, n_ff_shexp}, 0);
  2392. layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp, n_embd }, 0);
  2393. layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), { n_embd, n_ff_shexp}, 0);
  2394. } else {
  2395. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  2396. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  2397. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  2398. }
  2399. }
  2400. } break;
  2401. case LLM_ARCH_DECI:
  2402. {
  2403. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2404. // output
  2405. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2406. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  2407. // if output is NULL, init from the input tok embed
  2408. if (output == NULL) {
  2409. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  2410. }
  2411. for (int i = 0; i < n_layer; ++i) {
  2412. auto & layer = layers[i];
  2413. const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(i);
  2414. const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(i);
  2415. const int64_t n_embd_gqa = hparams.n_embd_v_gqa(i);
  2416. const int64_t n_ff = hparams.n_ff(i);
  2417. const int64_t n_head = hparams.n_head(i);
  2418. const int64_t n_head_kv = hparams.n_head_kv(i);
  2419. if (n_head_kv == 0 && n_head > 0) {
  2420. // linear attention for DeciLMCausalModel
  2421. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2422. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  2423. }
  2424. else if (n_head_kv > 0) {
  2425. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2426. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
  2427. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  2428. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
  2429. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
  2430. }
  2431. // optional bias tensors
  2432. layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2433. layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
  2434. layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
  2435. layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2436. if (n_ff > 0) {
  2437. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  2438. }
  2439. if (hparams.rope_scaling_type_train == LLAMA_ROPE_SCALING_TYPE_LONGROPE) {
  2440. layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
  2441. layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
  2442. }
  2443. else {
  2444. layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
  2445. }
  2446. if (n_ff > 0) {
  2447. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  2448. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  2449. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  2450. }
  2451. // optional MLP bias
  2452. layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
  2453. layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2454. layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
  2455. }
  2456. } break;
  2457. case LLM_ARCH_MINICPM3:
  2458. {
  2459. const int64_t n_embd_head_qk_rope = hparams.n_rot;
  2460. const int64_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
  2461. const int64_t q_lora_rank = hparams.n_lora_q;
  2462. const int64_t kv_lora_rank = hparams.n_lora_kv;
  2463. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2464. // output
  2465. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2466. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  2467. // if output is NULL, init from the input tok embed
  2468. if (output == NULL) {
  2469. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  2470. }
  2471. for (int i = 0; i < n_layer; ++i) {
  2472. auto & layer = layers[i];
  2473. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2474. layer.attn_q_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_A_NORM, "weight", i), {q_lora_rank}, 0);
  2475. layer.attn_kv_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_NORM, "weight", i), {kv_lora_rank}, 0);
  2476. layer.wq_a = create_tensor(tn(LLM_TENSOR_ATTN_Q_A, "weight", i), {n_embd, q_lora_rank}, 0);
  2477. layer.wq_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_B, "weight", i), {q_lora_rank, n_head * n_embd_head_k}, 0);
  2478. layer.wkv_a_mqa = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + (n_embd_head_qk_rope)}, 0);
  2479. layer.wkv_b = create_tensor(tn(LLM_TENSOR_ATTN_KV_B, "weight", i), {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)}, 0);
  2480. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_head * ( n_embd_head_v), n_embd}, 0);
  2481. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  2482. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  2483. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  2484. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  2485. layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), { n_embd_head_qk_rope/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
  2486. layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), { n_embd_head_qk_rope/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
  2487. }
  2488. } break;
  2489. case LLM_ARCH_GROK:
  2490. {
  2491. if (n_expert == 0) {
  2492. throw std::runtime_error("Grok model cannot have zero experts");
  2493. }
  2494. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2495. // output
  2496. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2497. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  2498. // if output is NULL, init from the input tok embed
  2499. if (output == NULL) {
  2500. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  2501. }
  2502. const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff/* / n_expert_used*/; // grok-1 n_ff_exp == n_ff
  2503. for (int i = 0; i < n_layer; ++i) {
  2504. auto & layer = layers[i];
  2505. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2506. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  2507. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  2508. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  2509. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  2510. layer.attn_out_norm = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}, 0);
  2511. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  2512. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED);
  2513. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, TENSOR_NOT_REQUIRED);
  2514. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED);
  2515. layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
  2516. layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, TENSOR_NOT_REQUIRED);
  2517. layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
  2518. layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, 0);
  2519. layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2520. if (!layer.ffn_post_norm) {
  2521. layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
  2522. }
  2523. }
  2524. } break;
  2525. case LLM_ARCH_DBRX:
  2526. {
  2527. if (n_expert == 0) {
  2528. throw std::runtime_error("DBRX model cannot have zero experts");
  2529. }
  2530. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2531. // output
  2532. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2533. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  2534. for (int i = 0; i < n_layer; ++i) {
  2535. auto & layer = layers[i];
  2536. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2537. layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
  2538. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  2539. layer.attn_out_norm = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}, 0);
  2540. layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
  2541. layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
  2542. layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff, n_embd, n_expert}, 0);
  2543. layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
  2544. }
  2545. } break;
  2546. case LLM_ARCH_BAICHUAN:
  2547. {
  2548. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2549. {
  2550. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2551. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  2552. }
  2553. for (int i = 0; i < n_layer; ++i) {
  2554. auto & layer = layers[i];
  2555. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2556. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  2557. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  2558. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  2559. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  2560. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  2561. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  2562. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  2563. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  2564. }
  2565. } break;
  2566. case LLM_ARCH_FALCON:
  2567. {
  2568. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2569. // output
  2570. {
  2571. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2572. output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
  2573. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  2574. if (!output) {
  2575. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); // needs to be on GPU
  2576. }
  2577. }
  2578. for (int i = 0; i < n_layer; ++i) {
  2579. auto & layer = layers[i];
  2580. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2581. layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
  2582. layer.attn_norm_2 = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2583. layer.attn_norm_2_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2584. layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
  2585. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  2586. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  2587. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  2588. }
  2589. } break;
  2590. case LLM_ARCH_STARCODER:
  2591. {
  2592. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2593. pos_embd = create_tensor(tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, n_ctx_train}, 0);
  2594. // output
  2595. {
  2596. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2597. output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
  2598. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  2599. if (!output) {
  2600. // needs to be on GPU
  2601. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  2602. }
  2603. }
  2604. for (int i = 0; i < n_layer; ++i) {
  2605. auto & layer = layers[i];
  2606. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2607. layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
  2608. layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
  2609. layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
  2610. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  2611. layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
  2612. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  2613. layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
  2614. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
  2615. layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
  2616. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  2617. layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
  2618. }
  2619. } break;
  2620. case LLM_ARCH_BERT:
  2621. case LLM_ARCH_NOMIC_BERT:
  2622. case LLM_ARCH_NOMIC_BERT_MOE:
  2623. case LLM_ARCH_JINA_BERT_V3:
  2624. {
  2625. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2626. type_embd = create_tensor(tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_token_types}, TENSOR_NOT_REQUIRED);
  2627. if (arch == LLM_ARCH_BERT) {
  2628. pos_embd = create_tensor(tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, n_ctx_train}, 0);
  2629. cls = create_tensor(tn(LLM_TENSOR_CLS, "weight"), {n_embd, n_embd}, TENSOR_NOT_REQUIRED);
  2630. cls_b = create_tensor(tn(LLM_TENSOR_CLS, "bias"), {n_embd}, TENSOR_NOT_REQUIRED);
  2631. cls_out = create_tensor(tn(LLM_TENSOR_CLS_OUT, "weight"), {n_embd, hparams.n_cls_out}, TENSOR_NOT_REQUIRED);
  2632. cls_out_b = create_tensor(tn(LLM_TENSOR_CLS_OUT, "bias"), {hparams.n_cls_out}, TENSOR_NOT_REQUIRED);
  2633. }
  2634. tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0);
  2635. tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}, 0);
  2636. for (int i = 0; i < n_layer; ++i) {
  2637. auto & layer = layers[i];
  2638. layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
  2639. layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
  2640. if (!layer.wqkv) {
  2641. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  2642. layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, 0);
  2643. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  2644. layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, 0);
  2645. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  2646. layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, 0);
  2647. }
  2648. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  2649. layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2650. layer.attn_out_norm = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}, 0);
  2651. layer.attn_out_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "bias", i), {n_embd}, 0);
  2652. if (hparams.moe_every_n_layers > 0 && i % hparams.moe_every_n_layers == 1) {
  2653. layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff, n_expert}, 0);
  2654. layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert}, 0);
  2655. layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
  2656. } else {
  2657. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  2658. layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
  2659. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
  2660. layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2661. if (arch == LLM_ARCH_NOMIC_BERT) {
  2662. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  2663. }
  2664. }
  2665. layer.layer_out_norm = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd}, 0);
  2666. layer.layer_out_norm_b = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "bias", i), {n_embd}, 0);
  2667. }
  2668. } break;
  2669. case LLM_ARCH_NEO_BERT:
  2670. {
  2671. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2672. cls = create_tensor(tn(LLM_TENSOR_CLS, "weight"), {n_embd, n_embd}, TENSOR_NOT_REQUIRED);
  2673. cls_b = create_tensor(tn(LLM_TENSOR_CLS, "bias"), {n_embd}, TENSOR_NOT_REQUIRED);
  2674. cls_out = create_tensor(tn(LLM_TENSOR_CLS_OUT, "weight"), {n_embd, hparams.n_cls_out}, TENSOR_NOT_REQUIRED);
  2675. cls_out_b = create_tensor(tn(LLM_TENSOR_CLS_OUT, "bias"), {hparams.n_cls_out}, TENSOR_NOT_REQUIRED);
  2676. output_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2677. for (int i = 0; i < n_layer; ++i) {
  2678. auto & layer = layers[i];
  2679. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2680. layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
  2681. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  2682. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  2683. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff*2}, 0);
  2684. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
  2685. }
  2686. } break;
  2687. case LLM_ARCH_JINA_BERT_V2:
  2688. {
  2689. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); // word_embeddings
  2690. type_embd = create_tensor(tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_token_types}, 0); // token_type_embeddings
  2691. tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0); // LayerNorm
  2692. tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}, 0); //LayerNorm bias
  2693. cls = create_tensor(tn(LLM_TENSOR_CLS, "weight"), {n_embd, 1}, TENSOR_NOT_REQUIRED);
  2694. cls_b = create_tensor(tn(LLM_TENSOR_CLS, "bias"), {1}, TENSOR_NOT_REQUIRED);
  2695. for (int i = 0; i < n_layer; ++i) {
  2696. auto & layer = layers[i]; // JinaBertLayer
  2697. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  2698. layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, 0);
  2699. layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2700. layer.attn_q_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2701. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  2702. layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, 0);
  2703. layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2704. layer.attn_k_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2705. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  2706. layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, 0);
  2707. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0); //output_dens
  2708. layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0); //output_dens
  2709. layer.attn_out_norm = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}, 0); //output_norm
  2710. layer.attn_out_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "bias", i), {n_embd}, 0);
  2711. layer.attn_norm_2 = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2712. layer.attn_norm_2_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2713. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED);
  2714. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, layer.ffn_gate ? n_ff : n_ff * 2}, 0);
  2715. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
  2716. layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
  2717. layer.layer_out_norm = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd}, 0);
  2718. layer.layer_out_norm_b = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "bias", i), {n_embd}, 0);
  2719. }
  2720. } break;
  2721. case LLM_ARCH_BLOOM:
  2722. {
  2723. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2724. tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0);
  2725. tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}, 0);
  2726. // output
  2727. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2728. output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
  2729. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  2730. // if output is NULL, init from the input tok embed
  2731. if (output == NULL) {
  2732. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  2733. }
  2734. for (int i = 0; i < n_layer; ++i) {
  2735. auto & layer = layers[i];
  2736. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2737. layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
  2738. layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
  2739. layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
  2740. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  2741. layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
  2742. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  2743. layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
  2744. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
  2745. layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
  2746. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  2747. layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
  2748. }
  2749. } break;
  2750. case LLM_ARCH_MPT:
  2751. {
  2752. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2753. pos_embd = create_tensor(tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, n_ctx_train}, TENSOR_NOT_REQUIRED);
  2754. // output
  2755. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2756. output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, TENSOR_NOT_REQUIRED);
  2757. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  2758. if (!output) {
  2759. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); // needs to be on GPU
  2760. }
  2761. for (int i = 0; i < n_layer; ++i) {
  2762. auto & layer = layers[i];
  2763. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2764. layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2765. layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
  2766. layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
  2767. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  2768. layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2769. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  2770. layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2771. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
  2772. layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2773. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  2774. layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
  2775. layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2776. layer.attn_q_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2777. layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2778. layer.attn_k_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2779. // AWQ ScaleActivation layer
  2780. layer.ffn_act = create_tensor(tn(LLM_TENSOR_FFN_ACT, "scales", i), {n_ff}, TENSOR_NOT_REQUIRED);
  2781. }
  2782. } break;
  2783. case LLM_ARCH_STABLELM:
  2784. {
  2785. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2786. // output
  2787. output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
  2788. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2789. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  2790. for (int i = 0; i < n_layer; ++i) {
  2791. auto & layer = layers[i];
  2792. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2793. layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
  2794. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  2795. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  2796. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  2797. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  2798. // optional bias tensors, present in Stable LM 2 1.6B
  2799. layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2800. layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
  2801. layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
  2802. // optional q and k layernorms, present in StableLM 2 12B
  2803. layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k, n_head}, TENSOR_NOT_REQUIRED);
  2804. layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k, n_head_kv}, TENSOR_NOT_REQUIRED);
  2805. // optional FFN norm, not present in StableLM 2 12B which uses parallel residual
  2806. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2807. layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2808. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  2809. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  2810. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  2811. }
  2812. } break;
  2813. case LLM_ARCH_QWEN:
  2814. {
  2815. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2816. // output
  2817. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2818. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  2819. for (int i = 0; i < n_layer; ++i) {
  2820. auto & layer = layers[i];
  2821. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2822. layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd*3}, 0);
  2823. layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd*3}, 0);
  2824. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  2825. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  2826. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff/2}, 0);
  2827. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff/2, n_embd}, 0);
  2828. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff/2}, 0);
  2829. }
  2830. } break;
  2831. case LLM_ARCH_QWEN2:
  2832. case LLM_ARCH_QWEN2VL:
  2833. case LLM_ARCH_DREAM:
  2834. {
  2835. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2836. // output
  2837. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2838. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  2839. output_b = create_tensor(tn(LLM_TENSOR_OUTPUT, "bias"), {n_vocab}, TENSOR_NOT_REQUIRED);
  2840. // if output is NULL, init from the input tok embed
  2841. if (output == NULL) {
  2842. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  2843. }
  2844. for (int i = 0; i < n_layer; ++i) {
  2845. auto & layer = layers[i];
  2846. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2847. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  2848. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  2849. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  2850. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  2851. // optional bias tensors
  2852. layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, 0);
  2853. layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, 0);
  2854. layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, 0);
  2855. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  2856. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  2857. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  2858. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  2859. }
  2860. } break;
  2861. case LLM_ARCH_QWEN2MOE:
  2862. {
  2863. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2864. // output
  2865. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2866. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  2867. for (int i = 0; i < n_layer; ++i) {
  2868. auto & layer = layers[i];
  2869. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2870. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  2871. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  2872. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  2873. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  2874. // optional bias tensors
  2875. layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2876. layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
  2877. layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
  2878. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  2879. layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
  2880. if (n_expert == 0) {
  2881. throw std::runtime_error("n_expert must be > 0 for QWEN2MOE");
  2882. }
  2883. if (n_expert_used == 0) {
  2884. throw std::runtime_error("n_expert_used must be > 0 for QWEN2MOE");
  2885. }
  2886. // MoE branch
  2887. const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
  2888. layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
  2889. layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
  2890. layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
  2891. // Shared expert branch
  2892. const int64_t n_ff_shexp = hparams.n_ff_shexp ? hparams.n_ff_shexp : n_ff;
  2893. layer.ffn_gate_inp_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP_SHEXP, "weight", i), {n_embd}, 0);
  2894. layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), { n_embd, n_ff_shexp}, 0);
  2895. layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp, n_embd}, 0);
  2896. layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), { n_embd, n_ff_shexp}, 0);
  2897. }
  2898. } break;
  2899. case LLM_ARCH_QWEN3:
  2900. {
  2901. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2902. // output
  2903. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2904. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  2905. // if output is NULL, init from the input tok embed
  2906. if (output == NULL) {
  2907. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  2908. }
  2909. for (int i = 0; i < n_layer; ++i) {
  2910. auto & layer = layers[i];
  2911. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2912. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
  2913. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  2914. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  2915. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
  2916. layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
  2917. layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
  2918. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  2919. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  2920. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  2921. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  2922. }
  2923. } break;
  2924. case LLM_ARCH_QWEN3MOE:
  2925. {
  2926. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2927. // output
  2928. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2929. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  2930. // if output is NULL, init from the input tok embed
  2931. if (output == NULL) {
  2932. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  2933. }
  2934. for (int i = 0; i < n_layer; ++i) {
  2935. auto & layer = layers[i];
  2936. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2937. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
  2938. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  2939. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  2940. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
  2941. layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
  2942. layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
  2943. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  2944. layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
  2945. if (n_expert == 0) {
  2946. throw std::runtime_error("n_expert must be > 0 for QWEN3MOE");
  2947. }
  2948. if (n_expert_used == 0) {
  2949. throw std::runtime_error("n_expert_used must be > 0 for QWEN3MOE");
  2950. }
  2951. // MoE branch
  2952. const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
  2953. layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
  2954. layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
  2955. layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
  2956. }
  2957. } break;
  2958. case LLM_ARCH_PHI2:
  2959. {
  2960. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2961. // output
  2962. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2963. output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
  2964. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  2965. output_b = create_tensor(tn(LLM_TENSOR_OUTPUT, "bias"), {n_vocab}, 0);
  2966. for (int i = 0; i < n_layer; ++i) {
  2967. auto & layer = layers[i];
  2968. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2969. layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
  2970. layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
  2971. layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
  2972. if (layer.wqkv == nullptr) {
  2973. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  2974. layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, 0);
  2975. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  2976. layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, 0);
  2977. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  2978. layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, 0);
  2979. }
  2980. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  2981. layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
  2982. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
  2983. layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
  2984. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  2985. layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
  2986. }
  2987. } break;
  2988. case LLM_ARCH_PHI3:
  2989. {
  2990. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
  2991. // output
  2992. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
  2993. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  2994. // if output is NULL, init from the input tok embed
  2995. if (output == NULL) {
  2996. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  2997. }
  2998. for (int i = 0; i < n_layer; ++i) {
  2999. auto & layer = layers[i];
  3000. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);
  3001. layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), { n_embd, n_embd + 2 * n_embd_gqa }, TENSOR_NOT_REQUIRED);
  3002. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd, n_embd }, 0);
  3003. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd }, 0);
  3004. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd }, 0);
  3005. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, 2 * n_ff }, 0);
  3006. layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), { n_rot/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
  3007. layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), { n_rot/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
  3008. }
  3009. } break;
  3010. case LLM_ARCH_PHIMOE:
  3011. {
  3012. const int64_t n_embd_head = n_embd / n_head;
  3013. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
  3014. // output
  3015. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
  3016. output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
  3017. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab }, 0);
  3018. output_b = create_tensor(tn(LLM_TENSOR_OUTPUT, "bias"), { n_vocab }, 0);
  3019. for (int i = 0; i < n_layer; ++i) {
  3020. auto & layer = layers[i];
  3021. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);
  3022. layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), { n_embd }, 0);
  3023. layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), { n_embd, n_embd + 2 * n_embd_gqa }, TENSOR_NOT_REQUIRED);
  3024. if (layer.wqkv == nullptr) {
  3025. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  3026. layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, 0);
  3027. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  3028. layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, 0);
  3029. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  3030. layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, 0);
  3031. }
  3032. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd, n_embd }, 0);
  3033. layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), { n_embd }, 0);
  3034. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd }, 0);
  3035. layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), { n_embd }, 0);
  3036. layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
  3037. layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
  3038. layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff, n_embd, n_expert}, 0);
  3039. layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
  3040. layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), { n_embd_head/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
  3041. layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), { n_embd_head/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
  3042. }
  3043. } break;
  3044. case LLM_ARCH_PLAMO:
  3045. {
  3046. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  3047. // output
  3048. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  3049. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  3050. for (int i = 0; i < n_layer; ++i) {
  3051. auto & layer = layers[i];
  3052. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  3053. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  3054. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  3055. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  3056. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  3057. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  3058. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  3059. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  3060. }
  3061. } break;
  3062. case LLM_ARCH_PLAMO2:
  3063. {
  3064. const uint32_t d_conv = hparams.ssm_d_conv;
  3065. const uint32_t d_state = hparams.ssm_d_state;
  3066. const uint32_t num_heads = hparams.ssm_dt_rank;
  3067. const uint32_t intermediate_size = hparams.ssm_d_inner;
  3068. const uint32_t head_dim = intermediate_size / num_heads;
  3069. const uint32_t qk_dim = head_dim;
  3070. const uint32_t v_dim = head_dim;
  3071. const int64_t num_attention_heads = hparams.n_head();
  3072. const int64_t q_num_heads = num_attention_heads;
  3073. const int64_t dt_dim = std::max(64, int(hparams.n_embd / 16));
  3074. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  3075. // output
  3076. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  3077. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  3078. // if output is NULL, init from the input tok embed
  3079. if (output == NULL) {
  3080. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  3081. }
  3082. for (int i = 0; i < n_layer; ++i) {
  3083. auto & layer = layers[i];
  3084. bool is_mamba_layer = hparams.is_recurrent(i);
  3085. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  3086. if (is_mamba_layer) {
  3087. layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), {n_embd, 2 * intermediate_size}, 0);
  3088. layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {d_conv, intermediate_size}, 0);
  3089. layer.ssm_x = create_tensor(tn(LLM_TENSOR_SSM_X, "weight", i), {intermediate_size, dt_dim + 2*d_state}, 0);
  3090. layer.ssm_dt = create_tensor(tn(LLM_TENSOR_SSM_DT, "weight", i), {dt_dim, num_heads}, 0);
  3091. layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {num_heads}, 0);
  3092. layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {num_heads}, 0);
  3093. layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {num_heads}, 0);
  3094. layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {intermediate_size, n_embd}, 0);
  3095. layer.ssm_dt_norm = create_tensor(tn(LLM_TENSOR_SSM_DT_NORM, i), {dt_dim}, 0);
  3096. layer.ssm_b_norm = create_tensor(tn(LLM_TENSOR_SSM_B_NORM, i), {d_state}, 0);
  3097. layer.ssm_c_norm = create_tensor(tn(LLM_TENSOR_SSM_C_NORM, i), {d_state}, 0);
  3098. } else {
  3099. const int64_t num_key_value_heads = hparams.n_head_kv(i);
  3100. const int64_t k_num_heads = num_key_value_heads;
  3101. const int64_t v_num_heads = num_key_value_heads;
  3102. const int64_t q_proj_dim = q_num_heads * qk_dim;
  3103. const int64_t k_proj_dim = k_num_heads * qk_dim;
  3104. const int64_t v_proj_dim = v_num_heads * v_dim;
  3105. layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, q_proj_dim + k_proj_dim + v_proj_dim}, 0);
  3106. layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {head_dim, num_attention_heads}, 0);
  3107. layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {head_dim, k_num_heads}, 0);
  3108. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {q_num_heads * v_dim, n_embd}, 0);
  3109. }
  3110. // All layers have post-attention norm, FFN norm, and FFN tensors
  3111. layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, i), {n_embd}, 0);
  3112. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  3113. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
  3114. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff * 2}, 0);
  3115. layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, i), {n_embd}, 0);
  3116. }
  3117. } break;
  3118. case LLM_ARCH_GPT2:
  3119. {
  3120. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  3121. pos_embd = create_tensor(tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, n_ctx_train}, 0);
  3122. // output
  3123. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  3124. output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
  3125. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  3126. // if output is NULL, init from the input tok embed
  3127. if (output == NULL) {
  3128. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  3129. }
  3130. for (int i = 0; i < n_layer; ++i) {
  3131. auto & layer = layers[i];
  3132. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  3133. layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
  3134. layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
  3135. layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
  3136. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  3137. layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
  3138. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  3139. layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
  3140. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
  3141. layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
  3142. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  3143. layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
  3144. }
  3145. } break;
  3146. case LLM_ARCH_CODESHELL:
  3147. {
  3148. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  3149. // if tok embd is NULL, init from output
  3150. if (tok_embd == NULL) {
  3151. tok_embd = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  3152. }
  3153. // output
  3154. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  3155. output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
  3156. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  3157. for (int i = 0; i < n_layer; ++i) {
  3158. auto & layer = layers[i];
  3159. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  3160. layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
  3161. layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
  3162. layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
  3163. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  3164. layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
  3165. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  3166. layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
  3167. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
  3168. layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
  3169. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  3170. layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
  3171. }
  3172. } break;
  3173. case LLM_ARCH_ORION:
  3174. {
  3175. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  3176. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  3177. output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
  3178. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  3179. for (int i = 0; i < n_layer; ++i) {
  3180. auto & layer = layers[i];
  3181. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  3182. layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
  3183. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  3184. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  3185. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  3186. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  3187. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  3188. layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
  3189. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  3190. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  3191. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  3192. }
  3193. } break;
  3194. case LLM_ARCH_INTERNLM2:
  3195. {
  3196. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  3197. // output
  3198. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  3199. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  3200. for (int i = 0; i < n_layer; ++i) {
  3201. auto & layer = layers[i];
  3202. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  3203. // layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
  3204. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  3205. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  3206. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  3207. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  3208. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  3209. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  3210. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  3211. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  3212. }
  3213. } break;
  3214. case LLM_ARCH_GEMMA:
  3215. {
  3216. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  3217. // output
  3218. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  3219. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); // same as tok_embd, duplicated to allow offloading
  3220. for (int i = 0; i < n_layer; ++i) {
  3221. auto & layer = layers[i];
  3222. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  3223. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
  3224. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  3225. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
  3226. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
  3227. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  3228. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  3229. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  3230. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  3231. }
  3232. } break;
  3233. case LLM_ARCH_GEMMA2:
  3234. {
  3235. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  3236. // output
  3237. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  3238. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); // same as tok_embd, duplicated to allow offloading
  3239. for (int i = 0; i < n_layer; ++i) {
  3240. auto & layer = layers[i];
  3241. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  3242. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
  3243. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  3244. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
  3245. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
  3246. layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
  3247. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  3248. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  3249. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  3250. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  3251. layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
  3252. }
  3253. } break;
  3254. case LLM_ARCH_GEMMA3:
  3255. case LLM_ARCH_GEMMA_EMBEDDING:
  3256. {
  3257. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  3258. // output
  3259. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  3260. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  3261. // if output is NULL, init from the input tok embed
  3262. if (output == NULL) {
  3263. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  3264. }
  3265. for (int i = 0; i < n_layer; ++i) {
  3266. auto & layer = layers[i];
  3267. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  3268. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
  3269. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  3270. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
  3271. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
  3272. layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
  3273. layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
  3274. layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
  3275. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  3276. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  3277. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  3278. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  3279. layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
  3280. }
  3281. } break;
  3282. case LLM_ARCH_GEMMA3N:
  3283. {
  3284. const int64_t n_altup = hparams.n_altup;
  3285. const int64_t laurel_rank = hparams.laurel_rank;
  3286. const int64_t n_embd_altup = hparams.n_embd_altup;
  3287. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  3288. // if output is NULL, init from the input tok embed
  3289. if (output == NULL) {
  3290. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  3291. }
  3292. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  3293. tok_embd_per_layer = create_tensor(tn(LLM_TENSOR_PER_LAYER_TOKEN_EMBD, "weight"), {n_embd_altup * n_layer, n_vocab}, 0);
  3294. altup_proj = create_tensor(tn(LLM_TENSOR_ALTUP_PROJ, "weight"), {n_embd, n_embd, n_altup - 1}, 0);
  3295. altup_unembd_proj = create_tensor(tn(LLM_TENSOR_ALTUP_UNEMBD_PROJ, "weight"), {n_embd, n_embd, n_altup - 1}, 0);
  3296. per_layer_model_proj = create_tensor(tn(LLM_TENSOR_PER_LAYER_MODEL_PROJ, "weight"), {n_embd, n_embd_altup * n_layer}, 0);
  3297. per_layer_proj_norm = create_tensor(tn(LLM_TENSOR_PER_LAYER_PROJ_NORM, "weight"), {n_embd_altup}, 0);
  3298. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  3299. for (int i = 0; i < n_layer; ++i) {
  3300. auto & layer = layers[i];
  3301. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  3302. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
  3303. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  3304. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
  3305. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
  3306. layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
  3307. layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
  3308. layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
  3309. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  3310. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  3311. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  3312. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  3313. layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
  3314. // altup & laurel
  3315. layer.per_layer_inp_gate = create_tensor(tn(LLM_TENSOR_PER_LAYER_INP_GATE, "weight", i), {n_embd, n_embd_altup}, 0);
  3316. layer.per_layer_proj = create_tensor(tn(LLM_TENSOR_PER_LAYER_PROJ, "weight", i), {n_embd_altup, n_embd}, 0);
  3317. layer.per_layer_post_norm = create_tensor(tn(LLM_TENSOR_PER_LAYER_POST_NORM, "weight", i), {n_embd}, 0);
  3318. layer.altup_correct_coef = create_tensor(tn(LLM_TENSOR_ALTUP_CORRECT_COEF, "weight", i), {n_altup, n_altup}, 0);
  3319. layer.altup_correct_scale = create_tensor(tn(LLM_TENSOR_ALTUP_CORRECT_SCALE, "weight", i), {n_embd}, 0);
  3320. layer.altup_predict_coef = create_tensor(tn(LLM_TENSOR_ALTUP_PREDICT_COEF, "weight", i), {n_altup, n_altup * n_altup}, 0);
  3321. layer.altup_router = create_tensor(tn(LLM_TENSOR_ALTUP_ROUTER, "weight", i), {n_embd, n_altup}, 0);
  3322. layer.altup_router_norm = create_tensor(tn(LLM_TENSOR_ALTUP_ROUTER_NORM, "weight", i), {n_embd}, 0);
  3323. layer.laurel_l = create_tensor(tn(LLM_TENSOR_LAUREL_L, "weight", i), {n_embd, laurel_rank}, 0);
  3324. layer.laurel_r = create_tensor(tn(LLM_TENSOR_LAUREL_R, "weight", i), {laurel_rank, n_embd}, 0);
  3325. layer.laurel_post_norm = create_tensor(tn(LLM_TENSOR_LAUREL_POST_NORM, "weight", i), {n_embd}, 0);
  3326. }
  3327. } break;
  3328. case LLM_ARCH_STARCODER2:
  3329. {
  3330. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  3331. // output
  3332. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  3333. output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
  3334. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  3335. // if output is NULL, init from the input tok embed
  3336. if (output == NULL) {
  3337. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  3338. }
  3339. for (int i = 0; i < n_layer; ++i) {
  3340. auto & layer = layers[i];
  3341. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  3342. layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
  3343. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  3344. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  3345. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  3346. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  3347. // optional bias tensors
  3348. layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, 0);
  3349. layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, 0);
  3350. layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, 0);
  3351. layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
  3352. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  3353. layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
  3354. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  3355. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  3356. // optional bias tensors
  3357. layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
  3358. layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP , "bias", i), { n_ff}, 0);
  3359. }
  3360. } break;
  3361. case LLM_ARCH_MAMBA:
  3362. {
  3363. const int64_t d_conv = hparams.ssm_d_conv;
  3364. const int64_t d_inner = hparams.ssm_d_inner;
  3365. const int64_t d_state = hparams.ssm_d_state;
  3366. const int64_t dt_rank = hparams.ssm_dt_rank;
  3367. // only an expansion factor of 2 is supported for now
  3368. if (2 * n_embd != d_inner) {
  3369. throw std::runtime_error("only an expansion factor of 2 is supported for now");
  3370. }
  3371. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  3372. // output
  3373. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  3374. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  3375. // if output is NULL, init from the input tok embed, duplicated to allow offloading
  3376. if (output == NULL) {
  3377. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  3378. }
  3379. for (int i = 0; i < n_layer; ++i) {
  3380. auto & layer = layers[i];
  3381. // norm
  3382. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  3383. layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), {n_embd, 2*d_inner}, 0);
  3384. layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {d_conv, d_inner}, 0);
  3385. layer.ssm_conv1d_b = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "bias", i), {d_inner}, 0);
  3386. layer.ssm_x = create_tensor(tn(LLM_TENSOR_SSM_X, "weight", i), {d_inner, dt_rank + 2*d_state}, 0);
  3387. layer.ssm_dt = create_tensor(tn(LLM_TENSOR_SSM_DT, "weight", i), {dt_rank, d_inner}, 0);
  3388. layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {d_inner}, 0);
  3389. // no "weight" suffix for these
  3390. layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {d_state, d_inner}, 0);
  3391. layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {d_inner}, 0);
  3392. // out_proj
  3393. layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {d_inner, n_embd}, 0);
  3394. }
  3395. } break;
  3396. case LLM_ARCH_MAMBA2:
  3397. {
  3398. const int64_t d_conv = hparams.ssm_d_conv;
  3399. const int64_t d_inner = hparams.ssm_d_inner;
  3400. const int64_t d_state = hparams.ssm_d_state;
  3401. const int64_t n_head = hparams.ssm_dt_rank;
  3402. const int64_t n_group = hparams.ssm_n_group;
  3403. const int64_t d_in_proj = 2*d_inner + 2*n_group*d_state + n_head;
  3404. // only an expansion factor of 2 is supported for now
  3405. GGML_ASSERT(2 * n_embd == d_inner);
  3406. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  3407. // output
  3408. {
  3409. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  3410. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  3411. // if output is NULL, init from the input tok embed, duplicated to allow offloading
  3412. if (output == NULL) {
  3413. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  3414. }
  3415. }
  3416. for (int i = 0; i < n_layer; ++i) {
  3417. auto & layer = layers[i];
  3418. // norm
  3419. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  3420. layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), {n_embd, d_in_proj}, 0);
  3421. layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {d_conv, d_inner + 2*n_group*d_state}, 0);
  3422. layer.ssm_conv1d_b = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "bias", i), {d_inner + 2*n_group*d_state}, 0);
  3423. layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {n_head}, 0);
  3424. // no "weight" suffix for these
  3425. layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {1, n_head}, 0);
  3426. layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {1, n_head}, 0);
  3427. layer.ssm_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", i), {d_inner / n_group, n_group}, 0);
  3428. // out_proj
  3429. layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {d_inner, n_embd}, 0);
  3430. }
  3431. } break;
  3432. case LLM_ARCH_JAMBA:
  3433. {
  3434. const int64_t d_conv = hparams.ssm_d_conv;
  3435. const int64_t d_inner = hparams.ssm_d_inner;
  3436. const int64_t d_state = hparams.ssm_d_state;
  3437. const int64_t dt_rank = hparams.ssm_dt_rank;
  3438. // only an expansion factor of 2 is supported for now
  3439. GGML_ASSERT(2 * n_embd == d_inner);
  3440. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  3441. // output
  3442. {
  3443. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  3444. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  3445. // if output is NULL, init from the input tok embed, duplicated to allow offloading
  3446. if (output == NULL) {
  3447. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  3448. }
  3449. }
  3450. for (int i = 0; i < n_layer; ++i) {
  3451. const int64_t n_head_kv = hparams.n_head_kv(i);
  3452. const int64_t n_embd_gqa = hparams.n_embd_v_gqa(i);
  3453. auto & layer = layers[i];
  3454. // norm
  3455. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  3456. if (n_head_kv == 0) {
  3457. // Mamba layer
  3458. layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), {n_embd, 2*d_inner}, 0);
  3459. layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {d_conv, d_inner}, 0);
  3460. layer.ssm_conv1d_b = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "bias", i), {d_inner}, 0);
  3461. layer.ssm_x = create_tensor(tn(LLM_TENSOR_SSM_X, "weight", i), {d_inner, dt_rank + 2*d_state}, 0);
  3462. layer.ssm_dt_norm = create_tensor(tn(LLM_TENSOR_SSM_DT_NORM, "weight", i), {dt_rank}, 0);
  3463. layer.ssm_dt = create_tensor(tn(LLM_TENSOR_SSM_DT, "weight", i), {dt_rank, d_inner}, 0);
  3464. layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {d_inner}, 0);
  3465. layer.ssm_b_norm = create_tensor(tn(LLM_TENSOR_SSM_B_NORM, "weight", i), {d_state}, 0);
  3466. layer.ssm_c_norm = create_tensor(tn(LLM_TENSOR_SSM_C_NORM, "weight", i), {d_state}, 0);
  3467. // no "weight" suffix for these
  3468. layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {d_state, d_inner}, 0);
  3469. layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {d_inner}, 0);
  3470. // out_proj
  3471. layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {d_inner, n_embd}, 0);
  3472. } else {
  3473. // Attention layers
  3474. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  3475. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  3476. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  3477. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  3478. }
  3479. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  3480. layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, TENSOR_NOT_REQUIRED);
  3481. if (layer.ffn_gate_inp) {
  3482. // MoE
  3483. layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
  3484. layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff, n_embd, n_expert}, 0);
  3485. layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
  3486. } else {
  3487. // FFN (no MoE)
  3488. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  3489. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
  3490. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  3491. }
  3492. }
  3493. } break;
  3494. case LLM_ARCH_GRANITE_HYBRID:
  3495. {
  3496. // mamba2 Mixer SSM params
  3497. // NOTE: int64_t for tensor dimensions
  3498. const int64_t d_conv = hparams.ssm_d_conv;
  3499. const int64_t d_inner = hparams.ssm_d_inner;
  3500. const int64_t d_state = hparams.ssm_d_state;
  3501. const int64_t n_ssm_head = hparams.ssm_dt_rank;
  3502. const int64_t n_group = hparams.ssm_n_group;
  3503. const int64_t d_in_proj = 2*d_inner + 2*n_group*d_state + n_ssm_head;
  3504. // only an expansion factor of 2 is supported for now
  3505. GGML_ASSERT(2 * n_embd == d_inner);
  3506. // embeddings
  3507. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  3508. // output
  3509. {
  3510. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  3511. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  3512. // if output is NULL, init from the input tok embed, duplicated to allow offloading
  3513. if (output == NULL) {
  3514. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  3515. }
  3516. }
  3517. for (int i = 0; i < n_layer; ++i) {
  3518. auto & layer = layers[i];
  3519. // norm
  3520. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  3521. if (hparams.is_recurrent(i)) {
  3522. // ssm layers
  3523. layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), {n_embd, d_in_proj}, 0);
  3524. layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {d_conv, d_inner + 2*n_group*d_state}, 0);
  3525. layer.ssm_conv1d_b = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "bias", i), {d_inner + 2*n_group*d_state}, TENSOR_NOT_REQUIRED);
  3526. layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {n_ssm_head}, 0);
  3527. // no "weight" suffix for these
  3528. layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {1, n_ssm_head}, 0);
  3529. layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {1, n_ssm_head}, 0);
  3530. layer.ssm_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", i), {d_inner / n_group, n_group}, 0);
  3531. // out_proj
  3532. layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {d_inner, n_embd}, 0);
  3533. } else {
  3534. // attention layers (with optional bias)
  3535. const int64_t n_head_i = hparams.n_head(i);
  3536. const int64_t n_embd_k_gqa_i = hparams.n_embd_k_gqa(i);
  3537. const int64_t n_embd_v_gqa_i = hparams.n_embd_v_gqa(i);
  3538. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head_i}, 0);
  3539. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa_i}, 0);
  3540. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa_i}, 0);
  3541. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head_i, n_embd}, 0);
  3542. layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  3543. layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_k_gqa_i}, TENSOR_NOT_REQUIRED);
  3544. layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_v_gqa_i}, TENSOR_NOT_REQUIRED);
  3545. layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  3546. }
  3547. // feed forward (w/ optional biases)
  3548. if (n_expert > 0) {
  3549. // MoE FFN
  3550. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  3551. layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
  3552. layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
  3553. layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, TENSOR_NOT_REQUIRED);
  3554. layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert}, 0);
  3555. layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
  3556. // For Granite MoE Shared
  3557. if (hparams.n_ff_shexp > 0) {
  3558. layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, 0);
  3559. layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, 0);
  3560. layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {hparams.n_ff_shexp, n_embd}, 0);
  3561. }
  3562. } else {
  3563. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  3564. layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
  3565. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  3566. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  3567. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  3568. layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
  3569. layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  3570. layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
  3571. }
  3572. }
  3573. } break;
  3574. case LLM_ARCH_XVERSE:
  3575. {
  3576. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  3577. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  3578. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  3579. for (int i = 0; i < n_layer; ++i) {
  3580. auto & layer = layers[i];
  3581. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  3582. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  3583. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  3584. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  3585. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  3586. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  3587. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  3588. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  3589. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  3590. }
  3591. } break;
  3592. case LLM_ARCH_COMMAND_R:
  3593. {
  3594. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  3595. // output
  3596. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  3597. // init output from the input tok embed
  3598. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  3599. for (int i = 0; i < n_layer; ++i) {
  3600. auto & layer = layers[i];
  3601. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  3602. if (n_layer >= 64){
  3603. layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k, n_head}, 0);
  3604. layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k, n_head_kv}, 0);
  3605. }
  3606. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  3607. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  3608. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  3609. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  3610. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  3611. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  3612. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  3613. }
  3614. } break;
  3615. case LLM_ARCH_COHERE2:
  3616. {
  3617. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
  3618. // output
  3619. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
  3620. // init output from the input tok embed
  3621. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab },
  3622. TENSOR_DUPLICATED);
  3623. for (int i = 0; i < n_layer; ++i) {
  3624. auto & layer = layers[i];
  3625. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);
  3626. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd }, 0);
  3627. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_gqa }, 0);
  3628. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_gqa }, 0);
  3629. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd, n_embd }, 0);
  3630. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), { n_embd, n_ff }, 0);
  3631. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd }, 0);
  3632. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, n_ff }, 0);
  3633. }
  3634. }
  3635. break;
  3636. case LLM_ARCH_OLMO: // adapted from LLM_ARCH_LLAMA with norm params removed
  3637. {
  3638. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  3639. // output
  3640. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  3641. // if output is NULL, init from the input tok embed
  3642. if (output == NULL) {
  3643. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  3644. }
  3645. for (int i = 0; i < n_layer; ++i) {
  3646. auto & layer = layers[i];
  3647. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  3648. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  3649. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  3650. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  3651. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  3652. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  3653. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  3654. }
  3655. } break;
  3656. case LLM_ARCH_OLMO2:
  3657. {
  3658. const int64_t n_embd_head = n_embd / n_head;
  3659. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  3660. // output
  3661. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  3662. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  3663. for (int i = 0; i < n_layer; ++i) {
  3664. auto & layer = layers[i];
  3665. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  3666. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  3667. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  3668. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  3669. layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, 0);
  3670. layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_head_kv * n_embd_head}, 0);
  3671. layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
  3672. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  3673. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  3674. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  3675. layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
  3676. }
  3677. } break;
  3678. case LLM_ARCH_SEED_OSS:
  3679. {
  3680. const uint32_t head_dim = hparams.n_embd_head_k;
  3681. const int64_t n_qo_dim = n_head * head_dim;
  3682. const int64_t n_kv_dim = n_head_kv * head_dim;
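// note: Q/K/V widths are derived from the head dim here, so n_qo_dim need not equal n_embd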
  3683. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  3684. // output
  3685. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  3686. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  3687. // if output is NULL, init from the input tok embed
  3688. if (output == NULL) {
  3689. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  3690. }
  3691. for (int i = 0; i < n_layer; ++i) {
  3692. auto & layer = layers[i];
  3693. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_qo_dim}, 0);
  3694. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_kv_dim}, 0);
  3695. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_kv_dim}, 0);
  3696. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_qo_dim, n_embd}, 0);
  3697. layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_qo_dim}, TENSOR_NOT_REQUIRED);
  3698. layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_kv_dim}, TENSOR_NOT_REQUIRED);
  3699. layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_kv_dim}, TENSOR_NOT_REQUIRED);
  3700. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  3701. layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
  3702. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  3703. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  3704. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  3705. }
  3706. } break;
  3707. case LLM_ARCH_OLMOE:
  3708. {
  3709. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  3710. // output
  3711. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  3712. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  3713. for (int i = 0; i < n_layer; ++i) {
  3714. auto & layer = layers[i];
  3715. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  3716. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  3717. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  3718. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  3719. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  3720. layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, 0);
  3721. layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, 0);
  3722. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  3723. layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
  3724. if (n_expert == 0) {
  3725. throw std::runtime_error("n_expert must be > 0");
  3726. }
  3727. if (n_expert_used == 0) {
  3728. throw std::runtime_error("n_expert_used must be > 0");
  3729. }
  3730. // MoE branch
  3731. layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
  3732. layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff, n_embd, n_expert}, 0);
  3733. layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
  3734. }
  3735. } break;
  3736. case LLM_ARCH_OPENELM:
  3737. {
  3738. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  3739. // output
  3740. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  3741. // init output from the input tok embed
  3742. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  3743. for (int i = 0; i < n_layer; ++i) {
  3744. const int64_t n_head = hparams.n_head(i);
  3745. const int64_t n_head_qkv = 2*hparams.n_head_kv(i) + n_head;
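// fused QKV packs n_head query heads plus n_head_kv key heads and n_head_kv value heads, each of width n_embd_head_k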
  3746. const int64_t n_ff = hparams.n_ff(i);
  3747. auto & layer = layers[i];
  3748. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  3749. layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_head_qkv*n_embd_head_k}, 0);
  3750. layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
  3751. layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
  3752. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_head*n_embd_head_k, n_embd}, 0);
  3753. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  3754. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  3755. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
  3756. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  3757. }
  3758. } break;
  3759. case LLM_ARCH_GPTNEOX:
  3760. {
  3761. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  3762. // output
  3763. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  3764. output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
  3765. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  3766. for (int i = 0; i < n_layer; ++i) {
  3767. auto & layer = layers[i];
  3768. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  3769. layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
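// fused QKV projection: n_embd columns for Q plus n_embd_gqa each for K and V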
  3770. layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
  3771. layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
  3772. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  3773. layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
  3774. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  3775. layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
  3776. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
  3777. layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
  3778. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  3779. layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
  3780. }
  3781. } break;
  3782. case LLM_ARCH_ARCTIC:
  3783. {
  3784. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  3785. // output
  3786. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  3787. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  3788. // if output is NULL, init from the input tok embed
  3789. if (output == NULL) {
  3790. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  3791. }
  3792. for (int i = 0; i < n_layer; ++i) {
  3793. auto & layer = layers[i];
  3794. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  3795. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  3796. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  3797. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  3798. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  3799. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  3800. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_embd}, 0);
  3801. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_embd, n_embd}, 0);
  3802. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_embd}, 0);
  3803. layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
  3804. layer.ffn_norm_exps = create_tensor(tn(LLM_TENSOR_FFN_NORM_EXPS, "weight", i), {n_embd}, 0);
3805. layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);

  3806. layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert}, 0);
  3807. layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
  3808. }
  3809. } break;
  3810. case LLM_ARCH_DEEPSEEK:
  3811. {
  3812. const int64_t n_ff_exp = hparams.n_ff_exp;
  3813. const int64_t n_expert_shared = hparams.n_expert_shared;
  3814. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  3815. // output
  3816. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  3817. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  3818. for (int i = 0; i < n_layer; ++i) {
  3819. auto & layer = layers[i];
  3820. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  3821. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  3822. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  3823. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  3824. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  3825. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  3826. if (i < (int) hparams.n_layer_dense_lead) {
  3827. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  3828. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  3829. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  3830. } else {
  3831. layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
  3832. if (n_expert == 0) {
  3833. throw std::runtime_error("n_expert must be > 0");
  3834. }
  3835. if (n_expert_used == 0) {
  3836. throw std::runtime_error("n_expert_used must be > 0");
  3837. }
  3838. // MoE branch
  3839. layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
  3840. layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
  3841. layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
  3842. // Shared expert branch
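// presumably the n_expert_shared shared experts are fused into a single FFN of width n_ff_exp * n_expert_shared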
  3843. layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
  3844. layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_exp * n_expert_shared, n_embd}, 0);
  3845. layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
  3846. }
  3847. }
  3848. } break;
  3849. case LLM_ARCH_DEEPSEEK2:
  3850. {
  3851. const bool is_lite = (hparams.n_layer == 27);
  3852. const bool is_mla = (hparams.n_embd_head_k_mla != 0 && hparams.n_embd_head_v_mla != 0);
  3853. // note: these are the actual head sizes you get when treating as MHA or after "decompression" using wv_b for MLA
  3854. const int64_t n_embd_head_k_mla = is_mla ? hparams.n_embd_head_k_mla : hparams.n_embd_head_k;
  3855. const int64_t n_embd_head_v_mla = is_mla ? hparams.n_embd_head_v_mla : hparams.n_embd_head_v;
  3856. const int64_t n_embd_head_qk_rope = hparams.n_rot;
  3857. const int64_t n_embd_head_qk_nope = n_embd_head_k_mla - n_embd_head_qk_rope;
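// note: each MLA head dim splits into a RoPE part (n_rot) and a non-RoPE ("nope") part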
  3858. const int64_t q_lora_rank = hparams.n_lora_q;
  3859. const int64_t kv_lora_rank = hparams.n_lora_kv;
  3860. const int64_t n_ff_exp = hparams.n_ff_exp;
  3861. const int64_t n_expert_shared = hparams.n_expert_shared;
  3862. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  3863. // output
  3864. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  3865. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  3866. for (int i = 0; i < n_layer; ++i) {
  3867. auto & layer = layers[i];
  3868. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  3869. if (!is_lite) {
  3870. layer.attn_q_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_A_NORM, "weight", i), {q_lora_rank}, 0);
  3871. }
  3872. layer.attn_kv_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_NORM, "weight", i), {kv_lora_rank}, 0);
  3873. if (!is_lite) {
  3874. layer.wq_a = create_tensor(tn(LLM_TENSOR_ATTN_Q_A, "weight", i), {n_embd, q_lora_rank}, 0);
  3875. layer.wq_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_B, "weight", i), {q_lora_rank, n_head * n_embd_head_k_mla}, 0);
  3876. } else {
  3877. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_head * n_embd_head_k_mla}, 0);
  3878. }
  3879. layer.wkv_a_mqa = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + n_embd_head_qk_rope}, 0);
3880. // note: only old legacy GGUF files will have the unsplit wkv_b tensor
  3881. if (is_mla) {
  3882. layer.wk_b = create_tensor(tn(LLM_TENSOR_ATTN_K_B, "weight", i), {n_embd_head_qk_nope, kv_lora_rank, n_head}, 0);
  3883. layer.wv_b = create_tensor(tn(LLM_TENSOR_ATTN_V_B, "weight", i), {kv_lora_rank, n_embd_head_v_mla, n_head}, 0);
  3884. } else {
  3885. layer.wkv_b = create_tensor(tn(LLM_TENSOR_ATTN_KV_B, "weight", i), {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v_mla)}, 0);
  3886. }
  3887. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_head * n_embd_head_v_mla, n_embd}, 0);
  3888. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  3889. if (i < (int) hparams.n_layer_dense_lead) {
  3890. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  3891. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  3892. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  3893. } else {
  3894. layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
  3895. layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, TENSOR_NOT_REQUIRED);
  3896. if (n_expert == 0) {
  3897. throw std::runtime_error("n_expert must be > 0");
  3898. }
  3899. if (n_expert_used == 0) {
  3900. throw std::runtime_error("n_expert_used must be > 0");
  3901. }
  3902. // MoE branch
  3903. layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
  3904. layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
  3905. layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
  3906. // Shared expert branch
  3907. layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
  3908. layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_exp * n_expert_shared, n_embd}, 0);
  3909. layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
  3910. }
  3911. }
  3912. } break;
  3913. case LLM_ARCH_PLM:
  3914. {
  3915. const int64_t n_embd_head_qk_rope = hparams.n_rot;
  3916. const int64_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
  3917. const int64_t kv_lora_rank = hparams.n_lora_kv;
  3918. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  3919. // output
  3920. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  3921. // output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
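// PLM appears to tie the output head to the token embeddings, so no separate output tensor is loaded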
  3922. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  3923. for (int i = 0; i < n_layer; ++i) {
  3924. auto & layer = layers[i];
  3925. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  3926. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
  3927. layer.wkv_a_mqa = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + (n_embd_head_qk_rope)}, 0);
  3928. layer.attn_kv_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_NORM, "weight", i), {kv_lora_rank}, 0);
  3929. layer.wkv_b = create_tensor(tn(LLM_TENSOR_ATTN_KV_B, "weight", i), {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)}, 0);
  3930. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_head * ( n_embd_head_v), n_embd}, 0);
  3931. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  3932. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  3933. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  3934. }
  3935. } break;
  3936. case LLM_ARCH_BITNET:
  3937. {
  3938. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  3939. // output
  3940. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  3941. for (int i = 0; i < n_layer; ++i) {
  3942. auto & layer = layers[i];
  3943. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  3944. layer.attn_sub_norm = create_tensor(tn(LLM_TENSOR_ATTN_SUB_NORM, "weight", i), {n_embd}, 0);
  3945. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
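// optional per-tensor "scale" entries (shape {1}); presumably only quantized BitNet checkpoints carry them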
  3946. layer.wq_scale = create_tensor(tn(LLM_TENSOR_ATTN_Q, "scale", i), {1}, TENSOR_NOT_REQUIRED);
  3947. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  3948. layer.wk_scale = create_tensor(tn(LLM_TENSOR_ATTN_K, "scale", i), {1}, TENSOR_NOT_REQUIRED);
  3949. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  3950. layer.wv_scale = create_tensor(tn(LLM_TENSOR_ATTN_V, "scale", i), {1}, TENSOR_NOT_REQUIRED);
  3951. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  3952. layer.wo_scale = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "scale", i), {1}, TENSOR_NOT_REQUIRED);
  3953. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  3954. layer.ffn_sub_norm = create_tensor(tn(LLM_TENSOR_FFN_SUB_NORM, "weight", i), {n_ff}, 0);
  3955. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  3956. layer.ffn_gate_scale = create_tensor(tn(LLM_TENSOR_FFN_GATE, "scale", i), {1}, TENSOR_NOT_REQUIRED);
  3957. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
  3958. layer.ffn_down_scale = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "scale", i), {1}, TENSOR_NOT_REQUIRED);
  3959. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  3960. layer.ffn_up_scale = create_tensor(tn(LLM_TENSOR_FFN_UP, "scale", i), {1}, TENSOR_NOT_REQUIRED);
  3961. }
  3962. } break;
  3963. case LLM_ARCH_T5:
  3964. {
  3965. const auto n_rel_attn_bkts = hparams.n_rel_attn_bkts;
  3966. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  3967. // output
  3968. output_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_OUTPUT_NORM, "weight"), {n_embd}, 0);
  3969. output_norm = create_tensor(tn(LLM_TENSOR_DEC_OUTPUT_NORM, "weight"), {n_embd}, 0);
  3970. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  3971. // if output is NULL, init from the input tok embed
  3972. if (output == NULL) {
  3973. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  3974. }
3975. // n_layer: number of encoder layers
3976. // dec_n_layer: number of decoder layers
  3977. const int dec_n_layer = hparams.dec_n_layer;
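// if the decoder is deeper than the encoder, grow the layers array so the extra decoder layers have slots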
  3978. if (dec_n_layer > n_layer) {
  3979. layers.resize(dec_n_layer);
  3980. }
  3981. // load encoder layers
  3982. for (int i = 0; i < n_layer; ++i) {
  3983. auto & layer = layers[i];
  3984. layer.attn_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_NORM, "weight", i), {n_embd}, 0);
  3985. layer.attn_rel_b_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_REL_B, "weight", i), {n_head, n_rel_attn_bkts}, TENSOR_NOT_REQUIRED);
  3986. layer.wq_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_Q, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  3987. layer.wk_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  3988. layer.wv_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
  3989. layer.wo_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_OUT, "weight", i), {n_embd_v_gqa, n_embd}, 0);
  3990. layer.ffn_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_NORM, "weight", i), {n_embd}, 0);
  3991. layer.ffn_gate_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_GATE, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED);
  3992. layer.ffn_down_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  3993. layer.ffn_up_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  3994. }
  3995. // load decoder layers
  3996. for (int i = 0; i < dec_n_layer; ++i) {
  3997. auto & layer = layers[i];
  3998. layer.attn_norm = create_tensor(tn(LLM_TENSOR_DEC_ATTN_NORM, "weight", i), {n_embd}, 0);
  3999. layer.attn_rel_b = create_tensor(tn(LLM_TENSOR_DEC_ATTN_REL_B, "weight", i), {n_head, n_rel_attn_bkts}, TENSOR_NOT_REQUIRED);
  4000. layer.wq = create_tensor(tn(LLM_TENSOR_DEC_ATTN_Q, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  4001. layer.wk = create_tensor(tn(LLM_TENSOR_DEC_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  4002. layer.wv = create_tensor(tn(LLM_TENSOR_DEC_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
  4003. layer.wo = create_tensor(tn(LLM_TENSOR_DEC_ATTN_OUT, "weight", i), {n_embd_v_gqa, n_embd}, 0);
  4004. layer.attn_norm_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_NORM, "weight", i), {n_embd}, 0);
4005. // this tensor seems to be unused in the HF transformers implementation
  4006. layer.attn_rel_b_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_REL_B, "weight", i), {n_head, n_rel_attn_bkts}, TENSOR_NOT_REQUIRED);
  4007. layer.wq_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_Q, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  4008. layer.wk_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  4009. layer.wv_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
  4010. layer.wo_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_OUT, "weight", i), {n_embd_v_gqa, n_embd}, 0);
  4011. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_DEC_FFN_NORM, "weight", i), {n_embd}, 0);
  4012. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_DEC_FFN_GATE, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED);
  4013. layer.ffn_down = create_tensor(tn(LLM_TENSOR_DEC_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  4014. layer.ffn_up = create_tensor(tn(LLM_TENSOR_DEC_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  4015. }
  4016. } break;
  4017. case LLM_ARCH_T5ENCODER:
  4018. {
  4019. const auto n_rel_attn_bkts = hparams.n_rel_attn_bkts;
  4020. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  4021. // output
  4022. output_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_OUTPUT_NORM, "weight"), {n_embd}, 0);
  4023. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  4024. // if output is NULL, init from the input tok embed
  4025. if (output == NULL) {
  4026. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  4027. }
  4028. for (int i = 0; i < n_layer; ++i) {
  4029. auto & layer = layers[i];
  4030. layer.attn_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_NORM, "weight", i), {n_embd}, 0);
  4031. layer.attn_rel_b_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_REL_B, "weight", i), {n_head, n_rel_attn_bkts}, TENSOR_NOT_REQUIRED);
  4032. layer.wq_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_Q, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  4033. layer.wk_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  4034. layer.wv_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
  4035. layer.wo_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_OUT, "weight", i), {n_embd_v_gqa, n_embd}, 0);
  4036. layer.ffn_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_NORM, "weight", i), {n_embd}, 0);
  4037. layer.ffn_gate_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_GATE, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED);
  4038. layer.ffn_down_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  4039. layer.ffn_up_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  4040. }
  4041. } break;
  4042. case LLM_ARCH_JAIS:
  4043. {
  4044. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  4045. // output
  4046. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  4047. output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
  4048. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  4049. for (int i = 0; i < n_layer; ++i) {
  4050. auto & layer = layers[i];
  4051. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  4052. layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
  4053. layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
  4054. layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
  4055. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  4056. layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
  4057. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  4058. layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
  4059. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
  4060. layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
  4061. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  4062. layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, 0);
  4063. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  4064. layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
  4065. }
  4066. } break;
  4067. case LLM_ARCH_CHATGLM:
  4068. {
  4069. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  4070. // output
  4071. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  4072. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  4073. // if output is NULL, init from the input tok embed
  4074. if (output == NULL) {
  4075. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  4076. }
  4077. for (int i = 0; i < n_layer; ++i) {
  4078. auto & layer = layers[i];
  4079. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  4080. layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
  4081. layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
  4082. if (layer.wqkv == nullptr) {
  4083. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
  4084. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  4085. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
  4086. layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  4087. layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
  4088. layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
  4089. }
  4090. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  4091. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  4092. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff * 2}, 0);
  4093. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
  4094. }
  4095. } break;
  4096. case LLM_ARCH_GLM4:
  4097. {
  4098. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  4099. // output
  4100. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  4101. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  4102. // if output is NULL, init from the input tok embed
  4103. if (output == NULL) {
  4104. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  4105. }
  4106. for (int i = 0; i < n_layer; ++i) {
  4107. auto & layer = layers[i];
  4108. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  4109. layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
  4110. layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
  4111. if (layer.wqkv == nullptr) {
  4112. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
  4113. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  4114. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
  4115. layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  4116. layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
  4117. layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
  4118. }
  4119. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  4120. layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
  4121. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  4122. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  4123. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff * 2}, 0);
  4124. layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
  4125. }
  4126. } break;
  4127. case LLM_ARCH_GLM4_MOE:
  4128. {
  4129. const int64_t n_expert = hparams.n_expert;
  4130. const int64_t n_expert_used = hparams.n_expert_used;
  4131. const int64_t n_expert_shared = hparams.n_expert_shared;
  4132. GGML_ASSERT(hparams.n_expert > 0 && "n_expert must be > 0 for GLM4_MOE MoE layers");
  4133. GGML_ASSERT(hparams.n_expert_used > 0 && "n_expert_used must be > 0 for GLM4_MOE MoE layers");
  4134. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
  4135. // output
  4136. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
  4137. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED);
  4138. // if output is NULL, init from the input tok embed
  4139. if (output == NULL) {
  4140. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, TENSOR_DUPLICATED);
  4141. }
4142. // Load ALL tensors, including the NextN layers, to satisfy the total tensor count,
4143. // but only PROCESS the regular layers in the forward pass (the final NextN layers are skipped)
  4144. for (int i = 0; i < n_layer; ++i) {
  4145. int flags = 0;
  4146. if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
  4147. // skip all tensors in the NextN layers
  4148. flags |= TENSOR_SKIP;
  4149. }
  4150. auto & layer = layers[i];
  4151. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, flags);
  4152. // GLM-style attention with bias terms
  4153. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd_head_k * n_head }, flags);
  4154. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_k_gqa }, flags);
  4155. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_v_gqa }, flags);
  4156. layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), { n_embd_head_k * n_head }, flags);
  4157. layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), { n_embd_k_gqa }, flags);
  4158. layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), { n_embd_v_gqa }, flags);
  4159. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, flags);
  4160. // K/Q norm tensors (optional for GLM-4.5 355B variant)
  4161. layer.attn_q_norm = create_tensor(
  4162. tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), { n_embd_head_k }, TENSOR_NOT_REQUIRED | flags);
  4163. layer.attn_k_norm = create_tensor(
  4164. tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), { n_embd_head_k }, TENSOR_NOT_REQUIRED | flags);
  4165. layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), { n_embd }, flags);
  4166. // Check if this layer uses MoE or dense FFN based on n_layer_dense_lead
4167. // GLM-4.5 uses a hybrid architecture: layer 0 is dense, layers 1+ are MoE
  4168. const bool use_moe = (static_cast<uint32_t>(i) >= hparams.n_layer_dense_lead);
  4169. if (use_moe) {
  4170. // MoE layers
  4171. layer.ffn_gate_inp =
  4172. create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), { n_embd, n_expert }, flags);
  4173. layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), { n_expert }, flags);
  4174. // MoE branch
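// if n_ff_exp is not set in the GGUF, fall back to an even split of n_ff across the active experts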
  4175. const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
  4176. layer.ffn_gate_exps = create_tensor(
  4177. tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, flags);
  4178. layer.ffn_down_exps = create_tensor(
  4179. tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff_exp, n_embd, n_expert }, flags);
  4180. layer.ffn_up_exps = create_tensor(
  4181. tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, flags);
  4182. // Shared expert
  4183. if (n_expert_shared > 0) {
  4184. const int64_t n_ff_shexp = n_ff_exp * n_expert_shared;
  4185. layer.ffn_gate_shexp = create_tensor(
  4186. tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), { n_embd, n_ff_shexp }, flags);
  4187. layer.ffn_down_shexp = create_tensor(
  4188. tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_shexp, n_embd }, flags);
  4189. layer.ffn_up_shexp = create_tensor(
  4190. tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), { n_embd, n_ff_shexp }, flags);
  4191. }
  4192. } else {
4193. // Dense layers (the first n_layer_dense_lead layers) - GLM uses separate gate/up projections
  4194. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), { n_embd, n_ff }, flags);
  4195. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd }, flags);
  4196. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, n_ff }, flags);
  4197. }
  4198. // NextN/MTP tensors (preserved but unused) - conditionally load for last nextn_predict_layers
  4199. if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
  4200. layer.nextn.eh_proj = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "weight", i), { 2 * n_embd, n_embd }, flags);
  4201. layer.nextn.embed_tokens = create_tensor(tn(LLM_TENSOR_NEXTN_EMBED_TOKENS, "weight", i), { n_embd, n_vocab }, flags);
  4202. layer.nextn.enorm = create_tensor(tn(LLM_TENSOR_NEXTN_ENORM, "weight", i), { n_embd }, flags);
  4203. layer.nextn.hnorm = create_tensor(tn(LLM_TENSOR_NEXTN_HNORM, "weight", i), { n_embd }, flags);
  4204. layer.nextn.shared_head_head = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "weight", i), { n_embd, n_vocab }, flags);
  4205. layer.nextn.shared_head_norm = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "weight", i), { n_embd }, flags);
  4206. }
  4207. }
  4208. }
  4209. break;
  4210. case LLM_ARCH_NEMOTRON:
  4211. {
  4212. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  4213. // output
  4214. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  4215. output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
  4216. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  4217. for (int i = 0; i < n_layer; ++i) {
  4218. auto & layer = layers[i];
  4219. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  4220. layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
  4221. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  4222. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  4223. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  4224. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  4225. // optional bias tensors
  4226. layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  4227. layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
  4228. layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
  4229. layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  4230. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  4231. layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
  4232. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  4233. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  4234. // optional MLP bias
  4235. layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  4236. layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
  4237. }
  4238. } break;
  4239. case LLM_ARCH_NEMOTRON_H:
  4240. {
  4241. // mamba2 Mixer SSM params
  4242. // NOTE: int64_t for tensor dimensions
  4243. const int64_t d_conv = hparams.ssm_d_conv;
  4244. const int64_t d_inner = hparams.ssm_d_inner;
  4245. const int64_t d_state = hparams.ssm_d_state;
  4246. const int64_t n_ssm_head = hparams.ssm_dt_rank;
  4247. const int64_t n_group = hparams.ssm_n_group;
  4248. const int64_t d_in_proj = 2*d_inner + 2*n_group*d_state + n_ssm_head;
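// d_in_proj presumably packs the Mamba-2 in-projection: z and x (d_inner each), B and C (n_group*d_state each), and dt (n_ssm_head)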
  4249. // embeddings
  4250. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  4251. // output
  4252. {
  4253. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  4254. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  4255. // if output is NULL, init from the input tok embed, duplicated to allow offloading
  4256. if (output == NULL) {
  4257. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  4258. }
  4259. }
  4260. for (int i = 0; i < n_layer; ++i) {
  4261. auto & layer = layers[i];
  4262. // all blocks use the attn norm
  4263. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  4264. if (hparams.is_recurrent(i)) {
  4265. // ssm layers
  4266. layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), {n_embd, d_in_proj}, 0);
  4267. layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {d_conv, d_inner + 2*n_group*d_state}, 0);
  4268. layer.ssm_conv1d_b = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "bias", i), {d_inner + 2*n_group*d_state}, TENSOR_NOT_REQUIRED);
  4269. layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {n_ssm_head}, 0);
  4270. // no "weight" suffix for these
  4271. layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {1, n_ssm_head}, 0);
  4272. layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {1, n_ssm_head}, 0);
  4273. layer.ssm_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", i), {d_inner / n_group, n_group}, 0);
  4274. // out_proj
  4275. layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {d_inner, n_embd}, 0);
  4276. } else if (hparams.n_ff(i) == 0) {
  4277. // attention layers (with optional bias)
  4278. const int64_t n_head_i = hparams.n_head(i);
  4279. const int64_t n_embd_k_gqa_i = hparams.n_embd_k_gqa(i);
  4280. const int64_t n_embd_v_gqa_i = hparams.n_embd_v_gqa(i);
  4281. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head_i}, 0);
  4282. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa_i}, 0);
  4283. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa_i}, 0);
  4284. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head_i, n_embd}, 0);
  4285. layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  4286. layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_k_gqa_i}, TENSOR_NOT_REQUIRED);
  4287. layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_v_gqa_i}, TENSOR_NOT_REQUIRED);
  4288. layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  4289. } else {
  4290. // mlp layers
  4291. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { hparams.n_ff(i), n_embd}, 0);
  4292. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, hparams.n_ff(i)}, 0);
  4293. layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  4294. layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {hparams.n_ff(i)}, TENSOR_NOT_REQUIRED);
  4295. }
  4296. }
  4297. } break;
  4298. case LLM_ARCH_EXAONE:
  4299. {
  4300. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  4301. // output
  4302. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  4303. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  4304. // if output is NULL, init from the input tok embed
  4305. if (output == NULL) {
  4306. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  4307. }
  4308. for (int i = 0; i < n_layer; ++i) {
  4309. auto & layer = layers[i];
  4310. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  4311. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
  4312. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  4313. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
  4314. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
  4315. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
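// rope_freqs appears to be stored only once and shared across layers, hence TENSOR_DUPLICATED for i != 0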
  4316. layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
  4317. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  4318. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  4319. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  4320. }
  4321. } break;
  4322. case LLM_ARCH_EXAONE4:
  4323. {
  4324. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  4325. // output
  4326. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  4327. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  4328. // if output is NULL, init from the input tok embed
  4329. if (output == NULL) {
  4330. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  4331. }
  4332. for (int i = 0; i < n_layer; ++i) {
  4333. auto & layer = layers[i];
  4334. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
  4335. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  4336. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
  4337. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  4338. layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
  4339. layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
  4340. layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
  4341. layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
  4342. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  4343. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  4344. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  4345. layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
  4346. }
  4347. } break;
  4348. case LLM_ARCH_RWKV6:
  4349. {
  4350. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  4351. // Block 0, LN0
  4352. tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0);
  4353. tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}, 0);
  4354. // output
  4355. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  4356. output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
  4357. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  4358. const int time_mix_extra_dim = hparams.time_mix_extra_dim;
  4359. const int time_decay_extra_dim = hparams.time_decay_extra_dim;
  4360. const int head_size = hparams.wkv_head_size;
  4361. const int attn_hidden_size = n_embd;
  4362. const int ffn_size = hparams.n_ff_arr[0];
  4363. for (int i = 0; i < n_layer; ++i) {
  4364. auto & layer = layers[i];
  4365. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  4366. layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
  4367. layer.attn_norm_2 = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, 0);
  4368. layer.attn_norm_2_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, 0);
  4369. layer.time_mix_w1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W1, "weight", i), {n_embd, time_mix_extra_dim * 5}, 0);
  4370. layer.time_mix_w2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W2, "weight", i), {time_mix_extra_dim, n_embd, 5}, 0);
  4371. layer.time_mix_lerp_x = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_X, "weight", i), {n_embd, 1, 1}, 0);
  4372. layer.time_mix_lerp_w = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_W, "weight", i), {n_embd, 1, 1}, TENSOR_NOT_REQUIRED);
  4373. layer.time_mix_lerp_k = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_K, "weight", i), {n_embd, 1, 1}, TENSOR_NOT_REQUIRED);
  4374. layer.time_mix_lerp_v = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_V, "weight", i), {n_embd, 1, 1}, TENSOR_NOT_REQUIRED);
  4375. layer.time_mix_lerp_r = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_R, "weight", i), {n_embd, 1, 1}, TENSOR_NOT_REQUIRED);
  4376. layer.time_mix_lerp_g = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_G, "weight", i), {n_embd, 1, 1}, TENSOR_NOT_REQUIRED);
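// newer conversions appear to fuse the five per-channel lerp tensors (w, k, v, r, g) into time_mix_lerp_fused; either the fused tensor or the individual ones must be present (see the assert below)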
  4377. layer.time_mix_lerp_fused = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_FUSED, "weight", i), {n_embd, 1, 1, 5}, TENSOR_NOT_REQUIRED);
  4378. GGML_ASSERT(!(layer.time_mix_lerp_fused == NULL && layer.time_mix_lerp_w == NULL));
  4379. layer.time_mix_first = create_tensor(tn(LLM_TENSOR_TIME_MIX_FIRST, "weight", i), {head_size, n_embd / head_size}, 0);
  4380. layer.time_mix_decay = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY, "weight", i), {n_embd}, 0);
  4381. layer.time_mix_decay_w1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY_W1, "weight", i), {n_embd, time_decay_extra_dim}, 0);
  4382. layer.time_mix_decay_w2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY_W2, "weight", i), {time_decay_extra_dim, attn_hidden_size}, 0);
  4383. layer.time_mix_key = create_tensor(tn(LLM_TENSOR_TIME_MIX_KEY, "weight", i), {attn_hidden_size, n_embd}, 0);
  4384. layer.time_mix_value = create_tensor(tn(LLM_TENSOR_TIME_MIX_VALUE, "weight", i), {attn_hidden_size, n_embd}, 0);
  4385. layer.time_mix_receptance = create_tensor(tn(LLM_TENSOR_TIME_MIX_RECEPTANCE, "weight", i), {attn_hidden_size, n_embd}, 0);
  4386. layer.time_mix_gate = create_tensor(tn(LLM_TENSOR_TIME_MIX_GATE, "weight", i), {attn_hidden_size, n_embd}, 0);
  4387. layer.time_mix_ln = create_tensor(tn(LLM_TENSOR_TIME_MIX_LN, "weight", i), {n_embd}, 0);
  4388. layer.time_mix_ln_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_LN, "bias", i), {n_embd}, 0);
  4389. layer.time_mix_output = create_tensor(tn(LLM_TENSOR_TIME_MIX_OUTPUT, "weight", i), {n_embd, attn_hidden_size}, 0);
  4390. layer.channel_mix_lerp_k = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_LERP_K, "weight", i), {n_embd, 1, 1}, 0);
  4391. layer.channel_mix_lerp_r = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_LERP_R, "weight", i), {n_embd, 1, 1}, 0);
  4392. layer.channel_mix_key = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_KEY, "weight", i), {n_embd, ffn_size}, 0);
  4393. layer.channel_mix_value = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_VALUE, "weight", i), {ffn_size, n_embd}, 0);
  4394. layer.channel_mix_receptance = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_RECEPTANCE, "weight", i), {n_embd, n_embd}, 0);
  4395. }
  4396. } break;
  4397. case LLM_ARCH_RWKV6QWEN2:
  4398. {
  4399. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  4400. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  4401. output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, TENSOR_NOT_REQUIRED);
  4402. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  4403. const int time_mix_extra_dim = hparams.time_mix_extra_dim;
  4404. const int time_decay_extra_dim = hparams.time_decay_extra_dim;
  4405. const int head_size = hparams.wkv_head_size;
  4406. const int attn_hidden_size = n_embd;
  4407. const int n_head_kv = hparams.n_head_kv();
  4408. int attn_key_value_size;
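// with fewer KV heads than attention heads (GQA-style), the fused key/value width shrinks to n_head_kv * head_size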
  4409. if (n_head_kv == 0 || attn_hidden_size / head_size == n_head_kv) {
  4410. attn_key_value_size = attn_hidden_size;
  4411. } else {
  4412. attn_key_value_size = n_head_kv * head_size;
  4413. }
  4414. for (int i = 0; i < n_layer; ++i) {
  4415. auto & layer = layers[i];
  4416. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  4417. layer.time_mix_w1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W1, "weight", i), {n_embd, time_mix_extra_dim * 5}, 0);
  4418. layer.time_mix_w2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W2, "weight", i), {time_mix_extra_dim, n_embd, 5}, 0);
  4419. layer.time_mix_lerp_x = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_X, "weight", i), {n_embd, 1, 1}, 0);
  4420. layer.time_mix_lerp_fused = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_FUSED, "weight", i), {n_embd, 1, 1, 5}, 0);
  4421. layer.time_mix_first = create_tensor(tn(LLM_TENSOR_TIME_MIX_FIRST, "weight", i), {head_size, n_embd / head_size}, TENSOR_NOT_REQUIRED);
  4422. layer.time_mix_decay = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY, "weight", i), {n_embd}, 0);
  4423. layer.time_mix_decay_w1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY_W1, "weight", i), {n_embd, time_decay_extra_dim}, 0);
  4424. layer.time_mix_decay_w2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY_W2, "weight", i), {time_decay_extra_dim, attn_hidden_size}, 0);
  4425. layer.time_mix_key = create_tensor(tn(LLM_TENSOR_TIME_MIX_KEY, "weight", i), {n_embd, attn_key_value_size}, 0);
  4426. layer.time_mix_value = create_tensor(tn(LLM_TENSOR_TIME_MIX_VALUE, "weight", i), {n_embd, attn_key_value_size}, 0);
  4427. layer.time_mix_receptance = create_tensor(tn(LLM_TENSOR_TIME_MIX_RECEPTANCE, "weight", i), {attn_hidden_size, n_embd}, 0);
  4428. layer.time_mix_gate = create_tensor(tn(LLM_TENSOR_TIME_MIX_GATE, "weight", i), {attn_hidden_size, n_embd}, 0);
  4429. // optional bias tensors
  4430. layer.time_mix_key_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_KEY, "bias", i), {attn_key_value_size}, TENSOR_NOT_REQUIRED);
  4431. layer.time_mix_value_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_VALUE, "bias", i), {attn_key_value_size}, TENSOR_NOT_REQUIRED);
  4432. layer.time_mix_receptance_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_RECEPTANCE, "bias", i), {attn_hidden_size}, TENSOR_NOT_REQUIRED);
  4433. layer.time_mix_output = create_tensor(tn(LLM_TENSOR_TIME_MIX_OUTPUT, "weight", i), {n_embd, attn_hidden_size}, 0);
  4434. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  4435. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  4436. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  4437. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  4438. }
  4439. } break;
  4440. case LLM_ARCH_RWKV7:
  4441. {
  4442. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  4443. // Block 0, LN0
  4444. tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0);
  4445. tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}, 0);
  4446. // output
  4447. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  4448. output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
  4449. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  4450. const int n_lora_decay = hparams.n_lora_decay;
  4451. const int n_lora_iclr = hparams.n_lora_iclr;
  4452. const int n_lora_value_res_mix = hparams.n_lora_value_res_mix;
  4453. const int n_lora_gate = hparams.n_lora_gate;
  4454. const int attn_hidden_size = n_embd;
  4455. const int ffn_size = hparams.n_ff_arr[0];
  4456. for (int i = 0; i < n_layer; ++i) {
  4457. auto & layer = layers[i];
  4458. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  4459. layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
  4460. layer.attn_norm_2 = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, 0);
  4461. layer.attn_norm_2_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, 0);
                        layer.time_mix_w0 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W0, "weight", i), {n_embd}, 0);
                        layer.time_mix_w1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W1, "weight", i), {n_embd, n_lora_decay}, 0);
                        layer.time_mix_w2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W2, "weight", i), {n_lora_decay, n_embd}, 0);

                        layer.time_mix_a0 = create_tensor(tn(LLM_TENSOR_TIME_MIX_A0, "weight", i), {n_embd}, 0);
                        layer.time_mix_a1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_A1, "weight", i), {n_embd, n_lora_iclr}, 0);
                        layer.time_mix_a2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_A2, "weight", i), {n_lora_iclr, n_embd}, 0);

                        if (i == 0) {
                            // actually not used
                            layer.time_mix_v0 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V0, "weight", i), {n_embd}, 0);
                            layer.time_mix_v1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V1, "weight", i), {n_embd, n_lora_iclr}, 0);
                            layer.time_mix_v2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V2, "weight", i), {n_lora_iclr, n_embd}, 0);
                        } else {
                            layer.time_mix_v0 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V0, "weight", i), {n_embd}, 0);
                            layer.time_mix_v1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V1, "weight", i), {n_embd, n_lora_value_res_mix}, 0);
                            layer.time_mix_v2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V2, "weight", i), {n_lora_value_res_mix, n_embd}, 0);
                        }

                        layer.time_mix_g1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_G1, "weight", i), {n_embd, n_lora_gate}, 0);
                        layer.time_mix_g2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_G2, "weight", i), {n_lora_gate, n_embd}, 0);
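                        // token-shift interpolation coefficients for the time-mix inputs,
                        // packed into a single {n_embd, 1, 1, 6} tensor (one slice per input)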
                        layer.time_mix_lerp_fused = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_FUSED, "weight", i), {n_embd, 1, 1, 6}, 0);

                        layer.time_mix_k_k = create_tensor(tn(LLM_TENSOR_TIME_MIX_K_K, "weight", i), {attn_hidden_size}, 0);
                        layer.time_mix_k_a = create_tensor(tn(LLM_TENSOR_TIME_MIX_K_A, "weight", i), {attn_hidden_size}, 0);
                        layer.time_mix_r_k = create_tensor(tn(LLM_TENSOR_TIME_MIX_R_K, "weight", i), {attn_hidden_size}, 0);

                        layer.time_mix_key = create_tensor(tn(LLM_TENSOR_TIME_MIX_KEY, "weight", i), {attn_hidden_size, n_embd}, 0);
                        layer.time_mix_value = create_tensor(tn(LLM_TENSOR_TIME_MIX_VALUE, "weight", i), {attn_hidden_size, n_embd}, 0);
                        layer.time_mix_receptance = create_tensor(tn(LLM_TENSOR_TIME_MIX_RECEPTANCE, "weight", i), {attn_hidden_size, n_embd}, 0);

                        layer.time_mix_ln = create_tensor(tn(LLM_TENSOR_TIME_MIX_LN, "weight", i), {n_embd}, 0);
                        layer.time_mix_ln_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_LN, "bias", i), {n_embd}, 0);
                        layer.time_mix_output = create_tensor(tn(LLM_TENSOR_TIME_MIX_OUTPUT, "weight", i), {n_embd, attn_hidden_size}, 0);

                        layer.channel_mix_lerp_k = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_LERP_K, "weight", i), {n_embd, 1, 1}, 0);
                        layer.channel_mix_key = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_KEY, "weight", i), {n_embd, ffn_size}, 0);
                        layer.channel_mix_value = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_VALUE, "weight", i), {ffn_size, n_embd}, 0);
                    }
                } break;

            case LLM_ARCH_ARWKV7:
                {
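                    // ARWKV7 reuses the RWKV7 time-mix block but replaces the RWKV
                    // channel-mix with a gated FFN (ffn_gate/ffn_down/ffn_up below)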
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

                    // output
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);

                    const int n_lora_decay = hparams.n_lora_decay;
                    const int n_lora_iclr = hparams.n_lora_iclr;
                    const int n_lora_value_res_mix = hparams.n_lora_value_res_mix;
                    const int n_lora_gate = hparams.n_lora_gate;
                    const int attn_hidden_size = n_embd;

                    for (int i = 0; i < n_layer; ++i) {
                        auto & layer = layers[i];

                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);

                        layer.time_mix_w0 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W0, "weight", i), {n_embd}, 0);
                        layer.time_mix_w1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W1, "weight", i), {n_embd, n_lora_decay}, 0);
                        layer.time_mix_w2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W2, "weight", i), {n_lora_decay, n_embd}, 0);

                        layer.time_mix_a0 = create_tensor(tn(LLM_TENSOR_TIME_MIX_A0, "weight", i), {n_embd}, 0);
                        layer.time_mix_a1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_A1, "weight", i), {n_embd, n_lora_iclr}, 0);
                        layer.time_mix_a2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_A2, "weight", i), {n_lora_iclr, n_embd}, 0);

                        if (i == 0) {
                            // actually not used
                            layer.time_mix_v0 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V0, "weight", i), {n_embd}, 0);
                            layer.time_mix_v1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V1, "weight", i), {n_embd, n_lora_iclr}, 0);
                            layer.time_mix_v2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V2, "weight", i), {n_lora_iclr, n_embd}, 0);
                        } else {
                            layer.time_mix_v0 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V0, "weight", i), {n_embd}, 0);
                            layer.time_mix_v1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V1, "weight", i), {n_embd, n_lora_value_res_mix}, 0);
                            layer.time_mix_v2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V2, "weight", i), {n_lora_value_res_mix, n_embd}, 0);
                        }

                        layer.time_mix_g1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_G1, "weight", i), {n_embd, n_lora_gate}, TENSOR_NOT_REQUIRED);
                        layer.time_mix_g2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_G2, "weight", i), {n_lora_gate, n_embd}, TENSOR_NOT_REQUIRED);
                        try {
                            layer.time_mix_lerp_fused = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_FUSED, "weight", i), {n_embd, 1, 1, 6}, 0);
                        } catch(std::runtime_error & e) {
                            // ARWKV models may not have gate tensors, in which case the fused lerp packs 5 coefficients instead of 6
                            layer.time_mix_lerp_fused = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_FUSED, "weight", i), {n_embd, 1, 1, 5}, 0);
                        }
                        layer.time_mix_k_k = create_tensor(tn(LLM_TENSOR_TIME_MIX_K_K, "weight", i), {attn_hidden_size}, 0);
                        layer.time_mix_k_a = create_tensor(tn(LLM_TENSOR_TIME_MIX_K_A, "weight", i), {attn_hidden_size}, 0);
                        layer.time_mix_r_k = create_tensor(tn(LLM_TENSOR_TIME_MIX_R_K, "weight", i), {attn_hidden_size}, 0);

                        layer.time_mix_key = create_tensor(tn(LLM_TENSOR_TIME_MIX_KEY, "weight", i), {attn_hidden_size, n_embd}, 0);
                        layer.time_mix_value = create_tensor(tn(LLM_TENSOR_TIME_MIX_VALUE, "weight", i), {attn_hidden_size, n_embd}, 0);
                        layer.time_mix_receptance = create_tensor(tn(LLM_TENSOR_TIME_MIX_RECEPTANCE, "weight", i), {attn_hidden_size, n_embd}, 0);

                        layer.time_mix_ln = create_tensor(tn(LLM_TENSOR_TIME_MIX_LN, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
                        layer.time_mix_ln_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_LN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
                        layer.time_mix_output = create_tensor(tn(LLM_TENSOR_TIME_MIX_OUTPUT, "weight", i), {n_embd, attn_hidden_size}, 0);

                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
                        layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
                    }
                } break;

            case LLM_ARCH_CHAMELEON:
                {
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

                    // output
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
                    // if output is NULL, init from the input tok embed
                    if (output == NULL) {
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
                    }

                    for (int i = 0; i < n_layer; ++i) {
                        auto & layer = layers[i];

                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
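                        // q/k norms are applied per attention head, hence the 2D {head_dim, n_head} shapes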
                        layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k, n_head}, 0);
                        layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k, n_head_kv}, 0);
                        layer.attn_q_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {n_embd_head_k, n_head}, TENSOR_NOT_REQUIRED);
                        layer.attn_k_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {n_embd_head_k, n_head_kv}, TENSOR_NOT_REQUIRED);

                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);

                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
                        layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
                    }
                } break;

            case LLM_ARCH_WAVTOKENIZER_DEC:
                {
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {hparams.n_embd_features, n_vocab}, 0);

                    conv1d = create_tensor(tn(LLM_TENSOR_CONV1D, "weight"), {7, hparams.n_embd_features, hparams.posnet.n_embd}, 0);
                    conv1d_b = create_tensor(tn(LLM_TENSOR_CONV1D, "bias"), {1, hparams.posnet.n_embd}, 0);

                    // posnet
                    {
                        const int64_t n_embd = hparams.posnet.n_embd;

                        for (uint32_t i = 0; i < hparams.posnet.n_layer; ++i) {
                            auto & layer = layers[i].posnet;

                            // posnet:
                            //
                            // - resnet
                            // - resnet
                            // - attn
                            // - resnet
                            // - resnet
                            // - norm
                            //
                            switch (i) {
                                case 0:
                                case 1:
                                case 3:
                                case 4:
                                    {
                                        layer.norm1 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "weight", i), {1, n_embd}, 0);
                                        layer.norm1_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "bias", i), {1, n_embd}, 0);

                                        layer.conv1 = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "weight", i), {3, n_embd, n_embd}, 0);
                                        layer.conv1_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "bias", i), {1, n_embd}, 0);

                                        layer.norm2 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "weight", i), {1, n_embd}, 0);
                                        layer.norm2_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "bias", i), {1, n_embd}, 0);

                                        layer.conv2 = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "weight", i), {3, n_embd, n_embd}, 0);
                                        layer.conv2_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "bias", i), {1, n_embd}, 0);
                                    } break;
                                case 2:
                                    {
                                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "weight", i), {1, n_embd}, 0);
                                        layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "bias", i), {1, n_embd}, 0);

                                        layer.attn_q = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_Q, "weight", i), {1, n_embd, n_embd}, 0);
                                        layer.attn_q_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_Q, "bias", i), {1, n_embd}, 0);

                                        layer.attn_k = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_K, "weight", i), {1, n_embd, n_embd}, 0);
                                        layer.attn_k_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_K, "bias", i), {1, n_embd}, 0);

                                        layer.attn_v = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_V, "weight", i), {1, n_embd, n_embd}, 0);
                                        layer.attn_v_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_V, "bias", i), {1, n_embd}, 0);

                                        layer.attn_o = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_OUT, "weight", i), {1, n_embd, n_embd}, 0);
                                        layer.attn_o_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_OUT, "bias", i), {1, n_embd}, 0);
                                    } break;
                                case 5:
                                    {
                                        layer.norm = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "weight", i), {1, n_embd}, 0);
                                        layer.norm_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "bias", i), {1, n_embd}, 0);
                                    } break;
                                default: GGML_ABORT("unknown posnet layer");
                            };
                        }
                    }

                    GGML_ASSERT(hparams.posnet.n_embd == hparams.convnext.n_embd);

                    tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {hparams.posnet.n_embd}, 0);
                    tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {hparams.posnet.n_embd}, 0);

                    // convnext
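                    // each ConvNeXt block: depthwise conv (dw) -> norm -> pointwise MLP (pw1/pw2) -> learnable per-channel scale (gamma)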
                    {
                        const int64_t n_embd = hparams.convnext.n_embd;

                        for (uint32_t i = 0; i < hparams.convnext.n_layer; ++i) {
                            auto & layer = layers[i].convnext;

                            layer.dw = create_tensor(tn(LLM_TENSOR_CONVNEXT_DW, "weight", i), {7, 1, n_embd}, 0);
                            layer.dw_b = create_tensor(tn(LLM_TENSOR_CONVNEXT_DW, "bias", i), {1, n_embd}, 0);

                            layer.norm = create_tensor(tn(LLM_TENSOR_CONVNEXT_NORM, "weight", i), {n_embd}, 0);
                            layer.norm_b = create_tensor(tn(LLM_TENSOR_CONVNEXT_NORM, "bias", i), {n_embd}, 0);

                            layer.pw1 = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW1, "weight", i), {n_embd, n_ff}, 0);
                            layer.pw1_b = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW1, "bias", i), {n_ff}, 0);

                            layer.pw2 = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW2, "weight", i), {n_ff, n_embd}, 0);
                            layer.pw2_b = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW2, "bias", i), {n_embd}, 0);

                            layer.gamma = create_tensor(tn(LLM_TENSOR_CONVNEXT_GAMMA, "weight", i), {n_embd}, 0);
                        }

                        // output
                        output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
                        output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
                    }

                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {hparams.convnext.n_embd, n_embd}, 0);
                    output_b = create_tensor(tn(LLM_TENSOR_OUTPUT, "bias"), {n_embd}, 0);
                } break;

            case LLM_ARCH_BAILINGMOE:
                {
                    const int64_t n_ff_exp = hparams.n_ff_exp;
                    const int64_t n_expert_shared = hparams.n_expert_shared;

                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

                    // output
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);

                    for (int i = 0; i < n_layer; ++i) {
                        auto & layer = layers[i];

                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);

                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_head * n_rot}, 0);
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_head_kv * n_rot}, 0);
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_head_kv * n_rot}, 0);
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_head * n_rot, n_embd}, 0);

                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
                        layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);

                        if (n_expert == 0) {
                            throw std::runtime_error("n_expert must be > 0");
                        }
                        if (n_expert_used == 0) {
                            throw std::runtime_error("n_expert_used must be > 0");
                        }
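                        // MoE branch: per-expert FFN weights are stacked along the last dim ({..., n_expert});
                        // the shared experts are fused into single tensors of width n_ff_exp * n_expert_shared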
                        layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
                        layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
                        layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);

                        layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
                        layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_exp * n_expert_shared, n_embd}, 0);
                        layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
                    }
                } break;

            case LLM_ARCH_DOTS1:
                {
                    const int64_t n_ff_exp = hparams.n_ff_exp;
                    const int64_t n_expert_shared = hparams.n_expert_shared;

                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);

                    for (int i = 0; i < n_layer; ++i) {
                        auto & layer = layers[i];

                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);

                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);

                        layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
                        layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);

                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);

                        if (i < (int) hparams.n_layer_dense_lead) {
                            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
                            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
                            layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
                        } else {
                            layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
                            layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, TENSOR_NOT_REQUIRED);

                            if (n_expert == 0) {
                                throw std::runtime_error("n_expert must be > 0");
                            }
                            if (n_expert_used == 0) {
                                throw std::runtime_error("n_expert_used must be > 0");
                            }

                            // MoE branch
                            layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
                            layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
                            layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);

                            // Shared expert branch
                            layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
                            layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_exp * n_expert_shared, n_embd}, 0);
                            layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
                        }
                    }
                } break;

            case LLM_ARCH_ARCEE:
                {
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

                    // output
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
                    // if output is NULL, init from the input tok embed
                    if (output == NULL) {
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
                    }

                    for (int i = 0; i < n_layer; ++i) {
                        auto & layer = layers[i];

                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);

                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);

                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
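                        // optional rope frequency factors; layers > 0 mark the tensor as duplicated to avoid double-counting the shared data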
                        layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
                        layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
                    }
                } break;

            case LLM_ARCH_ERNIE4_5:
            case LLM_ARCH_ERNIE4_5_MOE:
                {
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

                    // output
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
                    // if output is NULL, init from the input tok embed
                    if (output == NULL) {
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
                    }

                    for (int i = 0; i < n_layer; ++i) {
                        auto & layer = layers[i];

                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);

                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);

                        // optional bias tensors
                        layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
                        layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
                        layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
                        layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);

                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);

                        if (arch == LLM_ARCH_ERNIE4_5_MOE && static_cast<uint32_t>(i) >= hparams.n_layer_dense_lead) { // MoE layers
                            int n_ff_exp = hparams.n_ff_exp;

                            layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
                            layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, TENSOR_NOT_REQUIRED);
                            layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, TENSOR_NOT_REQUIRED);
                            layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff_exp, n_embd, n_expert}, 0);
                            layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, 0);

                            // Shared expert (if present)
                            if (hparams.n_ff_shexp > 0) {
                                layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), { n_embd, hparams.n_ff_shexp}, 0);
                                layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {hparams.n_ff_shexp, n_embd }, 0);
                                layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), { n_embd, hparams.n_ff_shexp}, 0);
                            }
                        } else { // Dense layers
                            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
                            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
                            layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
                        }
                    }
                } break;

            case LLM_ARCH_FALCON_H1:
                {
                    // Common
                    const int64_t hidden_size = hparams.n_embd; // hidden_size

                    // mamba2 Mixer SSM params
                    const int64_t ssm_conv_kernel_size = hparams.ssm_d_conv; // ssm_conv_kernel_size
                    const int64_t ssm_n_groups = hparams.ssm_n_group; // ssm_n_groups
                    const int64_t ssm_state_size = hparams.ssm_d_state; // ssm_state_size
                    const int64_t ssm_intermediate_size = hparams.ssm_d_inner; // TODO expand
                    const int64_t ssm_num_heads = hparams.ssm_dt_rank; // ssm_num_heads
                    const int64_t ssm_conv_dim = ssm_intermediate_size + 2 * ssm_n_groups * ssm_state_size;
                    const int64_t ssm_projection_size = ssm_intermediate_size + ssm_conv_dim + ssm_num_heads;
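                    // the SSM in-projection is split into [x | conv(x, B, C) | dt] parts, hence
                    // ssm_projection_size = intermediate + conv_dim + num_heads, where conv_dim
                    // also carries the grouped B/C projections (2 * n_groups * state_size channels)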
                    // attn params
                    const int64_t attn_num_attention_head = hparams.n_head(0); // rename to: attn_num_attention_head
                    const int64_t attn_num_key_value_head = hparams.n_head_kv(0);

                    // ffn params
                    const int64_t ffn_intermediate_size = hparams.n_ff(0);

                    // embeddings
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {hidden_size, n_vocab}, 0);

                    // output
                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {hidden_size, n_vocab}, TENSOR_NOT_REQUIRED);
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {hidden_size}, 0);
                    // if output is NULL, init from the input tok embed
                    if (output == NULL) {
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {hidden_size, n_vocab}, TENSOR_DUPLICATED);
                    }

                    for (int i = 0; i < n_layer; ++i) {
                        auto & layer = layers[i];

                        /*SSM LAYERS*/
                        // ssm in
                        layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), {hidden_size, ssm_projection_size}, 0);
                        // ssm 1d conv
                        layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {ssm_conv_kernel_size, ssm_conv_dim}, 0);
                        layer.ssm_conv1d_b = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "bias", i), {ssm_conv_dim}, TENSOR_NOT_REQUIRED);
                        // ssm_dt
                        layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {ssm_num_heads}, 0);
                        // no "weight" suffix for these
                        layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {1, ssm_num_heads}, 0);
                        layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {1, ssm_num_heads}, 0);
                        // ssm_norm
                        layer.ssm_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", i), {ssm_intermediate_size / ssm_n_groups, ssm_n_groups}, TENSOR_NOT_REQUIRED);
                        // out_proj
                        layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {ssm_intermediate_size, hidden_size}, 0);

                        /*ATTENTION LAYERS*/
                        // attention layers (with optional bias)
                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {hidden_size, n_embd_head_k * attn_num_attention_head}, 0);
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {hidden_size, attn_num_key_value_head * n_embd_head_k}, 0);
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {hidden_size, attn_num_key_value_head * n_embd_head_v}, 0);
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * attn_num_attention_head, hidden_size}, 0);
                        layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {hidden_size}, TENSOR_NOT_REQUIRED);
                        layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {attn_num_key_value_head * n_embd_head_k}, TENSOR_NOT_REQUIRED);
                        layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {attn_num_key_value_head * n_embd_head_v}, TENSOR_NOT_REQUIRED);
                        layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {hidden_size}, TENSOR_NOT_REQUIRED);
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {hidden_size}, 0);

                        // feed forward (w/ optional biases)
                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, i), {hidden_size}, 0);
                        layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {hidden_size, ffn_intermediate_size}, 0);
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { ffn_intermediate_size, hidden_size}, 0);
                        layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {hidden_size, ffn_intermediate_size}, 0);

                        layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {ffn_intermediate_size}, TENSOR_NOT_REQUIRED);
                        layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {hidden_size}, TENSOR_NOT_REQUIRED);
                        layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {ffn_intermediate_size}, TENSOR_NOT_REQUIRED);
                    }
                } break;

            case LLM_ARCH_HUNYUAN_MOE:
                {
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

                    // output
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
                    // if output is NULL, init from the input tok embed
                    if (output == NULL) {
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
                    }

                    for (int i = 0; i < n_layer; ++i) {
                        auto & layer = layers[i];

                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);

                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);

                        layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
                        layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);

                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);

                        layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
                        layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
                        layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert}, 0);
                        layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);

                        layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, 0);
                        layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, 0);
                        layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {hparams.n_ff_shexp, n_embd}, 0);
                    }
                } break;

            case LLM_ARCH_HUNYUAN_DENSE:
                {
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

                    // output
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
                    // if output is NULL, init from the input tok embed
                    if (output == NULL) {
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
                    }

                    for (int i = 0; i < n_layer; ++i) {
                        auto & layer = layers[i];

                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);

                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);

                        layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
                        layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);

                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
                        layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
                    }
                } break;

            case LLM_ARCH_SMOLLM3:
                {
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

                    // output
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
                    // if output is NULL, init from the input tok embed
                    if (output == NULL) {
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
                    }

                    for (int i = 0; i < n_layer; ++i) {
                        auto & layer = layers[i];

                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);

                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);

                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
                        layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
                    }
                } break;

            case LLM_ARCH_OPENAI_MOE:
                {
                    const int64_t n_ff_exp = hparams.n_ff_exp;

                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

                    // output
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);

                    for (int i = 0; i < n_layer; ++i) {
                        auto & layer = layers[i];

                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
                        layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);

                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_head * n_rot}, 0);
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_head_kv * n_rot}, 0);
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_head_kv * n_rot}, 0);
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_head * n_rot, n_embd}, 0);
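                        // learned per-head attention sink values (one scalar per head)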
                        layer.attn_sinks = create_tensor(tn(LLM_TENSOR_ATTN_SINKS, "weight", i), {n_head}, 0);

                        layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), { n_embd, n_expert}, 0);
                        layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
                        layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
                        layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);

                        // bias
                        layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_head * n_rot}, 0);
                        layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_head_kv * n_rot}, 0);
                        layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_head_kv * n_rot}, 0);
                        layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);

                        layer.ffn_gate_inp_b = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "bias", i), {n_expert}, 0);
                        layer.ffn_gate_exps_b = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "bias", i), {n_ff_exp, n_expert}, 0);
                        layer.ffn_down_exps_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "bias", i), { n_embd, n_expert}, 0);
                        layer.ffn_up_exps_b = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "bias", i), {n_ff_exp, n_expert}, 0);
                    }
                } break;

            case LLM_ARCH_LFM2:
                {
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
                    tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0);

                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
                    if (output == NULL) {
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
                    }

                    for (int i = 0; i < n_layer; ++i) {
                        auto & layer = layers[i];

                        // ffn is same for transformer and conv layers
                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
                        layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);

                        // for operator_norm
                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
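                        // LFM2 interleaves full-attention layers with gated short-convolution layers;
                        // hparams.is_recurrent(i) selects which set of weights layer i uses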
                        if (!hparams.is_recurrent(i)) {
                            layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
                            layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
                            GGML_ASSERT(n_embd_v_gqa == n_embd_k_gqa);

                            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
                            layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, hparams.n_embd_k_gqa(i)}, 0);
                            layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, hparams.n_embd_v_gqa(i)}, 0);
                            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
                        } else {
                            layer.shortconv.conv = create_tensor(tn(LLM_TENSOR_SHORTCONV_CONV, "weight", i), {hparams.n_shortconv_l_cache, n_embd}, 0);
                            layer.shortconv.in_proj = create_tensor(tn(LLM_TENSOR_SHORTCONV_INPROJ, "weight", i), {n_embd, 3 * n_embd}, 0);
                            layer.shortconv.out_proj = create_tensor(tn(LLM_TENSOR_SHORTCONV_OUTPROJ, "weight", i), {n_embd, n_embd}, 0);
                        }
                    }
                } break;

            case LLM_ARCH_SMALLTHINKER:
                {
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);

                    // output
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
                    // if output is NULL, init from the input tok embed
                    if (output == NULL) {
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
                    }

                    for (int i = 0; i < n_layer; ++i) {
                        auto & layer = layers[i];

                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);

                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd_head_k * n_head }, 0);
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_gqa }, 0);
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_gqa }, 0);
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, 0);

                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd }, 0);

                        GGML_ASSERT(n_expert > 0 && "n_expert must be > 0 for SMALLTHINKER");
                        GGML_ASSERT(n_expert_used > 0 && "n_expert_used must be > 0 for SMALLTHINKER");

                        // MoE branch
                        const int64_t n_ff_exp = hparams.n_ff_exp;
                        layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), { n_embd, n_expert }, 0);
                        layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, 0);
                        layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff_exp, n_embd, n_expert }, 0);
                        layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, 0);
                    }
                } break;

            default:
                throw std::runtime_error("unknown architecture");
        }

        if (n_moved_tensors > 0) {
            LLAMA_LOG_DEBUG("%s: tensor '%s' (%s) (and %d others) cannot be used with preferred buffer type %s, using %s instead\n",
                    __func__, first_moved_tensor->name, ggml_type_name(first_moved_tensor->type), n_moved_tensors - 1,
                    ggml_backend_buft_name(first_moved_from_buft), ggml_backend_buft_name(first_moved_to_buft));
        }
    }

    ml.done_getting_tensors();

    ml.init_mappings(true, use_mlock ? &pimpl->mlock_mmaps : nullptr);
    pimpl->mappings.reserve(ml.mappings.size());

    // create the backend buffers
    std::vector<std::pair<ggml_context *, llama_buf_map>> ctx_bufs;
    ctx_bufs.reserve(ctx_map.size());
    // Ensure we have enough capacity for the maximum number of backend buffers we may create
    const size_t n_max_backend_buffer = ctx_map.size() * ml.files.size();
    pimpl->bufs.reserve(n_max_backend_buffer);

    for (auto & it : ctx_map) {
        ggml_backend_buffer_type_t buft = it.first;
        ggml_context * ctx = it.second;

        // skip contexts without tensors
        if (ggml_get_first_tensor(ctx) == nullptr) {
            continue;
        }

        llama_buf_map buf_map;
        buf_map.reserve(n_max_backend_buffer);

        // check if it is possible to use buffer_from_host_ptr with this buffer type
        ggml_backend_dev_t dev = ggml_backend_buft_get_device(buft);
        if (!dev) {
            // FIXME: workaround for CPU backend buft having a NULL device
            dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
            if (!dev) {
                throw std::runtime_error(format("%s: no CPU backend found", __func__));
            }
        }

        ggml_backend_dev_props props;
        ggml_backend_dev_get_props(dev, &props);
        bool buffer_from_host_ptr_supported = props.caps.buffer_from_host_ptr;
        bool is_default_buft = buft == ggml_backend_dev_buffer_type(dev);

        if (ml.use_mmap && use_mmap_buffer && buffer_from_host_ptr_supported && is_default_buft) {
            for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
                // only the mmap region containing the tensors in the model is mapped to the backend buffer
                // this is important for metal with apple silicon: if the entire model could be mapped to a metal buffer, then we could just use metal for all layers
                // this allows using partial offloading when the model size exceeds the metal buffer size, but not the RAM size
                void * addr = nullptr;
                size_t first, last; // NOLINT
                ml.get_mapping_range(&first, &last, &addr, idx, ctx);
                if (first >= last) {
                    continue;
                }
                const size_t max_size = ggml_get_max_tensor_size(ctx);
                ggml_backend_buffer_t buf = ggml_backend_dev_buffer_from_host_ptr(dev, (char *) addr + first, last - first, max_size);
                if (buf == nullptr) {
                    throw std::runtime_error(format("unable to allocate %s buffer", ggml_backend_buft_name(buft)));
                }
                pimpl->bufs.emplace_back(buf);
                buf_map.emplace(idx, buf);
            }
        }
        else {
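            // otherwise allocate a dedicated backend buffer for this context and,
            // if it is host memory and mlock is requested, pin it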
            ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
            if (buf == nullptr) {
                throw std::runtime_error(format("unable to allocate %s buffer", ggml_backend_buft_name(buft)));
            }
            pimpl->bufs.emplace_back(buf);
            if (use_mlock && ggml_backend_buffer_is_host(buf)) {
                pimpl->mlock_bufs.emplace_back(new llama_mlock);
                auto & mlock_buf = pimpl->mlock_bufs.back();
                mlock_buf->init (ggml_backend_buffer_get_base(buf));
                mlock_buf->grow_to(ggml_backend_buffer_get_size(buf));
            }
            for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
                buf_map.emplace(idx, buf);
            }
        }

        if (pimpl->bufs.empty()) {
            throw std::runtime_error("failed to allocate buffer");
        }

        for (auto & buf : buf_map) {
            // indicate that this buffer contains weights
            // this is used by ggml_backend_sched to improve op scheduling: ops that use a weight are preferably scheduled to the backend that contains the weight
            ggml_backend_buffer_set_usage(buf.second, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
        }

        ctx_bufs.emplace_back(ctx, buf_map);
    }

    if (llama_supports_gpu_offload()) {
        const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));

        LLAMA_LOG_INFO("%s: offloading %d repeating layers to GPU\n", __func__, n_gpu);
        if (n_gpu_layers > (int) hparams.n_layer) {
            LLAMA_LOG_INFO("%s: offloading output layer to GPU\n", __func__);
        }

        const int max_backend_supported_layers = hparams.n_layer + 1;
        const int max_offloadable_layers = hparams.n_layer + 1;

        LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n", __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers);
    }

    // print memory requirements per buffer type
    for (auto & buf : pimpl->bufs) {
        LLAMA_LOG_INFO("%s: %12s model buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf.get()), ggml_backend_buffer_get_size(buf.get()) / 1024.0 / 1024.0);
    }

    // populate tensors_by_name
    for (auto & ctx : pimpl->ctxs) {
        for (auto * cur = ggml_get_first_tensor(ctx.get()); cur != NULL; cur = ggml_get_next_tensor(ctx.get(), cur)) {
            tensors_by_name.emplace_back(ggml_get_name(cur), cur);
        }
    }

    // load tensor data
    for (auto & it : ctx_bufs) {
        ggml_context * ctx = it.first;
        auto & bufs = it.second;
        if (!ml.load_all_data(ctx, bufs, use_mlock ? &pimpl->mlock_mmaps : NULL, params.progress_callback, params.progress_callback_user_data)) {
            return false;
        }
    }

    if (use_mmap_buffer) {
        for (auto & mapping : ml.mappings) {
            pimpl->mappings.emplace_back(std::move(mapping));
        }
    }

    return true;
}

std::string llama_model::arch_name() const {
    return llm_arch_name(arch);
}

std::string llama_model::type_name() const {
    return llm_type_name(type);
}

std::string llama_model::desc() const {
    return pimpl->desc_str;
}

size_t llama_model::size() const {
    return pimpl->n_bytes;
}

size_t llama_model::n_tensors() const {
    return tensors_by_name.size();
}

size_t llama_model::n_devices() const {
    return devices.size();
}

uint64_t llama_model::n_elements() const {
    return pimpl->n_elements;
}

void llama_model::print_info() const {
    const std::string rope_scaling_type = llama_rope_scaling_type_name(hparams.rope_scaling_type_train);

    auto print_f = [](const std::function<uint32_t(uint32_t)> & f, uint32_t n) {
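        // format a per-layer hparam: print a single value if it is constant
        // across layers, otherwise a bracketed list of per-layer values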
        bool is_var = false;

        std::vector<uint32_t> v;
        for (uint32_t i = 0; i < n; ++i) {
            v.push_back(f(i));
            if (v[i] != v[0]) {
                is_var = true;
            }
        }

        std::stringstream ss;

        if (is_var) {
            ss << "[";
            for (uint32_t i = 0; i < n; ++i) {
                ss << v[i];
                if (i < n - 1) {
                    ss << ", ";
                }
            }
            ss << "]";
        } else {
            ss << v[0];
        }

        return ss.str();
    };

    // hparams
    LLAMA_LOG_INFO("%s: arch = %s\n", __func__, arch_name().c_str());
    LLAMA_LOG_INFO("%s: vocab_only = %d\n", __func__, hparams.vocab_only);

    if (!hparams.vocab_only) {
        LLAMA_LOG_INFO("%s: n_ctx_train = %u\n", __func__, hparams.n_ctx_train);
        LLAMA_LOG_INFO("%s: n_embd = %u\n", __func__, hparams.n_embd);
        LLAMA_LOG_INFO("%s: n_layer = %u\n", __func__, hparams.n_layer);
        LLAMA_LOG_INFO("%s: n_head = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_head(il); }, hparams.n_layer).c_str());
        LLAMA_LOG_INFO("%s: n_head_kv = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_head_kv(il); }, hparams.n_layer).c_str());
        LLAMA_LOG_INFO("%s: n_rot = %u\n", __func__, hparams.n_rot);
        LLAMA_LOG_INFO("%s: n_swa = %u\n", __func__, hparams.n_swa);
        LLAMA_LOG_INFO("%s: is_swa_any = %u\n", __func__, hparams.is_swa_any());
        LLAMA_LOG_INFO("%s: n_embd_head_k = %u\n", __func__, hparams.n_embd_head_k);
        LLAMA_LOG_INFO("%s: n_embd_head_v = %u\n", __func__, hparams.n_embd_head_v);
        LLAMA_LOG_INFO("%s: n_gqa = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_gqa(il); }, hparams.n_layer).c_str());
        LLAMA_LOG_INFO("%s: n_embd_k_gqa = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_embd_k_gqa(il); }, hparams.n_layer).c_str());
        LLAMA_LOG_INFO("%s: n_embd_v_gqa = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_embd_v_gqa(il); }, hparams.n_layer).c_str());
        LLAMA_LOG_INFO("%s: f_norm_eps = %.1e\n", __func__, hparams.f_norm_eps);
        LLAMA_LOG_INFO("%s: f_norm_rms_eps = %.1e\n", __func__, hparams.f_norm_rms_eps);
        LLAMA_LOG_INFO("%s: f_clamp_kqv = %.1e\n", __func__, hparams.f_clamp_kqv);
        LLAMA_LOG_INFO("%s: f_max_alibi_bias = %.1e\n", __func__, hparams.f_max_alibi_bias);
        LLAMA_LOG_INFO("%s: f_logit_scale = %.1e\n", __func__, hparams.f_logit_scale);
        LLAMA_LOG_INFO("%s: f_attn_scale = %.1e\n", __func__, hparams.f_attention_scale);
        LLAMA_LOG_INFO("%s: n_ff = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_ff(il); }, hparams.n_layer).c_str());
        LLAMA_LOG_INFO("%s: n_expert = %u\n", __func__, hparams.n_expert);
        LLAMA_LOG_INFO("%s: n_expert_used = %u\n", __func__, hparams.n_expert_used);
        LLAMA_LOG_INFO("%s: causal attn = %d\n", __func__, hparams.causal_attn);
        LLAMA_LOG_INFO("%s: pooling type = %d\n", __func__, hparams.pooling_type);
        LLAMA_LOG_INFO("%s: rope type = %d\n", __func__, hparams.rope_type);
        LLAMA_LOG_INFO("%s: rope scaling = %s\n", __func__, rope_scaling_type.c_str());
        LLAMA_LOG_INFO("%s: freq_base_train = %.1f\n", __func__, hparams.rope_freq_base_train);
        LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train);
        LLAMA_LOG_INFO("%s: n_ctx_orig_yarn = %u\n", __func__, hparams.n_ctx_orig_yarn);
        LLAMA_LOG_INFO("%s: rope_finetuned = %s\n", __func__, hparams.rope_finetuned ? "yes" : "unknown");

        if (!classifier_labels.empty()) {
            LLAMA_LOG_INFO("%s: n_cls_out = %u\n", __func__, hparams.n_cls_out);

            size_t i = 0;
            for (auto label : classifier_labels) {
                LLAMA_LOG_INFO("%s: cls_label[%2zu] = %s\n", __func__, i++, label.c_str());
            }
        }
    }

    if (arch == LLM_ARCH_MAMBA ||
        arch == LLM_ARCH_MAMBA2 ||
        arch == LLM_ARCH_JAMBA ||
        arch == LLM_ARCH_FALCON_H1 ||
        arch == LLM_ARCH_PLAMO2 ||
        arch == LLM_ARCH_GRANITE_HYBRID ||
        arch == LLM_ARCH_NEMOTRON_H ||
        arch == LLM_ARCH_QWEN3NEXT) {
        LLAMA_LOG_INFO("%s: ssm_d_conv = %u\n", __func__, hparams.ssm_d_conv);
        LLAMA_LOG_INFO("%s: ssm_d_inner = %u\n", __func__, hparams.ssm_d_inner);
        LLAMA_LOG_INFO("%s: ssm_d_state = %u\n", __func__, hparams.ssm_d_state);
        LLAMA_LOG_INFO("%s: ssm_dt_rank = %u\n", __func__, hparams.ssm_dt_rank);
        LLAMA_LOG_INFO("%s: ssm_n_group = %u\n", __func__, hparams.ssm_n_group);
        LLAMA_LOG_INFO("%s: ssm_dt_b_c_rms = %d\n", __func__, hparams.ssm_dt_b_c_rms);
    }

    LLAMA_LOG_INFO("%s: model type = %s\n", __func__, type_name().c_str());
    if (pimpl->n_elements >= 1e12) {
        LLAMA_LOG_INFO("%s: model params = %.2f T\n", __func__, pimpl->n_elements*1e-12);
    } else if (pimpl->n_elements >= 1e9) {
        LLAMA_LOG_INFO("%s: model params = %.2f B\n", __func__, pimpl->n_elements*1e-9);
    } else if (pimpl->n_elements >= 1e6) {
        LLAMA_LOG_INFO("%s: model params = %.2f M\n", __func__, pimpl->n_elements*1e-6);
    } else {
        LLAMA_LOG_INFO("%s: model params = %.2f K\n", __func__, pimpl->n_elements*1e-3);
    }

    // general kv
    LLAMA_LOG_INFO("%s: general.name = %s\n", __func__, name.c_str());

    if (arch == LLM_ARCH_DEEPSEEK) {
        LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
        LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
        LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
        LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
    }

    if (arch == LLM_ARCH_DEEPSEEK2) {
        LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
        LLAMA_LOG_INFO("%s: n_lora_q = %d\n", __func__, hparams.n_lora_q);
        LLAMA_LOG_INFO("%s: n_lora_kv = %d\n", __func__, hparams.n_lora_kv);
        LLAMA_LOG_INFO("%s: n_embd_head_k_mla = %d\n", __func__, hparams.n_embd_head_k_mla);
        LLAMA_LOG_INFO("%s: n_embd_head_v_mla = %d\n", __func__, hparams.n_embd_head_v_mla);
        LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
        LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
        LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
        LLAMA_LOG_INFO("%s: expert_weights_norm = %d\n", __func__, hparams.expert_weights_norm);
        LLAMA_LOG_INFO("%s: expert_gating_func = %s\n", __func__, llama_expert_gating_func_name((llama_expert_gating_func_type) hparams.expert_gating_func));
        LLAMA_LOG_INFO("%s: rope_yarn_log_mul = %.4f\n", __func__, hparams.rope_yarn_log_mul);
    }

    if (arch == LLM_ARCH_QWEN2MOE) {
        LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
        LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
    }

    if (arch == LLM_ARCH_QWEN3MOE || arch == LLM_ARCH_OPENAI_MOE) {
        LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
    }

    if (arch == LLM_ARCH_MINICPM ||
        arch == LLM_ARCH_GRANITE ||
        arch == LLM_ARCH_GRANITE_MOE ||
        arch == LLM_ARCH_GRANITE_HYBRID) {
        LLAMA_LOG_INFO("%s: f_embedding_scale = %f\n", __func__, hparams.f_embedding_scale);
        LLAMA_LOG_INFO("%s: f_residual_scale = %f\n", __func__, hparams.f_residual_scale);
        LLAMA_LOG_INFO("%s: f_attention_scale = %f\n", __func__, hparams.f_attention_scale);
        LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
    }

    if (arch == LLM_ARCH_BAILINGMOE) {
        LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
        LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
        LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
        LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
        LLAMA_LOG_INFO("%s: expert_weights_norm = %d\n", __func__, hparams.expert_weights_norm);
    }

    if (arch == LLM_ARCH_SMALLTHINKER) {
        LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
        LLAMA_LOG_INFO("%s: expert_gating_func = %s\n", __func__, llama_expert_gating_func_name((llama_expert_gating_func_type) hparams.expert_gating_func));
    }

    vocab.print_info();
}

ggml_backend_dev_t llama_model::dev_layer(int il) const {
    return pimpl->dev_layer.at(il).dev;
}

ggml_backend_dev_t llama_model::dev_output() const {
    return pimpl->dev_output.dev;
}
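
// checks whether the device behind `buft` supports the op built by `fn`:
// the op is constructed in a throw-away no-alloc context and its sources are
// pointed at a zero-sized buffer of the candidate type before querying the device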
template<typename F>
static bool buft_supported(ggml_backend_buffer_type_t buft, ggml_backend_dev_t dev, F & fn) {
    ggml_init_params params = {
        /*.mem_size =*/ ggml_tensor_overhead()*8,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc =*/ true,
    };

    ggml_context_ptr ctx { ggml_init(params) };
    if (!ctx) {
        throw std::runtime_error(format("failed to create ggml context"));
    }

    ggml_backend_buffer_ptr buf { ggml_backend_buft_alloc_buffer(buft, 0) };
    ggml_tensor * op_tensor = fn(ctx.get());
    for (int i = 0; i < GGML_MAX_SRC; i++) {
        if (op_tensor->src[i] != nullptr) {
            assert(op_tensor->src[i]->buffer == nullptr);
            op_tensor->src[i]->buffer = buf.get();
        }
    }

    bool op_supported = ggml_backend_dev_supports_op(dev, op_tensor);

    return op_supported;
}

template<typename F>
static ggml_backend_buffer_type_t select_buft(const buft_list_t & buft_list, const F & fn) {
    for (const auto & cur : buft_list) {
        ggml_backend_dev_t cur_dev = cur.first;
        ggml_backend_buffer_type_t cur_buft = cur.second;
        if (buft_supported(cur_buft, cur_dev, fn)) {
            return cur_buft;
        }
    }

    throw std::runtime_error(format("no suitable buffer type found"));
}

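// selects the buffer type for layer `il` by probing the layer's device/buffer-type list
// with a representative op (an F32 add of two n_embd-sized tensors)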
ggml_backend_buffer_type_t llama_model::select_buft(int il) const {
    return ::select_buft(
            *pimpl->dev_layer.at(il).buft_list,
            [&](ggml_context * ctx) {
                ggml_tensor * cur = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hparams.n_embd);
                ggml_tensor * layer_dir = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hparams.n_embd);
                return ggml_add(ctx, cur, layer_dir);
            });
}

bool llama_model::has_tensor_overrides() const {
    return pimpl->has_tensor_overrides;
}

const ggml_tensor * llama_model::get_tensor(const char * name) const {
    auto it = std::find_if(tensors_by_name.begin(), tensors_by_name.end(),
            [name](const std::pair<std::string, ggml_tensor *> & it) {
                return it.first == name;
            });
    if (it == tensors_by_name.end()) {
        return nullptr;
    }

    return it->second;
}

float llama_model::get_rope_freq_base (const llama_cparams & cparams, int il) const {
    return hparams.is_swa(il) ? hparams.rope_freq_base_train_swa : cparams.rope_freq_base;
}

float llama_model::get_rope_freq_scale(const llama_cparams & cparams, int il) const {
    return hparams.is_swa(il) ? hparams.rope_freq_scale_train_swa : cparams.rope_freq_scale;
}

ggml_tensor * llama_model::get_rope_factors(const llama_cparams & cparams, int il) const {
    const uint32_t n_ctx_per_seq = cparams.n_ctx / cparams.n_seq_max;

    // choose long/short freq factors based on the context size
    if (layers[il].rope_freqs != nullptr) {
        return layers[il].rope_freqs;
    }

    if (n_ctx_per_seq > hparams.n_ctx_orig_yarn) {
        return layers[il].rope_long;
    }

    return layers[il].rope_short;
}

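// note: memory module selection mirrors the branches below:
//  - archs listed explicitly in the switch (encoder-only / diffusion) get no memory (res = nullptr)
//  - recurrent archs -> llama_memory_recurrent
//  - hybrid archs    -> llama_memory_hybrid, with per-arch attention/recurrent layer filters
//  - everything else -> llama_kv_cache_iswa when SWA is used, plain llama_kv_cache otherwise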
llama_memory_i * llama_model::create_memory(const llama_memory_params & params, llama_cparams & cparams) const {
    llama_memory_i * res;

    switch (arch) {
        // Models that need specific instantiation should be handled in the
        // switch statement
        case LLM_ARCH_BERT:
        case LLM_ARCH_JINA_BERT_V2:
        case LLM_ARCH_JINA_BERT_V3:
        case LLM_ARCH_NOMIC_BERT:
        case LLM_ARCH_NOMIC_BERT_MOE:
        case LLM_ARCH_NEO_BERT:
        case LLM_ARCH_WAVTOKENIZER_DEC:
        //case LLM_ARCH_GEMMA_EMBEDDING: // TODO: disabled until the cacheless SWA logic is fixed [TAG_NO_CACHE_ISWA]
        case LLM_ARCH_DREAM:
        case LLM_ARCH_LLADA:
        case LLM_ARCH_LLADA_MOE:
            {
                res = nullptr;
            } break;
        // Models that need standard caching should rely on recurrent/hybrid
        // checks
        default:
            {
                if (llm_arch_is_recurrent(arch)) {
                    res = new llama_memory_recurrent(
                            *this,
                            GGML_TYPE_F32,
                            GGML_TYPE_F32,
                            cparams.offload_kqv,
                            std::max((uint32_t) 1, cparams.n_seq_max),
                            cparams.n_seq_max,
                            nullptr);
                } else if (llm_arch_is_hybrid(arch)) {
                    // The main difference between hybrid architectures is the
                    // layer filters, so pick the right one here
                    llama_memory_hybrid::layer_filter_cb filter_attn = nullptr;
                    llama_memory_hybrid::layer_filter_cb filter_recr = nullptr;

                    if (arch == LLM_ARCH_FALCON_H1) {
                        filter_attn = [&](int32_t) { return true; };
                        filter_recr = [&](int32_t) { return true; };
                    } else if (arch == LLM_ARCH_NEMOTRON_H) {
                        filter_attn = [&](int32_t il) {
                            return !hparams.is_recurrent(il) && hparams.n_ff(il) == 0;
                        };
                        filter_recr = [&](int32_t il) {
                            return hparams.is_recurrent(il) && hparams.n_ff(il) == 0;
                        };
                    }

                    const auto padding = llama_kv_cache::get_padding(cparams);

                    cparams.n_ctx = GGML_PAD(cparams.n_ctx, padding);

                    res = new llama_memory_hybrid(
                            /* model             */ *this,
                            /* attn_type_k       */ params.type_k,
                            /* attn_type_v       */ params.type_v,
                            /* attn_v_trans      */ !cparams.flash_attn,
                            /* attn_kv_size      */ cparams.n_ctx,
                            /* attn_n_pad        */ padding,
                            /* attn_n_swa        */ hparams.n_swa,
                            /* attn_swa_type     */ hparams.swa_type,
                            /* recurrent_type_k  */ GGML_TYPE_F32,
                            /* recurrent_type_v  */ GGML_TYPE_F32,
                            /* recurrent_kv_size */ std::max((uint32_t) 1, cparams.n_seq_max),
                            /* n_seq_max         */ cparams.n_seq_max,
                            /* offload           */ cparams.offload_kqv,
                            /* unified           */ cparams.kv_unified,
                            /* filter_attn       */ std::move(filter_attn),
                            /* filter_recr       */ std::move(filter_recr));
                } else {
                    const auto padding = llama_kv_cache::get_padding(cparams);

                    uint32_t n_ctx_per_stream = cparams.n_ctx;

                    if (!cparams.kv_unified) {
                        n_ctx_per_stream = (cparams.n_ctx + cparams.n_seq_max - 1)/cparams.n_seq_max;
                        n_ctx_per_stream = GGML_PAD(n_ctx_per_stream, padding);

                        cparams.n_ctx = n_ctx_per_stream*cparams.n_seq_max;
                    } else {
                        n_ctx_per_stream = GGML_PAD(n_ctx_per_stream, padding);

                        cparams.n_ctx = n_ctx_per_stream;
                    }

                    LLAMA_LOG_DEBUG("%s: n_ctx = %u (padded)\n", __func__, cparams.n_ctx);

                    llama_memory_i::layer_reuse_cb reuse = nullptr;

                    if (arch == LLM_ARCH_GEMMA3N) {
                        reuse = [&](int32_t il) {
                            if (il >= (int32_t) hparams.n_layer_kv_from_start) {
                                return (int32_t) hparams.n_layer_kv_from_start - (hparams.is_swa(il) ? 2 : 1);
                            }

                            return -1;
                        };
                    }

                    if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
                        GGML_ASSERT(hparams.is_swa_any());

                        res = new llama_kv_cache_iswa(
                                *this,
                                params.type_k,
                                params.type_v,
                                !cparams.flash_attn,
                                cparams.offload_kqv,
                                params.swa_full,
                                cparams.kv_unified,
                                n_ctx_per_stream,
                                cparams.n_seq_max,
                                cparams.n_ubatch,
                                padding,
                                nullptr,
                                reuse);
                    } else {
                        GGML_ASSERT(!hparams.is_swa_any());

                        res = new llama_kv_cache(
                                *this,
                                params.type_k,
                                params.type_v,
                                !cparams.flash_attn,
                                cparams.offload_kqv,
                                cparams.kv_unified,
                                n_ctx_per_stream,
                                cparams.n_seq_max,
                                padding,
                                hparams.n_swa,
                                hparams.swa_type,
                                nullptr,
                                nullptr);
                    }
                }
            }
    }

    return res;
}

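// dispatches on the architecture to construct the matching llm_build_* graph context,
// then appends the optional pooling/classification head and returns the final graph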
ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
    std::unique_ptr<llm_graph_context> llm;

    switch (arch) {
        case LLM_ARCH_LLAMA:
            {
                llm = std::make_unique<llm_build_llama>(*this, params);
            } break;
        case LLM_ARCH_LLAMA4:
            {
                if (hparams.swa_type == LLAMA_SWA_TYPE_NONE) {
                    llm = std::make_unique<llm_build_llama>(*this, params);
                } else {
                    llm = std::make_unique<llm_build_llama_iswa>(*this, params);
                }
            } break;
        case LLM_ARCH_DECI:
            {
                llm = std::make_unique<llm_build_deci>(*this, params);
            } break;
        case LLM_ARCH_BAICHUAN:
            {
                llm = std::make_unique<llm_build_baichuan>(*this, params);
            } break;
        case LLM_ARCH_FALCON:
            {
                llm = std::make_unique<llm_build_falcon>(*this, params);
            } break;
        case LLM_ARCH_GROK:
            {
                llm = std::make_unique<llm_build_grok>(*this, params);
            } break;
        case LLM_ARCH_STARCODER:
            {
                llm = std::make_unique<llm_build_starcoder>(*this, params);
            } break;
        case LLM_ARCH_REFACT:
            {
                llm = std::make_unique<llm_build_refact>(*this, params);
            } break;
        case LLM_ARCH_BERT:
        case LLM_ARCH_JINA_BERT_V2:
        case LLM_ARCH_JINA_BERT_V3:
        case LLM_ARCH_NOMIC_BERT:
        case LLM_ARCH_NOMIC_BERT_MOE:
            {
                llm = std::make_unique<llm_build_bert>(*this, params);
            } break;
        case LLM_ARCH_NEO_BERT:
            {
                llm = std::make_unique<llm_build_neo_bert>(*this, params);
            } break;
        case LLM_ARCH_BLOOM:
            {
                llm = std::make_unique<llm_build_bloom>(*this, params);
            } break;
        case LLM_ARCH_MPT:
            {
                llm = std::make_unique<llm_build_mpt>(*this, params);
            } break;
        case LLM_ARCH_STABLELM:
            {
                llm = std::make_unique<llm_build_stablelm>(*this, params);
            } break;
        case LLM_ARCH_QWEN:
            {
                llm = std::make_unique<llm_build_qwen>(*this, params);
            } break;
        case LLM_ARCH_QWEN2:
            {
                llm = std::make_unique<llm_build_qwen2>(*this, params);
            } break;
        case LLM_ARCH_DREAM:
            {
                llm = std::make_unique<llm_build_dream>(*this, params);
            }
            break;
        case LLM_ARCH_LLADA:
            {
                llm = std::make_unique<llm_build_llada>(*this, params);
            }
            break;
        case LLM_ARCH_LLADA_MOE:
            {
                llm = std::make_unique<llm_build_llada_moe>(*this, params);
            }
            break;
        case LLM_ARCH_QWEN2VL:
            {
                llm = std::make_unique<llm_build_qwen2vl>(*this, params);
            } break;
        case LLM_ARCH_QWEN2MOE:
            {
                llm = std::make_unique<llm_build_qwen2moe>(*this, params);
            } break;
        case LLM_ARCH_QWEN3:
            {
                llm = std::make_unique<llm_build_qwen3>(*this, params);
            } break;
        case LLM_ARCH_QWEN3MOE:
            {
                llm = std::make_unique<llm_build_qwen3moe>(*this, params);
            } break;
        case LLM_ARCH_PHI2:
            {
                llm = std::make_unique<llm_build_phi2>(*this, params);
            } break;
        case LLM_ARCH_PHI3:
        case LLM_ARCH_PHIMOE:
            {
                if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
                    llm = std::make_unique<llm_build_phi3<true>> (*this, params);
                } else {
                    llm = std::make_unique<llm_build_phi3<false>>(*this, params);
                }
            } break;
        case LLM_ARCH_PLAMO:
            {
                llm = std::make_unique<llm_build_plamo>(*this, params);
            } break;
        case LLM_ARCH_PLAMO2:
            {
                llm = std::make_unique<llm_build_plamo2>(*this, params);
            } break;
        case LLM_ARCH_GPT2:
            {
                llm = std::make_unique<llm_build_gpt2>(*this, params);
            } break;
        case LLM_ARCH_CODESHELL:
            {
                llm = std::make_unique<llm_build_codeshell>(*this, params);
            } break;
        case LLM_ARCH_ORION:
            {
                llm = std::make_unique<llm_build_orion>(*this, params);
            } break;
        case LLM_ARCH_INTERNLM2:
            {
                llm = std::make_unique<llm_build_internlm2>(*this, params);
            } break;
        case LLM_ARCH_MINICPM3:
            {
                llm = std::make_unique<llm_build_minicpm3>(*this, params);
            } break;
        case LLM_ARCH_GEMMA:
            {
                llm = std::make_unique<llm_build_gemma>(*this, params);
            } break;
        case LLM_ARCH_GEMMA2:
            {
                llm = std::make_unique<llm_build_gemma2_iswa>(*this, params);
            } break;
        case LLM_ARCH_GEMMA3:
            {
                llm = std::make_unique<llm_build_gemma3_iswa>(*this, params);
            } break;
        case LLM_ARCH_GEMMA3N:
            {
                llm = std::make_unique<llm_build_gemma3n_iswa>(*this, params);
            } break;
        case LLM_ARCH_GEMMA_EMBEDDING:
            {
                llm = std::make_unique<llm_build_gemma_embedding_iswa>(*this, params);
            } break;
        case LLM_ARCH_STARCODER2:
            {
                llm = std::make_unique<llm_build_starcoder2>(*this, params);
            } break;
        case LLM_ARCH_MAMBA:
        case LLM_ARCH_MAMBA2:
            {
                llm = std::make_unique<llm_build_mamba>(*this, params);
            } break;
        case LLM_ARCH_JAMBA:
            {
                llm = std::make_unique<llm_build_jamba>(*this, params);
            } break;
        case LLM_ARCH_XVERSE:
            {
                llm = std::make_unique<llm_build_xverse>(*this, params);
            } break;
        case LLM_ARCH_COMMAND_R:
            {
                llm = std::make_unique<llm_build_command_r>(*this, params);
            } break;
        case LLM_ARCH_COHERE2:
            {
                llm = std::make_unique<llm_build_cohere2_iswa>(*this, params);
            } break;
        case LLM_ARCH_DBRX:
            {
                llm = std::make_unique<llm_build_dbrx>(*this, params);
            } break;
        case LLM_ARCH_OLMO:
            {
                llm = std::make_unique<llm_build_olmo>(*this, params);
            } break;
        case LLM_ARCH_OLMO2:
            {
                if (hparams.swa_type == LLAMA_SWA_TYPE_STANDARD) {
                    llm = std::make_unique<llm_build_olmo2<true>>(*this, params);
                } else {
                    llm = std::make_unique<llm_build_olmo2<false>>(*this, params);
                }
            } break;
        case LLM_ARCH_OLMOE:
            {
                llm = std::make_unique<llm_build_olmoe>(*this, params);
            } break;
        case LLM_ARCH_OPENELM:
            {
                llm = std::make_unique<llm_build_openelm>(*this, params);
            } break;
        case LLM_ARCH_GPTNEOX:
            {
                llm = std::make_unique<llm_build_gptneox>(*this, params);
            } break;
        case LLM_ARCH_ARCTIC:
            {
                llm = std::make_unique<llm_build_arctic>(*this, params);
            } break;
        case LLM_ARCH_DEEPSEEK:
            {
                llm = std::make_unique<llm_build_deepseek>(*this, params);
            } break;
        case LLM_ARCH_DEEPSEEK2:
            {
                llm = std::make_unique<llm_build_deepseek2>(*this, params);
            } break;
        case LLM_ARCH_CHATGLM:
            {
                llm = std::make_unique<llm_build_chatglm>(*this, params);
            } break;
        case LLM_ARCH_GLM4:
            {
                llm = std::make_unique<llm_build_glm4>(*this, params);
            } break;
        case LLM_ARCH_GLM4_MOE:
            {
                llm = std::make_unique<llm_build_glm4_moe>(*this, params);
            } break;
        case LLM_ARCH_BITNET:
            {
                llm = std::make_unique<llm_build_bitnet>(*this, params);
            } break;
        case LLM_ARCH_T5:
            {
                switch (params.gtype) {
                    case LLM_GRAPH_TYPE_ENCODER:
                        llm = std::make_unique<llm_build_t5_enc>(*this, params);
                        break;
                    case LLM_GRAPH_TYPE_DEFAULT:
                    case LLM_GRAPH_TYPE_DECODER:
                        llm = std::make_unique<llm_build_t5_dec>(*this, params);
                        break;
                    default:
                        GGML_ABORT("invalid graph type");
                };
            } break;
        case LLM_ARCH_T5ENCODER:
            {
                llm = std::make_unique<llm_build_t5_enc>(*this, params);
            }
            break;
        case LLM_ARCH_JAIS:
            {
                llm = std::make_unique<llm_build_jais>(*this, params);
            } break;
        case LLM_ARCH_NEMOTRON:
            {
                llm = std::make_unique<llm_build_nemotron>(*this, params);
            } break;
        case LLM_ARCH_NEMOTRON_H:
            {
                llm = std::make_unique<llm_build_nemotron_h>(*this, params);
            } break;
        case LLM_ARCH_EXAONE:
            {
                llm = std::make_unique<llm_build_exaone>(*this, params);
            } break;
        case LLM_ARCH_EXAONE4:
            {
                if (hparams.swa_type == LLAMA_SWA_TYPE_STANDARD) {
                    llm = std::make_unique<llm_build_exaone4<true>>(*this, params);
                } else {
                    llm = std::make_unique<llm_build_exaone4<false>>(*this, params);
                }
            } break;
        case LLM_ARCH_RWKV6:
            {
                llm = std::make_unique<llm_build_rwkv6>(*this, params);
            } break;
        case LLM_ARCH_RWKV6QWEN2:
            {
                llm = std::make_unique<llm_build_rwkv6qwen2>(*this, params);
            } break;
        case LLM_ARCH_RWKV7:
            {
                llm = std::make_unique<llm_build_rwkv7>(*this, params);
            } break;
        case LLM_ARCH_ARWKV7:
            {
                llm = std::make_unique<llm_build_arwkv7>(*this, params);
            } break;
        case LLM_ARCH_GRANITE:
        case LLM_ARCH_GRANITE_MOE:
        case LLM_ARCH_MINICPM:
            {
                llm = std::make_unique<llm_build_granite>(*this, params);
            } break;
        case LLM_ARCH_GRANITE_HYBRID:
            {
                llm = std::make_unique<llm_build_granite_hybrid>(*this, params);
            } break;
        case LLM_ARCH_CHAMELEON:
            {
                llm = std::make_unique<llm_build_chameleon>(*this, params);
            } break;
        case LLM_ARCH_WAVTOKENIZER_DEC:
            {
                llm = std::make_unique<llm_build_wavtokenizer_dec>(*this, params);
            } break;
        case LLM_ARCH_PLM:
            {
                llm = std::make_unique<llm_build_plm>(*this, params);
            } break;
        case LLM_ARCH_BAILINGMOE:
            {
                llm = std::make_unique<llm_build_bailingmoe>(*this, params);
            } break;
        case LLM_ARCH_SEED_OSS:
            {
                llm = std::make_unique<llm_build_seed_oss>(*this, params);
            } break;
        case LLM_ARCH_DOTS1:
            {
                llm = std::make_unique<llm_build_dots1>(*this, params);
            } break;
        case LLM_ARCH_ARCEE:
            {
                llm = std::make_unique<llm_build_arcee>(*this, params);
            } break;
        case LLM_ARCH_ERNIE4_5:
            {
                llm = std::make_unique<llm_build_ernie4_5>(*this, params);
            } break;
        case LLM_ARCH_ERNIE4_5_MOE:
            {
                llm = std::make_unique<llm_build_ernie4_5_moe>(*this, params);
            } break;
        case LLM_ARCH_HUNYUAN_MOE:
            {
                llm = std::make_unique<llm_build_hunyuan_moe>(*this, params);
            } break;
        case LLM_ARCH_HUNYUAN_DENSE:
            {
                llm = std::make_unique<llm_build_hunyuan_dense>(*this, params);
            } break;
        case LLM_ARCH_SMOLLM3:
            {
                llm = std::make_unique<llm_build_smollm3>(*this, params);
            } break;
        case LLM_ARCH_OPENAI_MOE:
            {
                llm = std::make_unique<llm_build_openai_moe_iswa>(*this, params);
            } break;
        case LLM_ARCH_FALCON_H1:
            {
                llm = std::make_unique<llm_build_falcon_h1>(*this, params);
            } break;
        case LLM_ARCH_LFM2:
            {
                llm = std::make_unique<llm_build_lfm2>(*this, params);
            } break;
        case LLM_ARCH_SMALLTHINKER:
            {
                if (hparams.swa_type == LLAMA_SWA_TYPE_STANDARD) {
                    llm = std::make_unique<llm_build_smallthinker<true>> (*this, params);
                } else {
                    llm = std::make_unique<llm_build_smallthinker<false>>(*this, params);
                }
            } break;
        case LLM_ARCH_QWEN3NEXT:
            {
                llm = std::make_unique<llm_build_qwen3next>(*this, params);
            } break;
        default:
            GGML_ABORT("fatal error");
    }

    // add on pooling layer
    llm->build_pooling(cls, cls_b, cls_out, cls_out_b);

    return llm->res->get_gf();
}

//
// interface implementation
//

llama_model_params llama_model_default_params() {
    llama_model_params result = {
        /*.devices =*/ nullptr,
        /*.tensor_buft_overrides =*/ nullptr,
        /*.n_gpu_layers =*/ 999,
        /*.split_mode =*/ LLAMA_SPLIT_MODE_LAYER,
        /*.main_gpu =*/ 0,
        /*.tensor_split =*/ nullptr,
        /*.progress_callback =*/ nullptr,
        /*.progress_callback_user_data =*/ nullptr,
        /*.kv_overrides =*/ nullptr,
        /*.vocab_only =*/ false,
        /*.use_mmap =*/ true,
        /*.use_mlock =*/ false,
        /*.check_tensors =*/ false,
        /*.use_extra_bufts =*/ true,
    };

    return result;
}

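// usage sketch for llama_model_default_params() (illustrative only - assumes the public
// loader entry point from llama.h, e.g. llama_model_load_from_file):
//
//   llama_model_params mparams = llama_model_default_params();
//   mparams.n_gpu_layers = 32; // hypothetical override
//
//   llama_model * model = llama_model_load_from_file("model.gguf", mparams);
//   ...
//   llama_model_free(model);
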
const llama_vocab * llama_model_get_vocab(const llama_model * model) {
    return &model->vocab;
}

void llama_free_model(llama_model * model) {
    llama_model_free(model);
}

void llama_model_free(llama_model * model) {
    delete model;
}

int32_t llama_model_n_ctx_train(const llama_model * model) {
    return model->hparams.n_ctx_train;
}

int32_t llama_model_n_embd(const llama_model * model) {
    return model->hparams.n_embd;
}

int32_t llama_model_n_layer(const llama_model * model) {
    return model->hparams.n_layer;
}

int32_t llama_model_n_head(const llama_model * model) {
    return model->hparams.n_head();
}

int32_t llama_model_n_head_kv(const llama_model * model) {
    return model->hparams.n_head_kv();
}

int32_t llama_model_n_swa(const llama_model * model) {
    return model->hparams.n_swa;
}

uint32_t llama_model_n_cls_out(const struct llama_model * model) {
    return model->hparams.n_cls_out;
}

const char * llama_model_cls_label(const struct llama_model * model, uint32_t i) {
    if (i < model->classifier_labels.size()) {
        return model->classifier_labels[i].c_str();
    }

    return nullptr;
}

// deprecated
int32_t llama_n_ctx_train(const llama_model * model) {
    return llama_model_n_ctx_train(model);
}

// deprecated
int32_t llama_n_embd(const llama_model * model) {
    return llama_model_n_embd(model);
}

// deprecated
int32_t llama_n_layer(const llama_model * model) {
    return llama_model_n_layer(model);
}

// deprecated
int32_t llama_n_head(const llama_model * model) {
    return llama_model_n_head(model);
}

llama_rope_type llama_model_rope_type(const llama_model * model) {
    switch (model->arch) {
        // these models do not use RoPE
        case LLM_ARCH_GPT2:
        case LLM_ARCH_GPTJ:
        case LLM_ARCH_MPT:
        case LLM_ARCH_REFACT:
        case LLM_ARCH_BLOOM:
        case LLM_ARCH_MAMBA:
        case LLM_ARCH_MAMBA2:
        case LLM_ARCH_JAMBA:
        case LLM_ARCH_JINA_BERT_V2:
        case LLM_ARCH_T5:
        case LLM_ARCH_T5ENCODER:
        case LLM_ARCH_JAIS:
        case LLM_ARCH_RWKV6:
        case LLM_ARCH_RWKV6QWEN2:
        case LLM_ARCH_RWKV7:
        case LLM_ARCH_ARWKV7:
        case LLM_ARCH_WAVTOKENIZER_DEC:
        case LLM_ARCH_NEMOTRON_H:
            return LLAMA_ROPE_TYPE_NONE;

        // use what we call a normal RoPE, operating on pairs of consecutive head values
        case LLM_ARCH_LLAMA:
        case LLM_ARCH_LLADA:
        case LLM_ARCH_LLAMA4:
        case LLM_ARCH_DECI:
        case LLM_ARCH_BAICHUAN:
        case LLM_ARCH_STARCODER:
        case LLM_ARCH_INTERNLM2:
        case LLM_ARCH_MINICPM:
        case LLM_ARCH_XVERSE:
        case LLM_ARCH_COMMAND_R:
        case LLM_ARCH_COHERE2:
        case LLM_ARCH_OLMO:
        case LLM_ARCH_ARCTIC:
        case LLM_ARCH_DEEPSEEK:
        case LLM_ARCH_DEEPSEEK2:
        case LLM_ARCH_PLM:
        case LLM_ARCH_CHATGLM:
        case LLM_ARCH_GLM4:
        case LLM_ARCH_GRANITE:
        case LLM_ARCH_GRANITE_MOE:
        case LLM_ARCH_GRANITE_HYBRID:
        case LLM_ARCH_CHAMELEON:
        case LLM_ARCH_BAILINGMOE:
        case LLM_ARCH_NEO_BERT:
        case LLM_ARCH_SMOLLM3:
        case LLM_ARCH_ARCEE:
        case LLM_ARCH_ERNIE4_5:
        case LLM_ARCH_ERNIE4_5_MOE:
        case LLM_ARCH_QWEN3NEXT:
            return LLAMA_ROPE_TYPE_NORM;

        // the pairs of head values are offset by n_rot/2
        case LLM_ARCH_FALCON:
        case LLM_ARCH_FALCON_H1:
        case LLM_ARCH_GROK:
        case LLM_ARCH_DBRX:
        case LLM_ARCH_BERT:
        case LLM_ARCH_JINA_BERT_V3:
        case LLM_ARCH_NOMIC_BERT:
        case LLM_ARCH_NOMIC_BERT_MOE:
        case LLM_ARCH_STABLELM:
        case LLM_ARCH_BITNET:
        case LLM_ARCH_QWEN:
        case LLM_ARCH_QWEN2:
        case LLM_ARCH_DREAM:
        case LLM_ARCH_QWEN2MOE:
        case LLM_ARCH_QWEN3:
        case LLM_ARCH_QWEN3MOE:
        case LLM_ARCH_LLADA_MOE:
        case LLM_ARCH_OLMO2:
        case LLM_ARCH_OLMOE:
        case LLM_ARCH_PHI2:
        case LLM_ARCH_PHI3:
        case LLM_ARCH_PHIMOE:
        case LLM_ARCH_PLAMO:
        case LLM_ARCH_PLAMO2:
        case LLM_ARCH_GEMMA:
        case LLM_ARCH_GEMMA2:
        case LLM_ARCH_GEMMA3:
        case LLM_ARCH_GEMMA3N:
        case LLM_ARCH_GEMMA_EMBEDDING:
        case LLM_ARCH_STARCODER2:
        case LLM_ARCH_OPENELM:
        case LLM_ARCH_GPTNEOX:
        case LLM_ARCH_CODESHELL:
        case LLM_ARCH_ORION:
        case LLM_ARCH_NEMOTRON:
        case LLM_ARCH_EXAONE:
        case LLM_ARCH_EXAONE4:
        case LLM_ARCH_MINICPM3:
        case LLM_ARCH_DOTS1:
        case LLM_ARCH_HUNYUAN_MOE:
        case LLM_ARCH_OPENAI_MOE:
        case LLM_ARCH_HUNYUAN_DENSE:
        case LLM_ARCH_LFM2:
        case LLM_ARCH_SMALLTHINKER:
        case LLM_ARCH_GLM4_MOE:
        case LLM_ARCH_SEED_OSS:
            return LLAMA_ROPE_TYPE_NEOX;

        case LLM_ARCH_QWEN2VL:
            return LLAMA_ROPE_TYPE_MROPE;

        // all model arches should be listed explicitly here
        case LLM_ARCH_UNKNOWN:
            GGML_ABORT("unknown architecture");
    }

    return LLAMA_ROPE_TYPE_NONE;
}

float llama_model_rope_freq_scale_train(const llama_model * model) {
    return model->hparams.rope_freq_scale_train;
}

int32_t llama_model_meta_val_str(const llama_model * model, const char * key, char * buf, size_t buf_size) {
    const auto & it = model->gguf_kv.find(key);
    if (it == model->gguf_kv.end()) {
        if (buf_size > 0) {
            buf[0] = '\0';
        }
        return -1;
    }

    return snprintf(buf, buf_size, "%s", it->second.c_str());
}

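// usage sketch for llama_model_meta_val_str() (illustrative only): the value is written
// snprintf-style, so the return value can be used to grow the buffer and retry
//
//   std::vector<char> buf(64);
//   const int32_t n = llama_model_meta_val_str(model, "general.name", buf.data(), buf.size());
//   if (n >= (int32_t) buf.size()) {
//       buf.resize(n + 1);
//       llama_model_meta_val_str(model, "general.name", buf.data(), buf.size());
//   }
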
int32_t llama_model_meta_count(const llama_model * model) {
    return (int)model->gguf_kv.size();
}

int32_t llama_model_meta_key_by_index(const llama_model * model, int i, char * buf, size_t buf_size) {
    if (i < 0 || i >= (int)model->gguf_kv.size()) {
        if (buf_size > 0) {
            buf[0] = '\0';
        }
        return -1;
    }

    auto it = model->gguf_kv.begin();
    std::advance(it, i);

    return snprintf(buf, buf_size, "%s", it->first.c_str());
}

int32_t llama_model_meta_val_str_by_index(const llama_model * model, int32_t i, char * buf, size_t buf_size) {
    if (i < 0 || i >= (int)model->gguf_kv.size()) {
        if (buf_size > 0) {
            buf[0] = '\0';
        }
        return -1;
    }

    auto it = model->gguf_kv.begin();
    std::advance(it, i);

    return snprintf(buf, buf_size, "%s", it->second.c_str());
}

int32_t llama_model_desc(const llama_model * model, char * buf, size_t buf_size) {
    return snprintf(buf, buf_size, "%s", model->desc().c_str());
}

uint64_t llama_model_size(const llama_model * model) {
    return model->size();
}

const char * llama_model_chat_template(const llama_model * model, const char * name) {
    const auto key = name ? LLM_KV(model->arch, name)(LLM_KV_TOKENIZER_CHAT_TEMPLATE)
                          : LLM_KV(model->arch)(LLM_KV_TOKENIZER_CHAT_TEMPLATE);
    const auto & it = model->gguf_kv.find(key);
    if (it == model->gguf_kv.end()) {
        // one-off fix for very popular models (so we are not flooded with issues)
        // do not extend this list unless absolutely necessary
        // Mistral-Small-2503 does not have built-in chat template
        llama_vocab_pre_type pre_type = model->vocab.get_pre_type();
        if (!name && pre_type == LLAMA_VOCAB_PRE_TYPE_TEKKEN && model->layers.size() == 40) {
            return "mistral-v7-tekken";
        }

        return nullptr;
    }

    return it->second.c_str();
}

uint64_t llama_model_n_params(const llama_model * model) {
    return model->n_elements();
}

bool llama_model_has_encoder(const llama_model * model) {
    switch (model->arch) {
        case LLM_ARCH_T5:        return true;
        case LLM_ARCH_T5ENCODER: return true;
        default:                 return false;
    }
}

bool llama_model_has_decoder(const llama_model * model) {
    switch (model->arch) {
        case LLM_ARCH_T5ENCODER: return false;
        default:                 return true;
    }
}

llama_token llama_model_decoder_start_token(const llama_model * model) {
    return model->hparams.dec_start_token_id;
}

bool llama_model_is_recurrent(const llama_model * model) {
    return llm_arch_is_recurrent(model->arch);
}

bool llama_model_is_diffusion(const llama_model * model) {
    return llm_arch_is_diffusion(model->arch);
}

const std::vector<std::pair<std::string, ggml_tensor *>> & llama_internal_get_tensor_map(const llama_model * model) {
    return model->tensors_by_name;
}