llama-model.cpp

#include "llama-model.h"

#include "llama-impl.h"
#include "llama-mmap.h"
#include "llama-cparams.h"
#include "llama-model-loader.h"

#include "llama-kv-cache.h"
#include "llama-kv-cache-iswa.h"
#include "llama-memory-hybrid.h"
#include "llama-memory-recurrent.h"

#include "ggml-cpp.h"

#include "models/models.h"

#include <algorithm>
#include <cassert>
#include <cfloat>
#include <cstring>
#include <cmath>
#include <functional>
#include <map>
#include <regex>
#include <sstream>
#include <stdexcept>
const char * llm_type_name(llm_type type) {
    switch (type) {
        case LLM_TYPE_14M: return "14M";
        case LLM_TYPE_17M: return "17M";
        case LLM_TYPE_22M: return "22M";
        case LLM_TYPE_33M: return "33M";
        case LLM_TYPE_60M: return "60M";
        case LLM_TYPE_70M: return "70M";
        case LLM_TYPE_80M: return "80M";
        case LLM_TYPE_109M: return "109M";
        case LLM_TYPE_137M: return "137M";
        case LLM_TYPE_140M: return "140M";
        case LLM_TYPE_160M: return "160M";
        case LLM_TYPE_190M: return "190M";
        case LLM_TYPE_220M: return "220M";
        case LLM_TYPE_250M: return "250M";
        case LLM_TYPE_256M: return "256M";
        case LLM_TYPE_270M: return "270M";
        case LLM_TYPE_335M: return "335M";
        case LLM_TYPE_350M: return "350M";
        case LLM_TYPE_360M: return "360M";
        case LLM_TYPE_410M: return "410M";
        case LLM_TYPE_450M: return "450M";
        case LLM_TYPE_475M: return "475M";
        case LLM_TYPE_558M: return "558M";
        case LLM_TYPE_700M: return "700M";
        case LLM_TYPE_770M: return "770M";
        case LLM_TYPE_780M: return "780M";
        case LLM_TYPE_950M: return "950M";
        case LLM_TYPE_0_3B: return "0.3B";
        case LLM_TYPE_0_5B: return "0.5B";
        case LLM_TYPE_0_6B: return "0.6B";
        case LLM_TYPE_1B: return "1B";
        case LLM_TYPE_1_2B: return "1.2B";
        case LLM_TYPE_1_3B: return "1.3B";
        case LLM_TYPE_1_4B: return "1.4B";
        case LLM_TYPE_1_5B: return "1.5B";
        case LLM_TYPE_1_6B: return "1.6B";
        case LLM_TYPE_1_7B: return "1.7B";
        case LLM_TYPE_1_8B: return "1.8B";
        case LLM_TYPE_2B: return "2B";
        case LLM_TYPE_2_6B: return "2.6B";
        case LLM_TYPE_2_8B: return "2.8B";
        case LLM_TYPE_2_9B: return "2.9B";
        case LLM_TYPE_3B: return "3B";
        case LLM_TYPE_4B: return "4B";
        case LLM_TYPE_6B: return "6B";
        case LLM_TYPE_6_9B: return "6.9B";
        case LLM_TYPE_7B: return "7B";
        case LLM_TYPE_8B: return "8B";
        case LLM_TYPE_9B: return "9B";
        case LLM_TYPE_11B: return "11B";
        case LLM_TYPE_12B: return "12B";
        case LLM_TYPE_13B: return "13B";
        case LLM_TYPE_14B: return "14B";
        case LLM_TYPE_15B: return "15B";
        case LLM_TYPE_16B: return "16B";
        case LLM_TYPE_20B: return "20B";
        case LLM_TYPE_26B: return "26B";
        case LLM_TYPE_27B: return "27B";
        case LLM_TYPE_30B: return "30B";
        case LLM_TYPE_32B: return "32B";
        case LLM_TYPE_34B: return "34B";
        case LLM_TYPE_35B: return "35B";
        case LLM_TYPE_36B: return "36B";
        case LLM_TYPE_40B: return "40B";
        case LLM_TYPE_65B: return "65B";
        case LLM_TYPE_70B: return "70B";
        case LLM_TYPE_120B: return "120B";
        case LLM_TYPE_142B: return "142B";
        case LLM_TYPE_236B: return "236B";
        case LLM_TYPE_290B: return "290B";
        case LLM_TYPE_314B: return "314B";
        case LLM_TYPE_405B: return "405B";
        case LLM_TYPE_671B: return "671B";
        case LLM_TYPE_SMALL: return "0.1B";
        case LLM_TYPE_MEDIUM: return "0.4B";
        case LLM_TYPE_LARGE: return "0.8B";
        case LLM_TYPE_XL: return "1.5B";
        case LLM_TYPE_A1_7B: return "A1.7B";
        case LLM_TYPE_A2_7B: return "A2.7B";
        case LLM_TYPE_8x7B: return "8x7B";
        case LLM_TYPE_8x22B: return "8x22B";
        case LLM_TYPE_16x12B: return "16x12B";
        case LLM_TYPE_16x3_8B: return "16x3.8B";
        case LLM_TYPE_10B_128x3_66B: return "10B+128x3.66B";
        case LLM_TYPE_57B_A14B: return "57B.A14B";
        case LLM_TYPE_17B_16E: return "17Bx16E (Scout)";
        case LLM_TYPE_17B_128E: return "17Bx128E (Maverick)";
        case LLM_TYPE_A13B: return "A13B";
        case LLM_TYPE_7B_A1B: return "7B.A1B";
        case LLM_TYPE_8B_A1B: return "8B.A1B";
        case LLM_TYPE_16B_A1B: return "16B.A1B";
        case LLM_TYPE_21B_A3B: return "21B.A3B";
        case LLM_TYPE_30B_A3B: return "30B.A3B";
        case LLM_TYPE_80B_A3B: return "80B.A3B";
        case LLM_TYPE_100B_A6B: return "100B.A6B";
        case LLM_TYPE_106B_A12B: return "106B.A12B";
        case LLM_TYPE_230B_A10B: return "230B.A10B";
        case LLM_TYPE_235B_A22B: return "235B.A22B";
        case LLM_TYPE_300B_A47B: return "300B.A47B";
        case LLM_TYPE_355B_A32B: return "355B.A32B";
        case LLM_TYPE_E2B: return "E2B";
        case LLM_TYPE_E4B: return "E4B";
        default: return "?B";
    }
}
static const char * llama_expert_gating_func_name(llama_expert_gating_func_type type) {
    switch (type) {
        case LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX: return "softmax";
        case LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID: return "sigmoid";
        default: return "unknown";
    }
}

static const std::map<llama_rope_scaling_type, const char *> LLAMA_ROPE_SCALING_TYPES = {
    { LLAMA_ROPE_SCALING_TYPE_NONE,     "none"     },
    { LLAMA_ROPE_SCALING_TYPE_LINEAR,   "linear"   },
    { LLAMA_ROPE_SCALING_TYPE_YARN,     "yarn"     },
    { LLAMA_ROPE_SCALING_TYPE_LONGROPE, "longrope" },
};

std::string llama_rope_scaling_type_name(llama_rope_scaling_type rope_scaling_type) {
    return LLAMA_ROPE_SCALING_TYPES.at(rope_scaling_type);
}

static llama_rope_scaling_type llama_rope_scaling_type_from_string(const std::string & name) {
    for (const auto & kv : LLAMA_ROPE_SCALING_TYPES) {
        if (kv.second == name) {
            return (llama_rope_scaling_type) kv.first;
        }
    }

    return LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
}
// checks if the weight tensor can be used with the specified buffer type and device
static bool weight_buft_supported(const llama_hparams & hparams, ggml_tensor * w, ggml_op op, ggml_backend_buffer_type_t buft, ggml_backend_dev_t dev) {
    GGML_ASSERT(w != nullptr);

    if (op == GGML_OP_NONE) {
        return true;
    }

    ggml_init_params params = {
        /*.mem_size   =*/ ggml_tensor_overhead()*8,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ true,
    };

    ggml_context_ptr ctx_ptr { ggml_init(params) };
    if (!ctx_ptr) {
        throw std::runtime_error(format("failed to create ggml context"));
    }
    ggml_context * ctx = ctx_ptr.get();

    ggml_tensor * op_tensor = nullptr;

    switch (op) {
        case GGML_OP_GET_ROWS:
            {
                ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 512);
                op_tensor = ggml_get_rows(ctx, w, b);
            } break;
        case GGML_OP_MUL_MAT:
            {
                ggml_tensor * b = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], 512, w->ne[2], w->ne[3]);
                op_tensor = ggml_mul_mat(ctx, w, b);
            } break;
        case GGML_OP_MUL_MAT_ID:
            {
                int n_expert_used = hparams.n_expert_used;
                ggml_tensor * b = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, w->ne[0], n_expert_used, 512);
                ggml_tensor * ids = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, n_expert_used, 512);
                op_tensor = ggml_mul_mat_id(ctx, w, b, ids);
            } break;
        case GGML_OP_ADD:
            {
                ggml_tensor * a = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], w->ne[1], w->ne[2], w->ne[3]);
                op_tensor = ggml_add(ctx, a, w);
            } break;
        case GGML_OP_ADD_ID:
            {
                int n_expert_used = hparams.n_expert_used;
                ggml_tensor * a = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, w->ne[0], n_expert_used, 512);
                ggml_tensor * c = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, n_expert_used, 512);
                op_tensor = ggml_add_id(ctx, a, w, c);
            } break;
        case GGML_OP_MUL:
            {
                ggml_tensor * a = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], w->ne[1], w->ne[2], w->ne[3]);
                op_tensor = ggml_mul(ctx, a, w);
            } break;
        case GGML_OP_DIV:
            {
                ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, w->ne[0]);
                op_tensor = ggml_div(ctx, a, w);
            } break;
        case GGML_OP_ROPE:
            {
                int n_embd_head = hparams.n_embd_head_v;
                int n_head = hparams.n_head();
                ggml_tensor * a = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_embd_head, n_head, 512);
                ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 512);
                op_tensor = ggml_rope_ext(
                    ctx, a, b, w,
                    0, 0, 0, 0, 0,
                    0, 0, 0, 0
                );
            } break;
        case GGML_OP_SSM_CONV:
            {
                const int64_t n_seq_tokens = 512;
                const int64_t n_seqs       = 3;
                ggml_tensor * conv_x = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, w->ne[0] - 1 + n_seq_tokens, w->ne[1], n_seqs);
                op_tensor = ggml_ssm_conv(ctx, conv_x, w);
            } break;
        case GGML_OP_SSM_SCAN:
            {
                // w is ssm_a, which is used to distinguish Mamba-1 and Mamba-2
                const int64_t d_state      = w->ne[0] == 1 ? hparams.ssm_d_state : w->ne[0];
                const int64_t n_head       = w->ne[1];
                const int64_t head_dim     = hparams.ssm_d_inner / n_head;
                const int64_t n_group      = hparams.ssm_n_group ? hparams.ssm_n_group : 1;
                const int64_t n_seq_tokens = 512;
                const int64_t n_seqs       = 3;
                ggml_tensor * s   = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, d_state, head_dim, n_head, n_seqs);
                ggml_tensor * x   = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, head_dim, n_head, n_seq_tokens, n_seqs);
                ggml_tensor * dt  = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_head, n_seq_tokens, n_seqs);
                ggml_tensor * B   = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, d_state, n_group, n_seq_tokens, n_seqs);
                ggml_tensor * C   = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, d_state, n_group, n_seq_tokens, n_seqs);
                ggml_tensor * ids = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_seqs);
                op_tensor = ggml_ssm_scan(ctx, s, x, dt, w, B, C, ids);
            } break;
        case GGML_OP_RWKV_WKV6:
            {
                // FIXME
                const int64_t S = 123;
                const int64_t H = 123;
                const int64_t n_tokens = 123;
                const int64_t n_seqs = 123;
                ggml_tensor * k = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens);
                ggml_tensor * v = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens);
                ggml_tensor * r = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens);
                ggml_tensor * tf = w;
                ggml_tensor * td = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens);
                ggml_tensor * state = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, S, n_seqs, S, H);
                op_tensor = ggml_rwkv_wkv6(ctx, k, v, r, tf, td, state);
            } break;
        case GGML_OP_IM2COL:
            {
                const int n_embd_inp = hparams.n_embd_inp();
                ggml_tensor * b = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, n_embd_inp, w->ne[1], 1, 1);
                op_tensor = ggml_im2col(ctx, w, b, 1, 0, 0, 0, 1, 0, false, GGML_TYPE_F16);
            } break;
        case GGML_OP_SCALE:
            {
                op_tensor = ggml_scale(ctx, w, 1.0f);
            } break;
        default:
            GGML_ABORT("%s: missing test for op %s for tensor %s", __func__, ggml_op_name(op), w->name);
    }

    // create a temporary dummy buffer for the weight so that supports_op can check the buffer type
    GGML_ASSERT(w->buffer == nullptr);
    w->buffer = ggml_backend_buft_alloc_buffer(buft, 0);
    bool op_supported = ggml_backend_dev_supports_op(dev, op_tensor);
    ggml_backend_buffer_free(w->buffer);
    w->buffer = nullptr;

    return op_supported;
}
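// note: the probe above never runs any computation - the context is created with
// no_alloc = true and the weight only receives a zero-size dummy buffer, so
// ggml_backend_dev_supports_op is queried purely on op/shape/type metadata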
// lists of buffer types used for each layer
using buft_list_t = std::vector<std::pair<ggml_backend_dev_t, ggml_backend_buffer_type_t>>;

// find the first buffer type in the list that can use the tensor
static ggml_backend_buffer_type_t select_weight_buft(const llama_hparams & hparams, ggml_tensor * tensor, ggml_op op, const buft_list_t & buft_list) {
    GGML_ASSERT(!buft_list.empty());
    for (const auto & cur : buft_list) {
        ggml_backend_dev_t cur_dev = cur.first;
        ggml_backend_buffer_type_t cur_buft = cur.second;
        if (weight_buft_supported(hparams, tensor, op, cur_buft, cur_dev)) {
            return cur_buft;
        }
    }

    return nullptr;
}
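// example usage (sketch, `w` is a placeholder weight tensor): given a priority list
// built by make_cpu_buft_list()/make_gpu_buft_list() below, a caller would do
//
//   ggml_backend_buffer_type_t buft = select_weight_buft(hparams, w, GGML_OP_MUL_MAT, buft_list);
//   if (buft == nullptr) {
//       // no listed device supports GGML_OP_MUL_MAT with this weight
//   }
//
// list order encodes priority: the first buffer type that passes weight_buft_supported() wins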
// CPU: ACCEL -> GPU host -> CPU extra -> CPU
static buft_list_t make_cpu_buft_list(const std::vector<ggml_backend_dev_t> & devices, bool use_extra_bufts, bool no_host) {
    buft_list_t buft_list;

    // add ACCEL buffer types
    for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
        ggml_backend_dev_t dev = ggml_backend_dev_get(i);
        if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_ACCEL) {
            auto * buft = ggml_backend_dev_buffer_type(dev);
            // skip
            if (buft != ggml_backend_cpu_buffer_type()) {
                buft_list.emplace_back(dev, buft);
            }
        }
    }

    // add a host buffer type
    // storing the tensors in a host buffer is useful when the processing of large batches
    // is offloaded to a GPU device, since it reduces the time spent on data transfers
    // generally, this will be done using the first device in the list
    // a better approach would be to handle this on a weight-by-weight basis using the offload_op
    // function of the device to determine if it would benefit from being stored in a host buffer
    if (!no_host) {
        for (auto * dev : devices) {
            ggml_backend_buffer_type_t buft = ggml_backend_dev_host_buffer_type(dev);
            if (buft) {
                buft_list.emplace_back(dev, buft);
                break;
            }
        }
    }

    // add extra buffer types
    if (use_extra_bufts) {
        auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
        if (cpu_dev == nullptr) {
            throw std::runtime_error(format("%s: no CPU backend found", __func__));
        }

        auto * cpu_reg = ggml_backend_dev_backend_reg(cpu_dev);
        auto ggml_backend_dev_get_extra_bufts_fn = (ggml_backend_dev_get_extra_bufts_t)
            ggml_backend_reg_get_proc_address(cpu_reg, "ggml_backend_dev_get_extra_bufts");
        if (ggml_backend_dev_get_extra_bufts_fn) {
            ggml_backend_buffer_type_t * extra_bufts = ggml_backend_dev_get_extra_bufts_fn(cpu_dev);
            while (extra_bufts && *extra_bufts) {
                buft_list.emplace_back(cpu_dev, *extra_bufts);
                ++extra_bufts;
            }
        }
    }

    // add the CPU buffer type
    for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
        ggml_backend_dev_t dev = ggml_backend_dev_get(i);
        if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_CPU) {
            buft_list.emplace_back(dev, ggml_backend_dev_buffer_type(dev));
        }
    }

    return buft_list;
}
// GPU: split if LLAMA_SPLIT_MODE_ROW -> GPU
static buft_list_t make_gpu_buft_list(ggml_backend_dev_t dev, llama_split_mode split_mode, const float * tensor_split) {
    buft_list_t buft_list;

    // add the device split buffer type if requested and available
    if (split_mode == LLAMA_SPLIT_MODE_ROW) {
        ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev);
        auto ggml_backend_split_buffer_type_fn = (ggml_backend_split_buffer_type_t)
            ggml_backend_reg_get_proc_address(reg, "ggml_backend_split_buffer_type");
        if (ggml_backend_split_buffer_type_fn) {
            size_t dev_index = [&]() {
                auto * reg = ggml_backend_dev_backend_reg(dev);
                for (size_t i = 0; i < ggml_backend_reg_dev_count(reg); ++i) {
                    if (ggml_backend_reg_dev_get(reg, i) == dev) {
                        return i;
                    }
                }
                throw std::runtime_error(format("device %s not found in its backend reg", ggml_backend_dev_name(dev)));
            }();
            auto * buft = ggml_backend_split_buffer_type_fn(dev_index, tensor_split);
            if (buft != nullptr) {
                buft_list.emplace_back(dev, buft);
            }
        }
    }

    // add the device default buffer type
    buft_list.emplace_back(dev, ggml_backend_dev_buffer_type(dev));

    // add the device extra buffer type (if any)
    ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev);
    auto ggml_backend_dev_get_extra_bufts_fn = (ggml_backend_dev_get_extra_bufts_t)
        ggml_backend_reg_get_proc_address(reg, "ggml_backend_dev_get_extra_bufts");
    if (ggml_backend_dev_get_extra_bufts_fn) {
        ggml_backend_buffer_type_t * extra_bufts = ggml_backend_dev_get_extra_bufts_fn(dev);
        while (extra_bufts && *extra_bufts) {
            buft_list.emplace_back(dev, *extra_bufts);
            ++extra_bufts;
        }
    }

    return buft_list;
}
struct llama_model::impl {
    impl() = default;
    ~impl() = default;

    uint64_t n_elements = 0;

    size_t n_bytes = 0;

    std::string desc_str;

    // model memory mapped files
    llama_mmaps mappings;

    // objects representing data potentially being locked in memory
    llama_mlocks mlock_bufs;
    llama_mlocks mlock_mmaps;

    // contexts where the model tensors metadata is stored as well as the corresponding buffers:
    std::vector<std::pair<ggml_context_ptr, std::vector<ggml_backend_buffer_ptr>>> ctxs_bufs;

    buft_list_t cpu_buft_list;
    std::map<ggml_backend_dev_t, buft_list_t> gpu_buft_list;

    struct layer_dev {
        ggml_backend_dev_t dev;
        buft_list_t * buft_list;
    };

    layer_dev dev_input  = {};
    layer_dev dev_output = {};
    std::vector<layer_dev> dev_layer;

    bool has_tensor_overrides;
};
llama_model::llama_model(const llama_model_params & params) : params(params), pimpl(std::make_unique<impl>()) {
    pimpl->has_tensor_overrides = params.tensor_buft_overrides && params.tensor_buft_overrides[0].pattern;
}

llama_model::~llama_model() = default;
void llama_model::load_stats(llama_model_loader & ml) {
    pimpl->n_elements = ml.n_elements;
    pimpl->n_bytes = ml.n_bytes;
}

void llama_model::load_arch(llama_model_loader & ml) {
    arch = ml.get_arch();
    if (arch == LLM_ARCH_UNKNOWN) {
        throw std::runtime_error("unknown model architecture: '" + ml.get_arch_name() + "'");
    }
}
void llama_model::load_hparams(llama_model_loader & ml) {
    const gguf_context * ctx = ml.meta.get();

    // get metadata as string
    for (int i = 0; i < gguf_get_n_kv(ctx); i++) {
        gguf_type type = gguf_get_kv_type(ctx, i);
        if (type == GGUF_TYPE_ARRAY) {
            continue;
        }
        const char * name = gguf_get_key(ctx, i);
        const std::string value = gguf_kv_to_str(ctx, i);
        gguf_kv.emplace(name, value);
    }

    // get general kv
    ml.get_key(LLM_KV_GENERAL_NAME, name, false);

    // everything past this point is not vocab-related
    // for CLIP models, we only need to load tensors, no hparams
    if (hparams.vocab_only || ml.get_arch() == LLM_ARCH_CLIP) {
        return;
    }

    ml.get_key(LLM_KV_CONTEXT_LENGTH,          hparams.n_ctx_train);
    ml.get_key(LLM_KV_EMBEDDING_LENGTH,        hparams.n_embd);
    ml.get_key(LLM_KV_BLOCK_COUNT,             hparams.n_layer);
    ml.get_key(LLM_KV_EXPERT_COUNT,            hparams.n_expert,        false);
    ml.get_key(LLM_KV_EXPERT_USED_COUNT,       hparams.n_expert_used,   false);
    ml.get_key(LLM_KV_EXPERT_GROUP_COUNT,      hparams.n_expert_groups, false);
    ml.get_key(LLM_KV_EXPERT_GROUP_USED_COUNT, hparams.n_group_used,    false);

    if (arch == LLM_ARCH_WAVTOKENIZER_DEC) {
        ml.get_key(LLM_KV_FEATURES_LENGTH, hparams.n_embd_features);

        ml.get_key(LLM_KV_POSNET_EMBEDDING_LENGTH, hparams.posnet.n_embd);
        ml.get_key(LLM_KV_POSNET_BLOCK_COUNT,      hparams.posnet.n_layer);

        ml.get_key(LLM_KV_CONVNEXT_EMBEDDING_LENGTH, hparams.convnext.n_embd);
        ml.get_key(LLM_KV_CONVNEXT_BLOCK_COUNT,      hparams.convnext.n_layer);
    }

    GGML_ASSERT(hparams.n_expert <= LLAMA_MAX_EXPERTS);
    GGML_ASSERT(hparams.n_expert_used <= hparams.n_expert);
    if (hparams.n_expert > 0) {
        GGML_ASSERT(hparams.n_expert_used > 0);
        GGML_ASSERT(hparams.n_expert_groups < hparams.n_expert);
        if (hparams.n_expert_groups > 1) {
            GGML_ASSERT(hparams.n_expert % hparams.n_expert_groups == 0);
            GGML_ASSERT(hparams.n_group_used > 0);
            GGML_ASSERT(hparams.n_group_used < hparams.n_expert_groups);
        }
    } else {
        GGML_ASSERT(hparams.n_expert_used == 0);
        GGML_ASSERT(hparams.n_expert_groups == 0);
    }

    std::fill(hparams.n_head_arr.begin(),    hparams.n_head_arr.end(),    0);
    std::fill(hparams.n_head_kv_arr.begin(), hparams.n_head_kv_arr.end(), 0);
    std::fill(hparams.n_ff_arr.begin(),      hparams.n_ff_arr.end(),      0);
    std::fill(
        hparams.recurrent_layer_arr.begin(),
        hparams.recurrent_layer_arr.end(),
        llm_arch_is_recurrent(ml.get_arch()));

    std::fill(hparams.rope_sections.begin(), hparams.rope_sections.end(), 0);
    std::fill(hparams.swa_layers.begin(),    hparams.swa_layers.end(),    0);

    std::fill(hparams.xielu_alpha_n.begin(), hparams.xielu_alpha_n.end(), 0.0f);
    std::fill(hparams.xielu_alpha_p.begin(), hparams.xielu_alpha_p.end(), 0.0f);
    std::fill(hparams.xielu_beta.begin(),    hparams.xielu_beta.end(),    0.0f);
    std::fill(hparams.xielu_eps.begin(),     hparams.xielu_eps.end(),     0.0f);

    ml.get_key_or_arr(LLM_KV_FEED_FORWARD_LENGTH,  hparams.n_ff_arr,   hparams.n_layer, false);
    ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head_arr, hparams.n_layer, false);

    // n_head_kv is optional, default to n_head
    hparams.n_head_kv_arr = hparams.n_head_arr;

    ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT_KV, hparams.n_head_kv_arr, hparams.n_layer, false);

    bool rope_finetuned = false;
    ml.get_key(LLM_KV_ROPE_SCALING_FINETUNED, rope_finetuned, false);
    hparams.rope_finetuned = rope_finetuned;

    hparams.n_ctx_orig_yarn = hparams.n_ctx_train;
    ml.get_key(LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, hparams.n_ctx_orig_yarn, false);

    // rope_freq_base (optional)
    hparams.rope_freq_base_train = 10000.0f;
    ml.get_key(LLM_KV_ROPE_FREQ_BASE, hparams.rope_freq_base_train, false);

    std::string rope_scaling("linear");
    ml.get_key(LLM_KV_ROPE_SCALING_TYPE, rope_scaling, false);
    hparams.rope_scaling_type_train = llama_rope_scaling_type_from_string(rope_scaling);
    GGML_ASSERT(hparams.rope_scaling_type_train != LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED);

    // rope_freq_scale (inverse of the kv) is optional
    float ropescale = 0.0f;
    if (!ml.get_key(LLM_KV_ROPE_SCALING_FACTOR, ropescale, false)) {
        // try the old key name
        ml.get_key(LLM_KV_ROPE_SCALE_LINEAR, ropescale, false);
    }
    hparams.rope_freq_scale_train = ropescale == 0.0f ? 1.0f : 1.0f/ropescale;
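    // e.g. a stored scaling factor of 4.0 gives rope_freq_scale_train = 0.25,
    // while a missing/zero factor leaves the scale at 1.0 (no scaling)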
  512. // by default assume that the sliding-window layers use the same scaling type as the non-sliding-window layers
  513. hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
  514. hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
  515. ml.get_key(LLM_KV_ROPE_SCALING_ATTN_FACTOR, hparams.rope_attn_factor, false);
  516. // non-transformer models do not have attention heads
  517. if (hparams.n_head() > 0) {
  518. // gpt-neox n_rot = rotary_pct * (n_embd / n_head)
  519. // gpt-j n_rot = rotary_dim
  520. hparams.n_embd_head_k = hparams.n_embd / hparams.n_head();
  521. ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH, hparams.n_embd_head_k, false);
  522. hparams.n_embd_head_v = hparams.n_embd / hparams.n_head();
  523. ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH, hparams.n_embd_head_v, false);
  524. // sanity check for n_rot (optional)
  525. hparams.n_rot = hparams.n_embd_head_k;
  526. ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot, false);
  527. if (arch == LLM_ARCH_LLAMA || arch == LLM_ARCH_DECI || arch == LLM_ARCH_FALCON) {
  528. if (hparams.n_rot != hparams.n_embd_head_k) {
  529. throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd_head_k));
  530. }
  531. }
  532. } else {
  533. hparams.n_rot = 0;
  534. hparams.n_embd_head_k = 0;
  535. hparams.n_embd_head_v = 0;
  536. }
  537. // for differentiating model types
  538. uint32_t n_vocab = 0;
  539. ml.get_key(LLM_KV_VOCAB_SIZE, n_vocab, false) || ml.get_arr_n(LLM_KV_TOKENIZER_LIST, n_vocab, false);
  540. // for classifier models
  541. ml.get_arr(LLM_KV_CLASSIFIER_OUTPUT_LABELS, classifier_labels, false);
  542. if (!classifier_labels.empty()) {
  543. hparams.n_cls_out = classifier_labels.size();
  544. }
  545. // arch-specific KVs
  546. switch (arch) {
  547. case LLM_ARCH_LLAMA:
  548. {
  549. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  550. if (hparams.n_expert == 8) {
  551. switch (hparams.n_layer) {
  552. case 32: type = LLM_TYPE_8x7B; break;
  553. case 56: type = LLM_TYPE_8x22B; break;
  554. default: type = LLM_TYPE_UNKNOWN;
  555. }
  556. } else {
  557. switch (hparams.n_layer) {
  558. case 16: type = LLM_TYPE_1B; break; // Llama 3.2 1B
  559. case 22: type = LLM_TYPE_1B; break;
  560. case 26: type = LLM_TYPE_3B; break;
  561. case 28: type = LLM_TYPE_3B; break; // Llama 3.2 3B
  562. case 30: type = LLM_TYPE_256M; break; // smoldocling 256M
  563. // granite uses a vocab with len 49152
  564. case 32: type = n_vocab == 49152 ? LLM_TYPE_3B : (n_vocab < 40000 ? LLM_TYPE_7B : LLM_TYPE_8B); break;
  565. case 36: type = LLM_TYPE_8B; break; // granite
  566. case 40: type = LLM_TYPE_13B; break;
  567. case 48: type = LLM_TYPE_34B; break;
  568. case 60: type = LLM_TYPE_30B; break;
  569. case 80: type = hparams.n_head() == hparams.n_head_kv() ? LLM_TYPE_65B : LLM_TYPE_70B; break;
  570. default: type = LLM_TYPE_UNKNOWN;
  571. }
  572. }
  573. } break;
  574. case LLM_ARCH_LLAMA4:
  575. {
  576. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  577. ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
  578. ml.get_key(LLM_KV_INTERLEAVE_MOE_LAYER_STEP, hparams.n_moe_layer_step);
  579. const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
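// a sliding-window size explicitly set to 0 in the GGUF disables SWA and keeps rope on every layer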
  580. if (found_swa && hparams.n_swa == 0) {
  581. hparams.swa_type = LLAMA_SWA_TYPE_NONE;
  582. hparams.n_no_rope_layer_step = hparams.n_layer; // always use rope
  583. } else {
  584. hparams.swa_type = LLAMA_SWA_TYPE_CHUNKED;
  585. hparams.n_swa = 8192;
  586. hparams.n_attn_temp_floor_scale = 8192;
  587. hparams.f_attn_temp_scale = 0.1f;
  588. hparams.f_attn_temp_offset = 1.0f;
  589. hparams.set_swa_pattern(4); // pattern: 3 chunked - 1 full
  590. }
  591. switch (hparams.n_expert) {
  592. case 0: {
  593. // MobileLLM (no MoE)
  594. switch (hparams.n_embd) {
  595. case 2048: type = LLM_TYPE_140M; break;
  596. case 4096: type = LLM_TYPE_360M; break;
  597. case 6144: type = LLM_TYPE_950M; break;
  598. default: type = LLM_TYPE_UNKNOWN;
  599. }
  600. } break;
  601. case 16: type = LLM_TYPE_17B_16E; break;
  602. case 128: type = LLM_TYPE_17B_128E; break;
  603. default: type = LLM_TYPE_UNKNOWN;
  604. }
  605. hparams.use_kq_norm = type != LLM_TYPE_17B_128E;
  606. } break;
  607. case LLM_ARCH_ARCEE:
  608. {
  609. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  610. // Arcee uses the same structure as Llama
  611. switch (hparams.n_layer) {
  612. case 36: type = LLM_TYPE_4B; break;
  613. default: type = LLM_TYPE_UNKNOWN;
  614. }
  615. } break;
  616. case LLM_ARCH_AFMOE:
  617. {
  618. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  619. ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
  620. ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
  621. ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
  622. ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false);
  623. ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale, false);
  624. ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
  625. ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
  626. // Set up interleaved sliding window attention (ISWA)
  627. // Pattern: 3 sliding - 1 full (global_attn_every_n_layers = 4)
  628. if (hparams.n_swa > 0) {
  629. hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
  630. hparams.set_swa_pattern(4);
  631. } else {
  632. hparams.swa_type = LLAMA_SWA_TYPE_NONE;
  633. }
  634. // Default to sigmoid if not set
  635. if (hparams.expert_gating_func == LLAMA_EXPERT_GATING_FUNC_TYPE_NONE) {
  636. hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID;
  637. }
  638. switch (hparams.n_layer) {
  639. case 56: type = LLM_TYPE_6B; break;
  640. case 32: type = LLM_TYPE_26B; break;
  641. default: type = LLM_TYPE_UNKNOWN;
  642. }
  643. } break;
  644. case LLM_ARCH_DECI:
  645. {
  646. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  647. switch (hparams.n_layer) {
  648. case 32: type = LLM_TYPE_7B; break;
  649. case 80: type = LLM_TYPE_70B; break;
  650. case 162: type = LLM_TYPE_405B; break;
  651. default: type = LLM_TYPE_UNKNOWN;
  652. }
  653. } break;
  654. case LLM_ARCH_MINICPM:
  655. {
  656. // Backward-compatible defaults for older MiniCPM GGUFs
  657. hparams.f_embedding_scale = 12.0f;
  658. hparams.f_residual_scale = 1.4f / sqrtf(float(hparams.n_layer));
  659. hparams.f_logit_scale = hparams.n_embd ? (256.0f / float(hparams.n_embd)) : 1.0f;
  660. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
661. // Optional KV reads that override the defaults when present in newer GGUF exports
  662. ml.get_key(LLM_KV_EMBEDDING_SCALE, hparams.f_embedding_scale, /*required=*/false);
  663. ml.get_key(LLM_KV_RESIDUAL_SCALE, hparams.f_residual_scale, /*required=*/false);
  664. ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale, /*required=*/false);
  665. // MiniCPM uses rope by default, unlike Granite which uses it as a switch
  666. hparams.rope_finetuned = true;
  667. switch (hparams.n_layer) {
  668. case 52: type = LLM_TYPE_1B; break;
  669. case 40: type = LLM_TYPE_2B; break;
  670. default: type = LLM_TYPE_UNKNOWN;
  671. }
  672. } break;
  673. case LLM_ARCH_MINICPM3:
  674. {
  675. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  676. ml.get_key(LLM_KV_ATTENTION_Q_LORA_RANK, hparams.n_lora_q);
  677. ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK, hparams.n_lora_kv);
  678. switch (hparams.n_layer) {
  679. case 62: type = LLM_TYPE_4B; break;
  680. default: type = LLM_TYPE_UNKNOWN;
  681. }
  682. } break;
  683. case LLM_ARCH_GROK:
  684. {
  685. // defaults for old GGUFs
  686. hparams.yarn_beta_fast = 8.0f;
  687. hparams.f_logit_scale = 0.5773502691896257f;
  688. hparams.f_embedding_scale = 78.38367176906169f;
  689. hparams.f_attn_out_scale = 0.08838834764831845f;
  690. hparams.f_attn_logit_softcapping = 30.0f;
  691. hparams.f_router_logit_softcapping = 30.0f;
  692. // no final_logit_softcapping in grok-1
  693. hparams.f_final_logit_softcapping = 0.0f;
  694. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  695. ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
  696. ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale, false);
  697. ml.get_key(LLM_KV_EMBEDDING_SCALE, hparams.f_embedding_scale, false);
  698. ml.get_key(LLM_KV_ATTENTION_OUTPUT_SCALE, hparams.f_attn_out_scale, false);
  699. ml.get_key(LLM_KV_ATTN_LOGIT_SOFTCAPPING, hparams.f_attn_logit_softcapping, false);
  700. ml.get_key(LLM_KV_ROUTER_LOGIT_SOFTCAPPING, hparams.f_router_logit_softcapping, false);
  701. ml.get_key(LLM_KV_FINAL_LOGIT_SOFTCAPPING, hparams.f_final_logit_softcapping, false);
  702. ml.get_key(LLM_KV_ATTENTION_TEMPERATURE_LENGTH, hparams.attn_temp_length, false);
  703. ml.get_key(LLM_KV_ROPE_SCALING_YARN_EXT_FACTOR, hparams.yarn_ext_factor, false);
  704. ml.get_key(LLM_KV_ROPE_SCALING_YARN_ATTN_FACTOR, hparams.yarn_attn_factor, false);
  705. ml.get_key(LLM_KV_ROPE_SCALING_YARN_BETA_FAST, hparams.yarn_beta_fast, false);
  706. ml.get_key(LLM_KV_ROPE_SCALING_YARN_BETA_SLOW, hparams.yarn_beta_slow, false);
  707. switch (hparams.n_layer) {
  708. case 64: type = LLM_TYPE_314B; break;
  709. default: type = LLM_TYPE_UNKNOWN;
  710. }
  711. } break;
  712. case LLM_ARCH_FALCON:
  713. {
  714. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  715. switch (hparams.n_layer) {
  716. case 32: type = LLM_TYPE_7B; break;
  717. case 60: type = LLM_TYPE_40B; break;
  718. default: type = LLM_TYPE_UNKNOWN;
  719. }
  720. } break;
  721. case LLM_ARCH_BAICHUAN:
  722. {
  723. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  724. switch (hparams.n_layer) {
  725. case 32: type = LLM_TYPE_7B; break;
  726. case 40: type = LLM_TYPE_13B; break;
  727. default: type = LLM_TYPE_UNKNOWN;
  728. }
  729. if (type == LLM_TYPE_13B) {
  730. // TODO: become GGUF KV parameter
  731. hparams.f_max_alibi_bias = 8.0f;
  732. }
  733. } break;
  734. case LLM_ARCH_STARCODER:
  735. {
  736. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  737. switch (hparams.n_layer) {
  738. case 24: type = LLM_TYPE_1B; break;
  739. case 36: type = LLM_TYPE_3B; break;
  740. case 42: type = LLM_TYPE_7B; break;
  741. case 40: type = LLM_TYPE_15B; break;
  742. default: type = LLM_TYPE_UNKNOWN;
  743. }
  744. } break;
  745. case LLM_ARCH_REFACT:
  746. {
  747. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  748. switch (hparams.n_layer) {
  749. case 32: type = LLM_TYPE_1B; break;
  750. default: type = LLM_TYPE_UNKNOWN;
  751. }
  752. // TODO: become GGUF KV parameter
  753. hparams.f_max_alibi_bias = 8.0f;
  754. } break;
  755. case LLM_ARCH_BERT:
  756. {
  757. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  758. ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
  759. ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
  760. switch (hparams.n_layer) {
  761. case 3:
  762. type = LLM_TYPE_17M; break; // bge-micro
  763. case 6:
  764. type = LLM_TYPE_22M; break; // MiniLM-L6
  765. case 12:
  766. switch (hparams.n_embd) {
  767. case 384: type = LLM_TYPE_33M; break; // MiniLM-L12, bge-small
  768. case 768: type = LLM_TYPE_109M; break; // bge-base
  769. default: type = LLM_TYPE_UNKNOWN;
  770. } break;
  771. case 24:
  772. type = LLM_TYPE_335M; break; // bge-large
  773. default: type = LLM_TYPE_UNKNOWN;
  774. }
  775. } break;
  776. case LLM_ARCH_JINA_BERT_V2:
  777. {
  778. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  779. ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
  780. ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
  781. hparams.f_max_alibi_bias = 8.0f;
  782. switch (hparams.n_layer) {
  783. case 4: type = LLM_TYPE_33M; break; // jina-embeddings-small
  784. case 12: type = LLM_TYPE_137M; break; // jina-embeddings-base
  785. default: type = LLM_TYPE_UNKNOWN;
  786. }
  787. } break;
  788. case LLM_ARCH_JINA_BERT_V3:
  789. {
  790. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  791. ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
  792. ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
  793. switch (hparams.n_layer) {
  794. case 24:
  795. type = LLM_TYPE_558M; break;
  796. default: type = LLM_TYPE_UNKNOWN;
  797. }
  798. } break;
  799. case LLM_ARCH_NOMIC_BERT:
  800. case LLM_ARCH_NOMIC_BERT_MOE:
  801. {
  802. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  803. ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
  804. ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
805. ml.get_key(LLM_KV_MOE_EVERY_N_LAYERS, hparams.moe_every_n_layers, false);
  806. if (hparams.n_layer == 12 && hparams.n_embd == 768) {
  807. if (arch == LLM_ARCH_NOMIC_BERT) {
  808. type = LLM_TYPE_137M;
  809. } else if (arch == LLM_ARCH_NOMIC_BERT_MOE && hparams.moe_every_n_layers == 2) {
  810. type = LLM_TYPE_475M;
  811. }
  812. }
  813. } break;
  814. case LLM_ARCH_NEO_BERT:
  815. {
  816. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  817. ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
  818. ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
  819. if (hparams.n_layer == 28) {
  820. type = LLM_TYPE_250M;
  821. }
  822. } break;
  823. case LLM_ARCH_BLOOM:
  824. {
  825. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  826. switch (hparams.n_layer) {
  827. case 24: type = LLM_TYPE_1B; break;
  828. case 30:
  829. switch (hparams.n_embd) {
  830. case 2560: type = LLM_TYPE_3B; break;
  831. case 4096: type = LLM_TYPE_7B; break;
  832. default: type = LLM_TYPE_UNKNOWN;
  833. } break;
  834. default: type = LLM_TYPE_UNKNOWN;
  835. }
  836. // TODO: become GGUF KV parameter
  837. hparams.f_max_alibi_bias = 8.0f;
  838. } break;
  839. case LLM_ARCH_MPT:
  840. {
  841. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  842. ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV, hparams.f_clamp_kqv, false);
  843. ml.get_key(LLM_KV_ATTENTION_MAX_ALIBI_BIAS, hparams.f_max_alibi_bias);
  844. switch (hparams.n_layer) {
  845. case 32: type = LLM_TYPE_7B; break;
  846. case 48: type = LLM_TYPE_30B; break;
  847. default: type = LLM_TYPE_UNKNOWN;
  848. }
  849. } break;
  850. case LLM_ARCH_STABLELM:
  851. {
  852. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  853. switch (hparams.n_layer) {
  854. case 24: type = LLM_TYPE_1B; break;
  855. case 32: type = LLM_TYPE_3B; break;
  856. case 40: type = LLM_TYPE_12B; break;
  857. default: type = LLM_TYPE_UNKNOWN;
  858. }
  859. } break;
  860. case LLM_ARCH_QWEN:
  861. {
  862. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  863. switch (hparams.n_layer) {
  864. case 32: type = LLM_TYPE_7B; break;
  865. case 40: type = LLM_TYPE_13B; break;
  866. default: type = LLM_TYPE_UNKNOWN;
  867. }
  868. } break;
  869. case LLM_ARCH_QWEN2VL:
  870. {
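// multimodal rope (M-RoPE): the rope dimensions are split into 4 sections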
  871. ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, true);
  872. }
  873. // fall through
  874. case LLM_ARCH_QWEN2:
  875. {
  876. ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
  877. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  878. switch (hparams.n_layer) {
  879. case 24: type = hparams.n_embd == 1024 ? LLM_TYPE_0_5B : LLM_TYPE_1B; break;
  880. case 28: type = hparams.n_embd == 1536 ? LLM_TYPE_1_5B : LLM_TYPE_7B; break;
  881. case 32: type = LLM_TYPE_7B; break;
  882. case 36: type = LLM_TYPE_3B; break;
  883. case 40: type = hparams.n_head() == 20 ? LLM_TYPE_4B : LLM_TYPE_13B; break;
  884. case 48: type = LLM_TYPE_14B; break;
  885. case 64: type = LLM_TYPE_32B; break;
  886. case 80: type = LLM_TYPE_70B; break;
  887. default: type = LLM_TYPE_UNKNOWN;
  888. }
  889. } break;
  890. case LLM_ARCH_DREAM:
  891. {
  892. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  893. // Dream models are primarily 7B with 28 layers
  894. switch (hparams.n_layer) {
  895. case 28:
  896. type = LLM_TYPE_7B;
  897. break;
  898. default:
  899. type = LLM_TYPE_UNKNOWN;
  900. }
  901. // Set non-causal attention for diffusion models
  902. hparams.causal_attn = false;
  903. }
  904. break;
  905. case LLM_ARCH_LLADA:
  906. {
  907. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  908. // LLaDA-8B has 32 layers, similar to LLaMA but for diffusion
  909. switch (hparams.n_layer) {
  910. case 32:
  911. type = LLM_TYPE_8B;
  912. break;
  913. default:
  914. type = LLM_TYPE_UNKNOWN;
  915. }
  916. // Set non-causal attention for diffusion models
  917. hparams.causal_attn = false;
  918. }
  919. break;
  920. case LLM_ARCH_LLADA_MOE:
  921. {
  922. ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
  923. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  924. // diffusion language model uses non-causal attention
  925. hparams.causal_attn = false;
  926. switch (hparams.n_layer) {
  927. case 16: type = LLM_TYPE_A1_7B; break;
  928. default: type = LLM_TYPE_UNKNOWN;
  929. }
  930. } break;
  931. case LLM_ARCH_RND1:
  932. {
  933. ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
  934. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  935. switch (hparams.n_layer) {
  936. case 48: type = LLM_TYPE_30B_A3B; break;
  937. default: type = LLM_TYPE_UNKNOWN;
  938. }
  939. // Set non-causal attention for diffusion models
  940. hparams.causal_attn = false;
  941. } break;
  942. case LLM_ARCH_QWEN2MOE:
  943. {
  944. ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
  945. ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false);
  946. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  947. switch (hparams.n_layer) {
  948. case 24: type = LLM_TYPE_A2_7B; break;
  949. case 28: type = LLM_TYPE_57B_A14B; break;
  950. default: type = LLM_TYPE_UNKNOWN;
  951. }
  952. } break;
  953. case LLM_ARCH_QWEN3:
  954. {
  955. ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
  956. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  957. switch (hparams.n_layer) {
  958. case 28: type = hparams.n_embd == 1024 ? LLM_TYPE_0_6B : LLM_TYPE_1_7B; break;
  959. case 36: type = hparams.n_embd == 2560 ? LLM_TYPE_4B : LLM_TYPE_8B; break;
  960. case 40: type = LLM_TYPE_14B; break;
  961. case 64: type = LLM_TYPE_32B; break;
  962. default: type = LLM_TYPE_UNKNOWN;
  963. }
  964. } break;
  965. case LLM_ARCH_QWEN3VL:
  966. {
  967. ml.get_key(LLM_KV_NUM_DEEPSTACK_LAYERS, hparams.n_deepstack_layers, false);
  968. ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, true);
  969. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  970. switch (hparams.n_layer) {
  971. case 28: type = LLM_TYPE_1_7B; break;
  972. case 36: type = hparams.n_embd == 2560 ? LLM_TYPE_4B : LLM_TYPE_8B; break;
  973. case 64: type = LLM_TYPE_32B; break;
  974. default: type = LLM_TYPE_UNKNOWN;
  975. }
  976. } break;
  977. case LLM_ARCH_QWEN3MOE:
  978. {
  979. ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
  980. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  981. switch (hparams.n_layer) {
  982. case 48: type = LLM_TYPE_30B_A3B; break;
  983. case 94: type = LLM_TYPE_235B_A22B; break;
  984. default: type = LLM_TYPE_UNKNOWN;
  985. }
  986. } break;
  987. case LLM_ARCH_QWEN3VLMOE:
  988. {
  989. ml.get_key(LLM_KV_NUM_DEEPSTACK_LAYERS, hparams.n_deepstack_layers, false);
  990. ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, true);
  991. ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
  992. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  993. switch (hparams.n_layer) {
  994. case 48: type = LLM_TYPE_30B_A3B; break;
  995. case 94: type = LLM_TYPE_235B_A22B; break;
  996. default: type = LLM_TYPE_UNKNOWN;
  997. }
  998. } break;
  999. case LLM_ARCH_PHI2:
  1000. {
  1001. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  1002. switch (hparams.n_layer) {
  1003. case 24: type = LLM_TYPE_1B; break;
  1004. case 32: type = LLM_TYPE_3B; break;
  1005. default: type = LLM_TYPE_UNKNOWN;
  1006. }
  1007. } break;
  1008. case LLM_ARCH_PHI3:
  1009. {
  1010. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1011. switch (hparams.n_layer) {
  1012. case 24: type = LLM_TYPE_1B; break;
  1013. case 32: type = LLM_TYPE_3B; break;
  1014. case 40: type = LLM_TYPE_14B; break;
  1015. default: type = LLM_TYPE_UNKNOWN;
  1016. }
  1017. const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
  1018. if (found_swa && hparams.n_swa > 0) {
  1019. LLAMA_LOG_WARN("%s: Phi SWA is currently disabled - results might be suboptimal for some models (see %s)\n",
  1020. __func__, "https://github.com/ggml-org/llama.cpp/pull/13676");
  1021. // TODO: fix conversion scripts to correctly populate `n_swa` and `n_swa_pattern`
  1022. hparams.swa_type = LLAMA_SWA_TYPE_NONE;
  1023. hparams.n_swa = 0;
  1024. hparams.set_swa_pattern(1);
  1025. }
  1026. } break;
  1027. case LLM_ARCH_PHIMOE:
  1028. {
  1029. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1030. switch (hparams.n_layer) {
  1031. case 32: type = LLM_TYPE_16x3_8B; break;
  1032. default: type = LLM_TYPE_UNKNOWN;
  1033. }
  1034. } break;
  1035. case LLM_ARCH_PLAMO:
  1036. {
  1037. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1038. switch (hparams.n_layer) {
  1039. case 40: type = LLM_TYPE_13B; break;
  1040. default: type = LLM_TYPE_UNKNOWN;
  1041. }
  1042. } break;
  1043. case LLM_ARCH_PLAMO2:
  1044. {
  1045. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1046. // Load Mamba SSM parameters
  1047. ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv);
  1048. ml.get_key(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner);
  1049. ml.get_key(LLM_KV_SSM_STATE_SIZE, hparams.ssm_d_state);
  1050. ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
  1051. ml.get_key(LLM_KV_SSM_GROUP_COUNT, hparams.ssm_n_group);
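// a layer is recurrent (Mamba) iff its n_head_kv is 0; the remaining layers use attention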
  1052. for (uint32_t i = 0; i < hparams.n_layer; ++i) {
  1053. hparams.recurrent_layer_arr[i] = hparams.n_head_kv(i) == 0;
  1054. }
  1055. switch (hparams.n_layer) {
  1056. case 16: type = LLM_TYPE_1B; break;
  1057. case 32:
  1058. if (hparams.n_embd == 2048) {
  1059. type = LLM_TYPE_2B;
  1060. } else if (hparams.n_embd == 4096) {
  1061. type = LLM_TYPE_8B;
  1062. }
  1063. break;
  1064. default: type = LLM_TYPE_UNKNOWN;
  1065. }
  1066. // Load attention parameters
  1067. ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH, hparams.n_embd_head_k, false);
  1068. ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH, hparams.n_embd_head_v, false);
  1069. } break;
  1070. case LLM_ARCH_GPT2:
  1071. {
  1072. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  1073. switch (hparams.n_layer) {
  1074. case 12: type = LLM_TYPE_SMALL; break;
  1075. case 24: type = LLM_TYPE_MEDIUM; break;
  1076. case 36: type = LLM_TYPE_LARGE; break;
  1077. case 48: type = LLM_TYPE_XL; break;
  1078. default: type = LLM_TYPE_UNKNOWN;
  1079. }
  1080. } break;
  1081. case LLM_ARCH_CODESHELL:
  1082. {
  1083. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  1084. switch (hparams.n_layer) {
  1085. case 42: type = LLM_TYPE_7B; break;
  1086. default: type = LLM_TYPE_UNKNOWN;
  1087. }
  1088. } break;
  1089. case LLM_ARCH_ORION:
  1090. {
  1091. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  1092. switch (hparams.n_layer) {
  1093. case 40: type = LLM_TYPE_14B; break;
  1094. default: type = LLM_TYPE_UNKNOWN;
  1095. }
  1096. } break;
  1097. case LLM_ARCH_INTERNLM2:
  1098. {
  1099. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1100. switch (hparams.n_layer) {
  1101. case 32: type = LLM_TYPE_7B; break;
  1102. case 48: type = LLM_TYPE_20B; break;
  1103. default: type = LLM_TYPE_UNKNOWN;
  1104. }
  1105. } break;
  1106. case LLM_ARCH_GEMMA:
  1107. {
  1108. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1109. switch (hparams.n_layer) {
  1110. case 18: type = LLM_TYPE_2B; break;
  1111. case 28: type = LLM_TYPE_7B; break;
  1112. default: type = LLM_TYPE_UNKNOWN;
  1113. }
  1114. } break;
  1115. case LLM_ARCH_GEMMA2:
  1116. {
  1117. hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
  1118. hparams.n_swa = 4096; // default value of gemma 2
  1119. hparams.set_swa_pattern(2);
  1120. hparams.attn_soft_cap = true;
  1121. ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
  1122. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1123. ml.get_key(LLM_KV_ATTN_LOGIT_SOFTCAPPING, hparams.f_attn_logit_softcapping, false);
  1124. ml.get_key(LLM_KV_FINAL_LOGIT_SOFTCAPPING, hparams.f_final_logit_softcapping, false);
  1125. switch (hparams.n_layer) {
  1126. case 26: type = LLM_TYPE_2B; break;
  1127. case 42: type = LLM_TYPE_9B; break;
  1128. case 46: type = LLM_TYPE_27B; break;
  1129. default: type = LLM_TYPE_UNKNOWN;
  1130. }
  1131. // ref: https://github.com/google/gemma_pytorch/blob/014acb7ac4563a5f77c76d7ff98f31b568c16508/gemma/config.py#L173
  1132. hparams.f_attention_scale = type == LLM_TYPE_27B
  1133. ? 1.0f / std::sqrt(float(hparams.n_embd / hparams.n_head(0)))
  1134. : 1.0f / std::sqrt(float(hparams.n_embd_head_k));
  1135. } break;
  1136. case LLM_ARCH_GEMMA3:
  1137. {
  1138. const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
  1139. if (found_swa && hparams.n_swa > 0) {
  1140. hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
  1141. hparams.set_swa_pattern(6);
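// sliding-window layers keep a fixed local rope (base 10000, no scaling)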
  1142. hparams.rope_freq_base_train_swa = 10000.0f;
  1143. hparams.rope_freq_scale_train_swa = 1.0f;
  1144. } else {
  1145. hparams.swa_type = LLAMA_SWA_TYPE_NONE;
  1146. }
  1147. hparams.f_final_logit_softcapping = 0.0f;
  1148. ml.get_key(LLM_KV_FINAL_LOGIT_SOFTCAPPING, hparams.f_final_logit_softcapping, false);
  1149. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1150. switch (hparams.n_layer) {
  1151. case 18: type = LLM_TYPE_270M; break;
  1152. case 26: type = LLM_TYPE_1B; break;
  1153. case 32: type = LLM_TYPE_8B; break; // Rnj-1
  1154. case 34: type = LLM_TYPE_4B; break;
  1155. case 48: type = LLM_TYPE_12B; break;
  1156. case 62: type = LLM_TYPE_27B; break;
  1157. default: type = LLM_TYPE_UNKNOWN;
  1158. }
  1159. // ref: https://github.com/google/gemma_pytorch/blob/014acb7ac4563a5f77c76d7ff98f31b568c16508/gemma/config.py#L289
  1160. hparams.f_attention_scale = type == LLM_TYPE_27B
  1161. ? 1.0f / std::sqrt(float(hparams.n_embd / hparams.n_head(0)))
  1162. : 1.0f / std::sqrt(float(hparams.n_embd_head_k));
  1163. } break;
  1164. case LLM_ARCH_GEMMA3N:
  1165. {
  1166. hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
  1167. hparams.set_swa_pattern(5);
  1168. hparams.n_layer_kv_from_start = 20;
  1169. hparams.rope_freq_base_train_swa = 10000.0f;
  1170. hparams.rope_freq_scale_train_swa = 1.0f;
  1171. hparams.f_attention_scale = 1.0f;
  1172. ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
  1173. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1174. switch (hparams.n_layer) {
  1175. case 30: type = LLM_TYPE_E2B; break;
  1176. case 35: type = LLM_TYPE_E4B; break;
  1177. default: type = LLM_TYPE_UNKNOWN;
  1178. }
  1179. } break;
  1180. case LLM_ARCH_GEMMA_EMBEDDING:
  1181. {
  1182. hparams.swa_type = LLAMA_SWA_TYPE_SYMMETRIC;
  1183. hparams.set_swa_pattern(6);
  1184. hparams.causal_attn = false; // embeddings do not use causal attention
  1185. hparams.rope_freq_base_train_swa = 10000.0f;
  1186. hparams.rope_freq_scale_train_swa = 1.0f;
  1187. ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
  1188. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1189. ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
1190. // applied only if the model was converted with --sentence-transformers-dense-modules
  1191. ml.get_key(LLM_KV_DENSE_2_FEAT_IN, hparams.dense_2_feat_in, false);
  1192. ml.get_key(LLM_KV_DENSE_2_FEAT_OUT, hparams.dense_2_feat_out, false);
  1193. ml.get_key(LLM_KV_DENSE_3_FEAT_IN, hparams.dense_3_feat_in, false);
  1194. ml.get_key(LLM_KV_DENSE_3_FEAT_OUT, hparams.dense_3_feat_out, false);
  1195. GGML_ASSERT((hparams.dense_2_feat_in == 0 || hparams.dense_2_feat_in == hparams.n_embd) && "dense_2_feat_in must be equal to n_embd");
  1196. GGML_ASSERT((hparams.dense_3_feat_out == 0 || hparams.dense_3_feat_out == hparams.n_embd) && "dense_3_feat_out must be equal to n_embd");
  1197. switch (hparams.n_layer) {
  1198. case 24: type = LLM_TYPE_0_3B; break;
  1199. default: type = LLM_TYPE_UNKNOWN;
  1200. }
  1201. hparams.f_attention_scale = 1.0f / std::sqrt(float(hparams.n_embd_head_k));
  1202. } break;
  1203. case LLM_ARCH_STARCODER2:
  1204. {
  1205. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  1206. switch (hparams.n_layer) {
  1207. case 30: type = LLM_TYPE_3B; break;
  1208. case 32: type = LLM_TYPE_7B; break;
  1209. case 40: type = LLM_TYPE_15B; break;
  1210. case 52: type = LLM_TYPE_20B; break; // granite
  1211. case 88: type = LLM_TYPE_34B; break; // granite
  1212. default: type = LLM_TYPE_UNKNOWN;
  1213. }
  1214. } break;
  1215. case LLM_ARCH_MAMBA:
  1216. {
  1217. ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv);
  1218. ml.get_key(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner);
  1219. ml.get_key(LLM_KV_SSM_STATE_SIZE, hparams.ssm_d_state);
  1220. ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
  1221. ml.get_key(LLM_KV_SSM_DT_B_C_RMS, hparams.ssm_dt_b_c_rms, false);
  1222. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1223. switch (hparams.n_layer) {
  1224. case 24:
  1225. switch (hparams.n_embd) {
  1226. case 768: type = LLM_TYPE_SMALL; break;
  1227. default: type = LLM_TYPE_UNKNOWN;
  1228. } break;
  1229. case 48:
  1230. switch (hparams.n_embd) {
  1231. case 1024: type = LLM_TYPE_MEDIUM; break;
  1232. case 1536: type = LLM_TYPE_LARGE; break;
  1233. case 2048: type = LLM_TYPE_XL; break;
  1234. default: type = LLM_TYPE_UNKNOWN;
  1235. } break;
  1236. case 64:
  1237. switch (hparams.n_embd) {
  1238. case 2560: type = LLM_TYPE_3B; break;
  1239. default: type = LLM_TYPE_UNKNOWN;
  1240. } break;
  1241. default: type = LLM_TYPE_UNKNOWN;
  1242. }
  1243. } break;
  1244. case LLM_ARCH_MAMBA2:
  1245. {
  1246. ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv);
  1247. ml.get_key(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner);
  1248. ml.get_key(LLM_KV_SSM_STATE_SIZE, hparams.ssm_d_state);
  1249. ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
  1250. ml.get_key(LLM_KV_SSM_GROUP_COUNT, hparams.ssm_n_group);
  1251. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1252. switch (hparams.n_layer) {
  1253. case 24:
  1254. switch (hparams.n_embd) {
  1255. case 768: type = LLM_TYPE_SMALL; break;
  1256. default: type = LLM_TYPE_UNKNOWN;
  1257. } break;
  1258. case 48:
  1259. switch (hparams.n_embd) {
  1260. case 1024: type = LLM_TYPE_MEDIUM; break;
  1261. case 1536: type = LLM_TYPE_LARGE; break;
  1262. case 2048: type = LLM_TYPE_XL; break;
  1263. default: type = LLM_TYPE_UNKNOWN;
  1264. } break;
  1265. case 64:
  1266. switch (hparams.n_embd) {
  1267. case 2560: type = LLM_TYPE_3B; break;
  1268. case 4096: type = LLM_TYPE_7B; break;
  1269. default: type = LLM_TYPE_UNKNOWN;
  1270. } break;
  1271. default: type = LLM_TYPE_UNKNOWN;
  1272. }
  1273. } break;
  1274. case LLM_ARCH_JAMBA:
  1275. {
  1276. ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv);
  1277. ml.get_key(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner);
  1278. ml.get_key(LLM_KV_SSM_STATE_SIZE, hparams.ssm_d_state);
  1279. ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
  1280. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
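// mark layers without KV heads as recurrent (SSM) layers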
  1281. for (uint32_t i = 0; i < hparams.n_layer; ++i) {
  1282. hparams.recurrent_layer_arr[i] = hparams.n_head_kv(i) == 0;
  1283. }
  1284. switch (hparams.n_layer) {
1285. // TODO: Jamba layers are a bit heterogeneous, so naming this is hard.
  1286. case 12: // 900M 8x???M
  1287. case 32: // 51B 16x?B
  1288. default: type = LLM_TYPE_UNKNOWN;
  1289. }
  1290. } break;
  1291. case LLM_ARCH_XVERSE:
  1292. {
  1293. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1294. switch (hparams.n_layer) {
  1295. case 32: type = LLM_TYPE_7B; break;
  1296. case 40: type = LLM_TYPE_13B; break;
  1297. case 80: type = LLM_TYPE_65B; break;
  1298. default: type = LLM_TYPE_UNKNOWN;
  1299. }
  1300. } break;
  1301. case LLM_ARCH_COMMAND_R:
  1302. {
  1303. ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
  1304. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  1305. switch (hparams.n_layer) {
  1306. case 40: type = LLM_TYPE_35B; break;
  1307. default: type = LLM_TYPE_UNKNOWN;
  1308. }
  1309. } break;
  1310. case LLM_ARCH_COHERE2:
  1311. {
  1312. hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
  1313. hparams.set_swa_pattern(4);
  1314. ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
  1315. ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
  1316. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  1317. switch (hparams.n_layer) {
  1318. case 32: type = LLM_TYPE_8B; break;
  1319. default: type = LLM_TYPE_UNKNOWN;
  1320. }
  1321. } break;
  1322. case LLM_ARCH_DBRX:
  1323. {
  1324. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  1325. ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV, hparams.f_clamp_kqv);
  1326. switch (hparams.n_layer) {
  1327. case 40: type = LLM_TYPE_16x12B; break;
  1328. default: type = LLM_TYPE_UNKNOWN;
  1329. }
  1330. } break;
  1331. case LLM_ARCH_OLMO:
  1332. {
  1333. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  1334. ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV, hparams.f_clamp_kqv, false);
  1335. switch (hparams.n_layer) {
  1336. case 22: type = LLM_TYPE_1B; break;
  1337. case 32: type = LLM_TYPE_7B; break;
  1338. case 80: type = LLM_TYPE_70B; break;
  1339. default: type = LLM_TYPE_UNKNOWN;
  1340. }
  1341. } break;
  1342. case LLM_ARCH_OLMO2:
  1343. {
  1344. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1345. const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
  1346. if (found_swa && hparams.n_swa > 0) {
  1347. hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
  1348. hparams.set_swa_pattern(4);
  1349. } else {
  1350. hparams.swa_type = LLAMA_SWA_TYPE_NONE;
  1351. }
  1352. switch (hparams.n_layer) {
  1353. case 16: type = LLM_TYPE_1B; break;
  1354. case 32: type = LLM_TYPE_7B; break;
  1355. case 40: type = LLM_TYPE_13B; break;
  1356. case 64: type = LLM_TYPE_32B; break;
  1357. default: type = LLM_TYPE_UNKNOWN;
  1358. }
  1359. } break;
  1360. case LLM_ARCH_SEED_OSS:
  1361. {
  1362. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1363. switch (hparams.n_layer) {
  1364. case 64: type = LLM_TYPE_36B; break;
  1365. default: type = LLM_TYPE_UNKNOWN;
  1366. }
  1367. } break;
  1368. case LLM_ARCH_OLMOE:
  1369. {
  1370. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1371. switch (hparams.n_layer) {
  1372. case 16: type = LLM_TYPE_A1_7B; break;
  1373. default: type = LLM_TYPE_UNKNOWN;
  1374. }
  1375. } break;
  1376. case LLM_ARCH_OPENELM:
  1377. {
  1378. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1379. switch (hparams.n_layer) {
  1380. case 16: type = LLM_TYPE_270M; break;
  1381. case 20: type = LLM_TYPE_450M; break;
  1382. case 28: type = LLM_TYPE_1B; break;
  1383. case 36: type = LLM_TYPE_3B; break;
  1384. default: type = LLM_TYPE_UNKNOWN;
  1385. }
  1386. } break;
  1387. case LLM_ARCH_GPTNEOX:
  1388. {
  1389. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  1390. ml.get_key(LLM_KV_USE_PARALLEL_RESIDUAL, hparams.use_par_res);
  1391. switch (hparams.n_layer) {
  1392. case 6:
  1393. switch (hparams.n_ff()) {
  1394. case 512: type = LLM_TYPE_14M; break;
  1395. case 2048: type = LLM_TYPE_70M; break;
  1396. default: type = LLM_TYPE_UNKNOWN;
  1397. } break;
  1398. case 12:
  1399. switch (hparams.n_ff()) {
  1400. case 3072: type = LLM_TYPE_160M; break;
  1401. default: type = LLM_TYPE_UNKNOWN;
  1402. } break;
  1403. case 16:
  1404. switch (hparams.n_ff()) {
  1405. case 8192: type = LLM_TYPE_1B; break;
  1406. default: type = LLM_TYPE_UNKNOWN;
  1407. } break;
  1408. case 24:
  1409. switch (hparams.n_ff()) {
  1410. case 4096: type = LLM_TYPE_410M; break;
  1411. case 8192: type = LLM_TYPE_1_4B; break;
  1412. default: type = LLM_TYPE_UNKNOWN;
  1413. } break;
  1414. case 32:
  1415. switch (hparams.n_ff()) {
  1416. case 10240: type = LLM_TYPE_2_8B; break;
  1417. case 16384: type = LLM_TYPE_6_9B; break;
  1418. default: type = LLM_TYPE_UNKNOWN;
  1419. } break;
  1420. case 36:
  1421. switch (hparams.n_ff()) {
  1422. case 20480: type = LLM_TYPE_12B; break;
  1423. default: type = LLM_TYPE_UNKNOWN;
  1424. } break;
  1425. case 44:
  1426. switch (hparams.n_ff()) {
  1427. case 24576: type = LLM_TYPE_20B; break;
  1428. default: type = LLM_TYPE_UNKNOWN;
  1429. } break;
  1430. default: type = LLM_TYPE_UNKNOWN;
  1431. }
  1432. } break;
  1433. case LLM_ARCH_ARCTIC:
  1434. {
  1435. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1436. if (hparams.n_expert == 128) {
  1437. switch (hparams.n_layer) {
  1438. case 35: type = LLM_TYPE_10B_128x3_66B; break;
  1439. default: type = LLM_TYPE_UNKNOWN;
  1440. }
  1441. } else {
  1442. type = LLM_TYPE_UNKNOWN;
  1443. }
  1444. } break;
  1445. case LLM_ARCH_DEEPSEEK:
  1446. {
  1447. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1448. ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
  1449. ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
  1450. ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
  1451. ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
  1452. switch (hparams.n_ff_exp) {
  1453. case 1408: type = LLM_TYPE_16B; break;
  1454. case 1792: type = LLM_TYPE_20B; break;
  1455. default: type = LLM_TYPE_UNKNOWN;
  1456. }
  1457. } break;
  1458. case LLM_ARCH_DEEPSEEK2:
  1459. {
  1460. // lite variants include DeepSeek-V2-Lite, GigaChat3-10B-A1.8B
  1461. bool is_lite = (hparams.n_layer == 27 || hparams.n_layer == 26);
  1462. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1463. ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
  1464. if (!is_lite) {
  1465. ml.get_key(LLM_KV_ATTENTION_Q_LORA_RANK, hparams.n_lora_q);
  1466. }
  1467. ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK, hparams.n_lora_kv);
  1468. ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH_MLA, hparams.n_embd_head_k_mla, false);
  1469. ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH_MLA, hparams.n_embd_head_v_mla, false);
  1470. ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
  1471. ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
  1472. ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
  1473. ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
  1474. ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false);
  1475. if (hparams.expert_gating_func == LLAMA_EXPERT_GATING_FUNC_TYPE_NONE) {
  1476. // for compatibility with existing DeepSeek V2 and V2.5 GGUFs
  1477. // that have no expert_gating_func model parameter set
  1478. hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX;
  1479. }
  1480. if (ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul, 0.0f)) {
  1481. // [TAG_DEEPSEEK2_YARN_LOG_MUL_FIX]
  1482. // cancel the factor from the convert script
  1483. hparams.rope_yarn_log_mul /= 0.1f;
  1484. }
  1485. // (optional) temperature tuning - used by mistral-large
  1486. ml.get_key(LLM_KV_ATTENTION_TEMPERATURE_SCALE, hparams.f_attn_temp_scale, false);
  1487. ml.get_key(LLM_KV_ATTENTION_TEMPERATURE_LENGTH, hparams.n_attn_temp_floor_scale, false);
  1488. hparams.f_attn_temp_offset = 0.0f;
  1489. switch (hparams.n_layer) {
  1490. case 27: type = LLM_TYPE_16B; break;
  1491. case 60: type = LLM_TYPE_236B; break;
  1492. case 61: type = LLM_TYPE_671B; break;
  1493. default: type = LLM_TYPE_UNKNOWN;
  1494. }
  1495. } break;
  1496. case LLM_ARCH_PLM:
  1497. {
  1498. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1499. ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK, hparams.n_lora_kv);
  1500. switch (hparams.n_layer) {
  1501. case 32: type = LLM_TYPE_1_8B; break;
  1502. default: type = LLM_TYPE_UNKNOWN;
  1503. }
  1504. } break;
  1505. case LLM_ARCH_CHATGLM:
  1506. {
  1507. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1508. switch (hparams.n_layer) {
  1509. case 28: {
  1510. if (hparams.n_head(0) == 16) {
  1511. type = LLM_TYPE_1_5B;
  1512. } else {
  1513. type = LLM_TYPE_6B;
  1514. }
  1515. } break;
  1516. case 40: {
  1517. if (hparams.n_head(0) == 24) {
  1518. type = LLM_TYPE_4B;
  1519. } else {
  1520. type = LLM_TYPE_9B;
  1521. }
  1522. } break;
  1523. default: type = LLM_TYPE_UNKNOWN;
  1524. }
  1525. } break;
  1526. case LLM_ARCH_GLM4:
  1527. {
  1528. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1529. switch (hparams.n_layer) {
  1530. case 40: type = LLM_TYPE_9B; break;
  1531. case 61: type = LLM_TYPE_32B; break;
  1532. default: type = LLM_TYPE_UNKNOWN;
  1533. }
  1534. } break;
  1535. case LLM_ARCH_GLM4_MOE:
  1536. {
  1537. ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
  1538. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1539. // MoE parameters
  1540. ml.get_key(LLM_KV_EXPERT_COUNT, hparams.n_expert);
  1541. ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used);
  1542. ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
  1543. ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead, false);
  1544. ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
  1545. ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
  1546. // Expert gating function (GLM-4.5 uses sigmoid)
  1547. ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false);
  1548. if (hparams.expert_gating_func == LLAMA_EXPERT_GATING_FUNC_TYPE_NONE) {
  1549. hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID;
  1550. }
  1551. // NextN/MTP parameters
  1552. ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.nextn_predict_layers, false);
1553. // TODO: revisit this once MTP is implemented
  1554. hparams.n_layer_kv_from_start = hparams.n_layer - hparams.nextn_predict_layers;
  1555. switch (hparams.n_layer) {
  1556. case 47: type = LLM_TYPE_106B_A12B; break; // GLM-4.5-Air (46 layers + 1 NextN layer)
  1557. case 93: type = LLM_TYPE_355B_A32B; break; // GLM-4.5 (92 layers + 1 NextN layer)
  1558. default: type = LLM_TYPE_UNKNOWN;
  1559. }
  1560. } break;
  1561. case LLM_ARCH_BITNET:
  1562. {
  1563. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1564. switch (hparams.n_layer) {
  1565. case 26: type = LLM_TYPE_3B; break;
  1566. default: type = LLM_TYPE_UNKNOWN;
  1567. }
  1568. } break;
  1569. case LLM_ARCH_T5:
  1570. {
  1571. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1572. ml.get_key(LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, hparams.n_rel_attn_bkts);
  1573. uint32_t dec_start_token_id;
  1574. if (ml.get_key(LLM_KV_DECODER_START_TOKEN_ID, dec_start_token_id, false)) {
  1575. hparams.dec_start_token_id = dec_start_token_id;
  1576. }
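// the decoder layer count defaults to the encoder's n_layer unless overridden by the KV below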
  1577. hparams.dec_n_layer = hparams.n_layer;
  1578. ml.get_key(LLM_KV_DECODER_BLOCK_COUNT, hparams.dec_n_layer, false);
  1579. switch (hparams.n_layer) {
  1580. case 6: type = LLM_TYPE_60M; break; // t5-small
  1581. case 8: type = LLM_TYPE_80M; break; // flan-t5-small
  1582. case 12:
  1583. switch (hparams.n_ff()) {
  1584. case 3072: type = LLM_TYPE_220M; break; // t5-base
  1585. case 2048: type = LLM_TYPE_250M; break; // flan-t5-base
  1586. default: type = LLM_TYPE_UNKNOWN;
  1587. } break;
  1588. case 24:
  1589. switch (hparams.n_ff()) {
  1590. case 4096: type = LLM_TYPE_770M; break; // t5-large
  1591. case 2816: type = LLM_TYPE_780M; break; // flan-t5-large
  1592. case 16384: type = LLM_TYPE_3B; break; // t5-3b
  1593. case 5120: type = LLM_TYPE_3B; break; // flan-t5-xl
  1594. case 65536: type = LLM_TYPE_11B; break; // t5-11b
  1595. case 10240: type = LLM_TYPE_11B; break; // flan-t5-xxl
  1596. default: type = LLM_TYPE_UNKNOWN;
  1597. } break;
  1598. default: type = LLM_TYPE_UNKNOWN;
  1599. }
  1600. } break;
  1601. case LLM_ARCH_T5ENCODER:
  1602. {
  1603. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1604. ml.get_key(LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, hparams.n_rel_attn_bkts);
  1605. type = LLM_TYPE_UNKNOWN;
  1606. } break;
  1607. case LLM_ARCH_JAIS:
  1608. {
  1609. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  1610. ml.get_key(LLM_KV_ATTENTION_MAX_ALIBI_BIAS, hparams.f_max_alibi_bias);
  1611. switch (hparams.n_layer) {
  1612. case 24: type = LLM_TYPE_1_3B; break;
  1613. case 40: type = LLM_TYPE_13B; break;
  1614. /* TODO: add variants */
  1615. default: type = LLM_TYPE_UNKNOWN;
  1616. }
  1617. } break;
  1618. case LLM_ARCH_NEMOTRON:
  1619. {
  1620. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  1621. switch (hparams.n_layer) {
  1622. case 32: type = LLM_TYPE_4B; break;
  1623. default: type = LLM_TYPE_UNKNOWN;
  1624. }
  1625. } break;
  1626. case LLM_ARCH_NEMOTRON_H:
  1627. {
  1628. ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv);
  1629. ml.get_key(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner);
  1630. ml.get_key(LLM_KV_SSM_STATE_SIZE, hparams.ssm_d_state);
  1631. ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
  1632. ml.get_key(LLM_KV_SSM_GROUP_COUNT, hparams.ssm_n_group);
  1633. // A layer is recurrent IFF the n_head_kv value is set to 0 and
  1634. // the n_ff value is set to 0
  1635. for (uint32_t i = 0; i < hparams.n_layer; ++i) {
  1636. hparams.recurrent_layer_arr[i] = (hparams.n_head_kv(i) == 0 && hparams.n_ff(i) == 0);
  1637. }
  1638. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1639. switch (hparams.n_layer) {
  1640. case 56: type = LLM_TYPE_9B; break;
  1641. default: type = LLM_TYPE_UNKNOWN;
  1642. }
  1643. } break;
  1644. case LLM_ARCH_EXAONE:
  1645. {
  1646. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1647. switch (hparams.n_layer) {
  1648. case 32: type = LLM_TYPE_8B; break;
  1649. default: type = LLM_TYPE_UNKNOWN;
  1650. }
  1651. } break;
  1652. case LLM_ARCH_EXAONE4:
  1653. {
  1654. if (hparams.n_layer == 64) { // 32B
  1655. hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
  1656. hparams.n_swa = 4096;
  1657. hparams.set_swa_pattern(4);
  1658. }
  1659. ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
  1660. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1661. switch (hparams.n_layer) {
  1662. case 30: type = LLM_TYPE_1_2B; break;
  1663. case 64: type = LLM_TYPE_32B; break;
  1664. default: type = LLM_TYPE_UNKNOWN;
  1665. }
  1666. } break;
  1667. case LLM_ARCH_RWKV6:
  1668. case LLM_ARCH_RWKV6QWEN2:
  1669. {
  1670. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps, false);
  1671. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps, false);
  1672. ml.get_key(LLM_KV_WKV_HEAD_SIZE, hparams.wkv_head_size);
  1673. ml.get_key(LLM_KV_TIME_MIX_EXTRA_DIM, hparams.time_mix_extra_dim);
  1674. ml.get_key(LLM_KV_TIME_DECAY_EXTRA_DIM, hparams.time_decay_extra_dim);
  1675. ml.get_key(LLM_KV_RESCALE_EVERY_N_LAYERS, hparams.rescale_every_n_layers, false);
  1676. ml.get_key(LLM_KV_TOKEN_SHIFT_COUNT, hparams.token_shift_count, false);
  1677. switch (hparams.n_layer) {
  1678. case 24: type = LLM_TYPE_1_6B; break;
  1679. case 32:
  1680. switch (hparams.n_embd) {
  1681. case 2560: type = LLM_TYPE_3B; break;
  1682. case 4096: type = LLM_TYPE_7B; break;
  1683. default: type = LLM_TYPE_UNKNOWN;
  1684. } break;
  1685. case 61: type = LLM_TYPE_14B; break;
  1686. case 64: type = LLM_TYPE_32B; break;
  1687. default: type = LLM_TYPE_UNKNOWN;
  1688. }
  1689. } break;
  1690. case LLM_ARCH_RWKV7:
  1691. case LLM_ARCH_ARWKV7:
  1692. {
  1693. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps, false);
  1694. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps, false);
  1695. ml.get_key(LLM_KV_WKV_HEAD_SIZE, hparams.wkv_head_size);
  1696. ml.get_key(LLM_KV_ATTENTION_DECAY_LORA_RANK, hparams.n_lora_decay);
  1697. ml.get_key(LLM_KV_ATTENTION_ICLR_LORA_RANK, hparams.n_lora_iclr);
  1698. ml.get_key(LLM_KV_ATTENTION_VALUE_RESIDUAL_MIX_LORA_RANK, hparams.n_lora_value_res_mix);
  1699. ml.get_key(LLM_KV_ATTENTION_GATE_LORA_RANK, hparams.n_lora_gate, false);
  1700. ml.get_key(LLM_KV_TOKEN_SHIFT_COUNT, hparams.token_shift_count, false);
  1701. switch (hparams.n_layer) {
  1702. case 12:
  1703. switch (hparams.n_embd) {
  1704. case 768: type = LLM_TYPE_190M; break;
  1705. default: type = LLM_TYPE_UNKNOWN;
  1706. } break;
  1707. case 24:
  1708. switch (hparams.n_embd) {
  1709. case 1024: type = LLM_TYPE_450M; break;
  1710. case 2048: type = LLM_TYPE_1_5B; break;
  1711. default: type = LLM_TYPE_UNKNOWN;
  1712. } break;
  1713. case 28:
  1714. switch (hparams.n_embd) {
  1715. case 1536: type = LLM_TYPE_1_5B; break;
  1716. case 3584: type = LLM_TYPE_7B; break;
  1717. default: type = LLM_TYPE_UNKNOWN;
  1718. } break;
  1719. case 32:
  1720. switch (hparams.n_embd) {
  1721. case 2560: type = LLM_TYPE_2_9B; break;
  1722. case 4096: type = LLM_TYPE_7B; break;
  1723. default: type = LLM_TYPE_UNKNOWN;
  1724. } break;
  1725. case 61:
  1726. switch (hparams.n_embd) {
  1727. case 4096: type = LLM_TYPE_14B; break;
  1728. default: type = LLM_TYPE_UNKNOWN;
  1729. } break;
  1730. default: type = LLM_TYPE_UNKNOWN;
  1731. }
  1732. } break;
  1733. case LLM_ARCH_GRANITE:
  1734. case LLM_ARCH_GRANITE_MOE:
  1735. {
  1736. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1737. ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
  1738. ml.get_key(LLM_KV_RESIDUAL_SCALE, hparams.f_residual_scale);
  1739. ml.get_key(LLM_KV_EMBEDDING_SCALE, hparams.f_embedding_scale);
  1740. ml.get_key(LLM_KV_ATTENTION_SCALE, hparams.f_attention_scale);
  1741. // Granite uses rope_finetuned as a switch for rope, so default to true
  1742. bool rope_finetuned = true;
  1743. ml.get_key(LLM_KV_ROPE_SCALING_FINETUNED, rope_finetuned, false);
  1744. hparams.rope_finetuned = rope_finetuned;
  1745. switch (hparams.n_layer) {
  1746. case 32: type = LLM_TYPE_3B; break;
  1747. case 40: type = LLM_TYPE_3B; break;
  1748. // Add additional layer/vocab/etc checks here for other model sizes
  1749. default: type = LLM_TYPE_UNKNOWN;
  1750. }
  1751. // For Granite MoE Shared
  1752. ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, /* required */ false);
  1753. } break;
  1754. case LLM_ARCH_GRANITE_HYBRID:
  1755. {
  1756. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1757. ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale, /* required */ false);
  1758. ml.get_key(LLM_KV_RESIDUAL_SCALE, hparams.f_residual_scale, /* required */ false);
  1759. ml.get_key(LLM_KV_EMBEDDING_SCALE, hparams.f_embedding_scale, /* required */ false);
  1760. ml.get_key(LLM_KV_ATTENTION_SCALE, hparams.f_attention_scale, /* required */ false);
  1761. ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv);
  1762. ml.get_key(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner);
  1763. ml.get_key(LLM_KV_SSM_STATE_SIZE, hparams.ssm_d_state);
  1764. ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
  1765. ml.get_key(LLM_KV_SSM_GROUP_COUNT, hparams.ssm_n_group);
  1766. // Granite uses rope_finetuned as a switch for rope, so default to true
  1767. bool rope_finetuned = true;
  1768. ml.get_key(LLM_KV_ROPE_SCALING_FINETUNED, rope_finetuned, false);
  1769. hparams.rope_finetuned = rope_finetuned;
  1770. // A layer is recurrent IFF the n_head_kv value is set to 0
  1771. for (uint32_t i = 0; i < hparams.n_layer; ++i) {
  1772. hparams.recurrent_layer_arr[i] = hparams.n_head_kv(i) == 0;
  1773. }
  1774. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1775. switch (hparams.n_embd) {
  1776. case 768: type = LLM_TYPE_350M; break;
  1777. case 1536: type = (hparams.n_embd == 2048 ? LLM_TYPE_7B_A1B : LLM_TYPE_1B); break;
  1778. case 2048: case 2560: type = LLM_TYPE_3B; break;
  1779. case 4096: type = LLM_TYPE_32B; break;
  1780. default: type = LLM_TYPE_UNKNOWN;
  1781. }
  1782. // For Granite MoE Shared
  1783. ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, /* required */ false);
  1784. } break;
  1785. case LLM_ARCH_CHAMELEON:
  1786. {
  1787. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1788. hparams.f_norm_eps = 1e-5; // eps for qk-norm, torch default
  1789. ml.get_key(LLM_KV_SWIN_NORM, hparams.swin_norm);
  1790. switch (hparams.n_layer) {
  1791. case 32: type = LLM_TYPE_7B; break;
  1792. case 48: type = LLM_TYPE_34B; break;
  1793. default: type = LLM_TYPE_UNKNOWN;
  1794. }
  1795. } break;
  1796. case LLM_ARCH_WAVTOKENIZER_DEC:
  1797. {
  1798. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  1799. ml.get_key(LLM_KV_ATTENTION_GROUPNORM_EPS, hparams.f_norm_group_eps);
  1800. ml.get_key(LLM_KV_ATTENTION_GROUPNORM_GROUPS, hparams.n_norm_groups);
  1801. ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
  1802. } break;
  1803. case LLM_ARCH_BAILINGMOE:
  1804. {
  1805. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1806. ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
  1807. ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
  1808. ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
  1809. ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
  1810. ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
  1811. switch (hparams.n_layer) {
  1812. case 28: type = LLM_TYPE_16B; break;
  1813. case 88: type = LLM_TYPE_290B; break;
  1814. default: type = LLM_TYPE_UNKNOWN;
  1815. }
  1816. } break;
  1817. case LLM_ARCH_BAILINGMOE2:
  1818. {
  1819. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1820. ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
  1821. ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
  1822. ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp);
  1823. ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
  1824. ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
  1825. ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
  1826. ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func);
  1827. ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.nextn_predict_layers, false);
1828. // TODO: revisit this once MTP is implemented
  1829. hparams.n_layer_kv_from_start = hparams.n_layer - hparams.nextn_predict_layers;
  1830. switch (hparams.n_layer) {
  1831. case 20: type = LLM_TYPE_16B_A1B; break;
  1832. case 21: type = LLM_TYPE_16B_A1B; break;
  1833. case 32: type = LLM_TYPE_100B_A6B; break;
  1834. case 33: type = LLM_TYPE_100B_A6B; break;
  1835. default: type = LLM_TYPE_UNKNOWN;
  1836. }
  1837. } break;
  1838. case LLM_ARCH_DOTS1:
  1839. {
  1840. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1841. ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
  1842. ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
  1843. ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
  1844. ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
  1845. ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
  1846. ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false);
  1847. switch (hparams.n_layer) {
  1848. case 62: type = LLM_TYPE_142B; break;
  1849. default: type = LLM_TYPE_UNKNOWN;
  1850. }
  1851. } break;
  1852. case LLM_ARCH_ERNIE4_5:
  1853. case LLM_ARCH_ERNIE4_5_MOE:
  1854. {
  1855. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1856. if (arch == LLM_ARCH_ERNIE4_5_MOE) {
  1857. ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
  1858. ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false);
  1859. ml.get_key(LLM_KV_INTERLEAVE_MOE_LAYER_STEP, hparams.n_moe_layer_step);
  1860. ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
  1861. }
  1862. switch (hparams.n_layer) {
  1863. case 18: type = LLM_TYPE_0_3B; break;
  1864. case 28: type = LLM_TYPE_21B_A3B; break;
  1865. case 54: type = LLM_TYPE_300B_A47B; break;
  1866. default: type = LLM_TYPE_UNKNOWN;
  1867. }
  1868. } break;
  1869. case LLM_ARCH_FALCON_H1:
  1870. {
  1871. // Common parameters
  1872. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1873. // SSM parameters
  1874. ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv);
  1875. ml.get_key(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner);
  1876. ml.get_key(LLM_KV_SSM_STATE_SIZE, hparams.ssm_d_state);
  1877. ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
  1878. ml.get_key(LLM_KV_SSM_GROUP_COUNT, hparams.ssm_n_group);
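// all layers maintain recurrent (SSM) state in this hybrid architecture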
  1879. std::fill(hparams.recurrent_layer_arr.begin(), hparams.recurrent_layer_arr.end(), true);
  1880. switch (hparams.n_layer) {
  1881. case 36:
  1882. type = LLM_TYPE_0_5B; break;
  1883. case 24:
  1884. type = LLM_TYPE_1_5B; break;
  1885. case 66:
  1886. type = LLM_TYPE_1B; break;
  1887. case 32:
  1888. type = LLM_TYPE_3B; break;
  1889. case 44:
  1890. type = LLM_TYPE_7B; break;
  1891. case 72:
  1892. type = LLM_TYPE_34B; break;
  1893. default:
  1894. type = LLM_TYPE_UNKNOWN;
  1895. }
  1896. } break;
  1897. case LLM_ARCH_HUNYUAN_MOE:
  1898. {
  1899. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1900. ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
  1901. ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp);
  1902. switch (hparams.n_layer) {
  1903. case 32: type = LLM_TYPE_A13B; break;
  1904. default: type = LLM_TYPE_UNKNOWN;
  1905. }
  1906. } break;
  1907. case LLM_ARCH_HUNYUAN_DENSE:
  1908. {
  1909. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1910. switch (hparams.n_embd) {
  1911. case 1024: type = LLM_TYPE_0_5B; break;
  1912. case 2048: type = LLM_TYPE_1_8B; break;
  1913. case 3072: type = LLM_TYPE_4B; break;
  1914. case 4096: type = LLM_TYPE_7B; break;
  1915. default: type = LLM_TYPE_UNKNOWN;
  1916. }
  1917. } break;
  1918. case LLM_ARCH_SMOLLM3:
  1919. {
  1920. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1921. hparams.n_no_rope_layer_step = 4;
  1922. switch (hparams.n_layer) {
  1923. case 36: type = LLM_TYPE_3B; break;
  1924. default: type = LLM_TYPE_UNKNOWN;
  1925. }
  1926. } break;
  1927. case LLM_ARCH_OPENAI_MOE:
  1928. {
  1929. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1930. ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
  1931. ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
  1932. hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
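// a pattern of 2 alternates sliding-window and full-attention layers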
  1933. hparams.set_swa_pattern(2);
  1934. switch (hparams.n_layer) {
  1935. case 24: type = LLM_TYPE_20B; break;
  1936. case 36: type = LLM_TYPE_120B; break;
  1937. default: type = LLM_TYPE_UNKNOWN;
  1938. }
  1939. } break;
  1940. case LLM_ARCH_LFM2:
  1941. {
  1942. ml.get_key(LLM_KV_SHORTCONV_L_CACHE, hparams.n_shortconv_l_cache);
  1943. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
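// layers without KV heads are the short-convolution (recurrent) layers; the rest use standard attention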
  1944. for (uint32_t il = 0; il < hparams.n_layer; ++il) {
  1945. hparams.recurrent_layer_arr[il] = hparams.n_head_kv(il) == 0;
  1946. }
  1947. hparams.n_layer_dense_lead = hparams.n_layer;
  1948. switch (hparams.n_ff()) {
  1949. case 4608: type = LLM_TYPE_350M; break;
  1950. case 6912: type = LLM_TYPE_700M; break;
  1951. case 8192: type = LLM_TYPE_1_2B; break;
  1952. case 10752: type = LLM_TYPE_2_6B; break;
  1953. default: type = LLM_TYPE_UNKNOWN;
  1954. }
  1955. } break;
  1956. case LLM_ARCH_LFM2MOE:
  1957. {
  1958. ml.get_key(LLM_KV_SHORTCONV_L_CACHE, hparams.n_shortconv_l_cache);
  1959. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1960. ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
  1961. ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
  1962. ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func);
  1963. for (uint32_t il = 0; il < hparams.n_layer; ++il) {
  1964. hparams.recurrent_layer_arr[il] = hparams.n_head_kv(il) == 0;
  1965. }
  1966. type = LLM_TYPE_8B_A1B;
  1967. } break;
  1968. case LLM_ARCH_SMALLTHINKER:
  1969. {
  1970. const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
  1971. if (found_swa && hparams.n_swa > 0) {
  1972. hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
  1973. hparams.n_swa = 4096;
  1974. hparams.set_swa_pattern(4, true);
  1975. } else {
  1976. hparams.swa_type = LLAMA_SWA_TYPE_NONE;
  1977. hparams.n_no_rope_layer_step = hparams.n_layer;
  1978. }
  1979. ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
  1980. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1981. ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false);
  1982. switch (hparams.n_layer) {
  1983. case 32: type = LLM_TYPE_4B; break;
  1984. case 52: type = LLM_TYPE_20B; break;
  1985. default: type = LLM_TYPE_UNKNOWN;
  1986. }
  1987. } break;
  1988. case LLM_ARCH_GROVEMOE:
  1989. {
  1990. ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
  1991. ml.get_key(LLM_KV_EXPERT_CHUNK_FEED_FORWARD_LENGTH, hparams.n_ff_chexp);
  1992. ml.get_key(LLM_KV_EXPERT_GROUP_SCALE, hparams.expert_group_scale);
  1993. ml.get_key(LLM_KV_EXPERTS_PER_GROUP, hparams.n_group_experts);
  1994. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1995. switch (hparams.n_layer) {
  1996. case 48: type = LLM_TYPE_30B_A3B; break;
  1997. default: type = LLM_TYPE_UNKNOWN;
  1998. }
  1999. } break;
  2000. case LLM_ARCH_APERTUS:
  2001. {
  2002. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  2003. ml.get_key_or_arr(LLM_KV_XIELU_ALPHA_N, hparams.xielu_alpha_n, hparams.n_layer);
  2004. ml.get_key_or_arr(LLM_KV_XIELU_ALPHA_P, hparams.xielu_alpha_p, hparams.n_layer);
  2005. ml.get_key_or_arr(LLM_KV_XIELU_BETA, hparams.xielu_beta, hparams.n_layer);
  2006. ml.get_key_or_arr(LLM_KV_XIELU_EPS, hparams.xielu_eps, hparams.n_layer);
  2007. switch (hparams.n_layer) {
  2008. case 32: type = LLM_TYPE_8B; break;
  2009. default: type = LLM_TYPE_UNKNOWN;
  2010. }
  2011. } break;
  2012. case LLM_ARCH_MINIMAX_M2:
  2013. {
  2014. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  2015. ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
  2016. ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false);
  2017. switch (hparams.n_layer) {
  2018. case 62: type = LLM_TYPE_230B_A10B; break;
  2019. default: type = LLM_TYPE_UNKNOWN;
  2020. }
  2021. } break;
  2022. case LLM_ARCH_COGVLM:
  2023. {
  2024. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  2025. switch (hparams.n_layer) {
  2026. case 32: type = LLM_TYPE_13B; break;
  2027. default: type = LLM_TYPE_UNKNOWN;
  2028. }
  2029. } break;
  2030. case LLM_ARCH_PANGU_EMBED:
  2031. {
  2032. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  2033. switch (hparams.n_layer) {
  2034. case 26: type = LLM_TYPE_1B; break; // openPangu-Embedded-1B-V1.1
  2035. case 34: type = LLM_TYPE_7B; break; // openPangu-Embedded-7B-V1.1
  2036. default: type = LLM_TYPE_UNKNOWN;
  2037. }
  2038. } break;
  2039. case LLM_ARCH_QWEN3NEXT:
  2040. {
  2041. ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
  2042. ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false);
  2043. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  2044. // Load linear attention (gated delta net) parameters
  2045. ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv);
  2046. ml.get_key(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner);
  2047. ml.get_key(LLM_KV_SSM_STATE_SIZE, hparams.ssm_d_state);
  2048. ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
  2049. ml.get_key(LLM_KV_SSM_GROUP_COUNT, hparams.ssm_n_group);
  2050. // Mark recurrent layers (linear attention layers)
  2051. for (uint32_t i = 0; i < hparams.n_layer; ++i) {
  2052. hparams.recurrent_layer_arr[i] = ((i + 1) % 4 != 0); // TODO: extract the magic 4 from "full_attention_interval"
  2053. }
  2054. switch (hparams.n_layer) {
  2055. case 48: type = LLM_TYPE_80B_A3B; break;
  2056. default: type = LLM_TYPE_UNKNOWN;
  2057. }
  2058. } break;
  2059. case LLM_ARCH_MISTRAL3:
  2060. {
  2061. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  2062. ml.get_key(LLM_KV_ATTENTION_TEMPERATURE_SCALE, hparams.f_attn_temp_scale, false);
  2063. ml.get_key(LLM_KV_ROPE_SCALING_YARN_BETA_FAST, hparams.yarn_beta_fast, false);
  2064. ml.get_key(LLM_KV_ROPE_SCALING_YARN_BETA_SLOW, hparams.yarn_beta_slow, false);
  2065. ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul, 0.0f);
  2066. hparams.f_attn_temp_offset = 0.0f;
  2067. // TODO: maybe add n_attn_temp_floor_scale as a separate KV?
  2068. if (hparams.f_attn_temp_scale != 0.0f) {
  2069. hparams.n_attn_temp_floor_scale = hparams.n_ctx_orig_yarn;
  2070. if (hparams.n_attn_temp_floor_scale == 0) {
  2071. throw std::runtime_error("invalid n_ctx_orig_yarn for attention temperature scaling");
  2072. }
  2073. }
  2074. switch (hparams.n_layer) {
  2075. case 26: type = LLM_TYPE_3B; break;
  2076. case 34: type = LLM_TYPE_8B; break;
  2077. case 40: type = LLM_TYPE_14B; break;
  2078. default: type = LLM_TYPE_UNKNOWN;
  2079. }
  2080. } break;
  2081. default: throw std::runtime_error("unsupported model architecture");
  2082. }
    pimpl->n_bytes = ml.n_bytes;

    pimpl->desc_str = arch_name() + " " + type_name() + " " + ml.ftype_name();

    if (hparams.f_max_alibi_bias > 0.0f) {
        hparams.use_alibi = true;
    }

    hparams.rope_type = llama_model_rope_type(this);
}

void llama_model::load_vocab(llama_model_loader & ml) {
    const auto kv = LLM_KV(arch);

    vocab.load(ml, kv);
}

bool llama_model::load_tensors(llama_model_loader & ml) {
    const auto & split_mode   = params.split_mode;
    const auto & n_gpu_layers = params.n_gpu_layers;
    const auto & use_mlock    = params.use_mlock;
    const auto & tensor_split = params.tensor_split;

    const int n_layer = hparams.n_layer;

    const bool use_mmap_buffer = true;

    LLAMA_LOG_INFO("%s: loading model tensors, this can take a while... (mmap = %s)\n", __func__, ml.use_mmap ? "true" : "false");

    // build a list of buffer types for the CPU and GPU devices
    pimpl->cpu_buft_list = make_cpu_buft_list(devices, params.use_extra_bufts, params.no_host);
    for (auto * dev : devices) {
        buft_list_t buft_list = make_gpu_buft_list(dev, split_mode, tensor_split);
        // add CPU buffer types as a fallback
        buft_list.insert(buft_list.end(), pimpl->cpu_buft_list.begin(), pimpl->cpu_buft_list.end());
        pimpl->gpu_buft_list.emplace(dev, std::move(buft_list));
    }

    // calculate the split points
    bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + n_devices(), [](float x) { return x == 0.0f; });
    std::vector<float> splits(n_devices());
    if (all_zero) {
        // default split, by free memory
        for (size_t i = 0; i < n_devices(); ++i) {
            ggml_backend_dev_t dev = devices[i];
            size_t total;
            size_t free;
            ggml_backend_dev_memory(dev, &free, &total);
            splits[i] = free;
        }
    } else {
        std::copy(tensor_split, tensor_split + n_devices(), splits.begin());
    }

    // sum and normalize the splits to get the split points
    float split_sum = 0.0f;
    for (size_t i = 0; i < n_devices(); ++i) {
        split_sum += splits[i];
        splits[i] = split_sum;
    }
    for (size_t i = 0; i < n_devices(); ++i) {
        splits[i] /= split_sum;
    }
    ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
    if (cpu_dev == nullptr) {
        throw std::runtime_error(format("%s: no CPU backend found", __func__));
    }
    const int i_gpu_start = std::max((int) hparams.n_layer - n_gpu_layers, (int) 0);
    const int act_gpu_layers = devices.empty() ? 0 : std::min(n_gpu_layers, (int)n_layer + 1);
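    // map a layer index to the device that will host it: layers below i_gpu_start stay on the
    // CPU, the rest are assigned to the GPU whose cumulative split point covers the layer's
    // relative position in the offloaded range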
    auto get_layer_buft_list = [&](int il) -> llama_model::impl::layer_dev {
        const bool is_swa = il < (int) hparams.n_layer && hparams.is_swa(il);
        if (il < i_gpu_start || (il - i_gpu_start) >= act_gpu_layers) {
            LLAMA_LOG_DEBUG("load_tensors: layer %3d assigned to device %s, is_swa = %d\n", il, ggml_backend_dev_name(cpu_dev), is_swa);
            return {cpu_dev, &pimpl->cpu_buft_list};
        }
        const int layer_gpu = std::upper_bound(splits.begin(), splits.begin() + n_devices(), float(il - i_gpu_start)/act_gpu_layers) - splits.begin();
        auto * dev = devices.at(layer_gpu);
        LLAMA_LOG_DEBUG("load_tensors: layer %3d assigned to device %s, is_swa = %d\n", il, ggml_backend_dev_name(dev), is_swa);
        return {dev, &pimpl->gpu_buft_list.at(dev)};
    };

    // assign the input layer
    // there is very little benefit to offloading the input layer, so always keep it on the CPU
    pimpl->dev_input = { cpu_dev, &pimpl->cpu_buft_list };

    // assign the repeating layers to the devices according to the splits
    pimpl->dev_layer.resize(n_layer);
    for (int il = 0; il < n_layer; ++il) {
        pimpl->dev_layer[il] = get_layer_buft_list(il);
    }

    // assign the output layer
    pimpl->dev_output = get_layer_buft_list(n_layer);

    // one ggml context per buffer type
    int max_n_tensors = ml.n_tensors;
    max_n_tensors += 1;         // duplicated output tensor
    max_n_tensors += n_layer*2; // duplicated rope freq tensors
    const size_t ctx_size = ggml_tensor_overhead()*max_n_tensors;
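    // these contexts only hold tensor metadata (ggml_tensor_overhead() per tensor) - they are
    // created with no_alloc = true below, so the actual weight data is allocated later into
    // backend buffers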
    // define a comparator for the buft -> ctx map to ensure that the order is well-defined:
    struct ggml_backend_buft_comparator {
        bool operator()(const ggml_backend_buffer_type_t & lhs, const ggml_backend_buffer_type_t & rhs) const {
            return strcmp(ggml_backend_buft_name(lhs), ggml_backend_buft_name(rhs)) < 0;
        }
    };
    std::map<ggml_backend_buffer_type_t, ggml_context_ptr, ggml_backend_buft_comparator> ctx_map;

    auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * {
        auto it = ctx_map.find(buft);
        if (it == ctx_map.end()) {
            ggml_init_params params = {
                /*.mem_size   =*/ ctx_size,
                /*.mem_buffer =*/ NULL,
                /*.no_alloc   =*/ true,
            };

            ggml_context * ctx = ggml_init(params);
            if (!ctx) {
                throw std::runtime_error(format("failed to create ggml context"));
            }

            ctx_map.emplace(buft, ctx);

            return ctx;
        }
        return it->second.get();
    };

    const auto TENSOR_DUPLICATED   = llama_model_loader::TENSOR_DUPLICATED;
    const auto TENSOR_NOT_REQUIRED = llama_model_loader::TENSOR_NOT_REQUIRED;
    const auto TENSOR_SKIP         = llama_model_loader::TENSOR_SKIP;
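    // flags accepted by create_tensor below: TENSOR_NOT_REQUIRED - the tensor may be absent
    // from the model file, TENSOR_DUPLICATED - the tensor may alias one that was already
    // created (e.g. tok_embd reused as the output), TENSOR_SKIP - the tensor is ignored even
    // if present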
    // create tensors for the weights
    {
        // note: cast to int64_t since we will use these for the tensor dimensions
        const int64_t n_head        = hparams.n_head();
        const int64_t n_head_kv     = hparams.n_head_kv();
        const int64_t n_embd        = hparams.n_embd;
        const int64_t n_embd_k_gqa  = hparams.n_embd_k_gqa();
        const int64_t n_embd_v_gqa  = hparams.n_embd_v_gqa();
        const int64_t n_embd_head_k = hparams.n_embd_head_k;
        const int64_t n_embd_head_v = hparams.n_embd_head_v;
        const int64_t n_ff          = hparams.n_ff();
        const int64_t n_embd_gqa    = n_embd_v_gqa;
        const int64_t n_vocab       = vocab.n_tokens();
        const int64_t n_token_types = vocab.n_token_types();
        const int64_t n_rot         = hparams.n_rot;
        const int64_t n_expert      = hparams.n_expert;
        const int64_t n_expert_used = hparams.n_expert_used;
        const int64_t n_ctx_train   = hparams.n_ctx_train;

        if (n_expert > 0 && hparams.n_expert_used == 0) {
            throw std::runtime_error("model has expert layers but no expert layers are used");
        }

        int n_moved_tensors = 0;
        ggml_tensor * first_moved_tensor = nullptr;
        ggml_backend_buffer_type_t first_moved_from_buft = nullptr;
        ggml_backend_buffer_type_t first_moved_to_buft   = nullptr;
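        // helper used by every architecture below: looks up the tensor metadata by name,
        // validates how the tensor may be used, selects a buffer type for it (honoring
        // per-tensor buffer-type overrides), and creates the tensor in the ggml context
        // associated with that buffer type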
        auto create_tensor = [&](const LLM_TN_IMPL & tn, const std::initializer_list<int64_t> & ne, int flags) -> ggml_tensor * {
            ggml_tensor * t_meta = ml.get_tensor_meta(tn.str().c_str());

            if (!t_meta) {
                if (flags & TENSOR_NOT_REQUIRED) {
                    return nullptr;
                }
                throw std::runtime_error(format("missing tensor '%s'", tn.str().c_str()));
            }

            // some models use the token embedding tensor as the output, but since these are used in different layers and with different ops
            // the tensor is duplicated
            // to handle this, we check if the tensor is duplicated, and if so, we assume that it is being loaded as the output tensor
            llm_tensor tn_tensor = tn.tensor;
            if (tn.tensor == LLM_TENSOR_TOKEN_EMBD && flags & TENSOR_DUPLICATED) {
                tn_tensor = LLM_TENSOR_OUTPUT;
            }

            llm_tensor_info info;
            try {
                info = llm_tensor_info_for(tn_tensor);
            } catch (const std::out_of_range & e) {
                throw std::runtime_error(format("missing tensor info mapping for %s", tn.str().c_str()));
            }

            // skip unused tensors
            if (info.op == GGML_OP_NONE || flags & TENSOR_SKIP) {
                const size_t nbytes = ggml_nbytes(t_meta);
                LLAMA_LOG_WARN("model has unused tensor %s (size = %zu bytes) -- ignoring\n", tn.str().c_str(), nbytes);

                ml.size_data -= nbytes;
                ml.n_created++;

                return nullptr;
            }

            // tensors with "bias" suffix are always used with GGML_OP_ADD or GGML_OP_ADD_ID
            ggml_op op;
            bool bias = tn.suffix != nullptr && strcmp(tn.suffix, "bias") == 0;
            if (bias) {
                if (info.op == GGML_OP_MUL_MAT_ID) {
                    op = GGML_OP_ADD_ID;
                } else {
                    op = GGML_OP_ADD;
                }
            } else {
                op = info.op;
            }

            // sanity checks
            if (info.layer == LLM_TENSOR_LAYER_INPUT || info.layer == LLM_TENSOR_LAYER_OUTPUT) {
                if (tn.bid != -1) {
                    GGML_ABORT("input/output layer tensor %s used with a layer number", tn.str().c_str());
                }
            } else {
                if (tn.bid == -1) {
                    GGML_ABORT("repeating layer tensor %s used without a layer number", tn.str().c_str());
                }
            }

            // select the buffer type for this tensor
            buft_list_t * buft_list;
            switch (info.layer) {
                case LLM_TENSOR_LAYER_INPUT:
                    buft_list = pimpl->dev_input.buft_list;
                    break;
                case LLM_TENSOR_LAYER_OUTPUT:
                    buft_list = pimpl->dev_output.buft_list;
                    break;
                case LLM_TENSOR_LAYER_REPEATING:
                    buft_list = pimpl->dev_layer.at(tn.bid).buft_list;
                    break;
                default:
                    GGML_ABORT("invalid layer %d for tensor %s", info.layer, tn.str().c_str());
            }

            ggml_backend_buffer_type_t buft = nullptr;

            // check overrides
            if (ml.tensor_buft_overrides) {
                std::string tensor_name = tn.str();
                for (const auto * overrides = ml.tensor_buft_overrides; overrides->pattern != nullptr; ++overrides) {
                    std::regex pattern(overrides->pattern);
                    if (std::regex_search(tensor_name, pattern)) {
                        if (overrides->buft == ggml_backend_cpu_buffer_type()) {
                            // when overriding to a CPU buffer, consider the extra buffer types
                            buft = select_weight_buft(hparams, t_meta, op, pimpl->cpu_buft_list);
                        } else {
                            buft = overrides->buft;
                        }

                        LLAMA_LOG_DEBUG("tensor %s (%zu MiB %s) buffer type overridden to %s\n",
                                tensor_name.c_str(),
                                ggml_nbytes(t_meta) / 1024 / 1024, ggml_type_name(t_meta->type),
                                ggml_backend_buft_name(buft));

                        break;
                    }
                }
            }

            if (!buft) {
                buft = select_weight_buft(hparams, t_meta, op, *buft_list);
                if (!buft) {
                    throw std::runtime_error(format("failed to find a compatible buffer type for tensor %s", tn.str().c_str()));
                }
            }

            // avoid using a host buffer when using mmap
            auto * buft_dev = ggml_backend_buft_get_device(buft);
            if (ml.use_mmap && buft_dev && buft == ggml_backend_dev_host_buffer_type(buft_dev)) {
                auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
                if (!cpu_dev) {
                    throw std::runtime_error("no CPU backend found");
                }
                buft = ggml_backend_dev_buffer_type(cpu_dev);
            }

            if (buft != buft_list->front().second) {
                n_moved_tensors++;
                if (!first_moved_tensor) {
                    first_moved_tensor    = t_meta;
                    first_moved_from_buft = buft_list->front().second;
                    first_moved_to_buft   = buft;
                }
            }

            ggml_context * ctx = ctx_for_buft(buft);

            // if duplicated, check if the original tensor was allocated in the same buffer type context and avoid creating a new one
            if (flags & TENSOR_DUPLICATED) {
                ggml_tensor * t = ggml_get_tensor(ctx, tn.str().c_str());
                if (t) {
                    return t;
                }
            }
            return ml.create_tensor(ctx, tn, ne, flags);
        };
  2338. layers.resize(n_layer);
  2339. // TODO: move to a separate function
  2340. const auto tn = LLM_TN(arch);
  2341. switch (arch) {
  2342. case LLM_ARCH_LLAMA:
  2343. case LLM_ARCH_REFACT:
  2344. case LLM_ARCH_MINICPM:
  2345. case LLM_ARCH_GRANITE:
  2346. case LLM_ARCH_GRANITE_MOE:
  2347. case LLM_ARCH_MISTRAL3:
  2348. {
  2349. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2350. // output
  2351. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2352. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  2353. // if output is NULL, init from the input tok embed
  2354. if (output == NULL) {
  2355. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  2356. }
  2357. for (int i = 0; i < n_layer; ++i) {
  2358. auto & layer = layers[i];
  2359. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2360. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
  2361. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  2362. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
  2363. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
  2364. // optional bias tensors
  2365. layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2366. layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
  2367. layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
  2368. layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2369. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
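// the rope factor tensors are stored once and shared by all layers; for i > 0 the TENSOR_DUPLICATED flag lets create_tensor return the tensor created for the first layer instead of loading it again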
  2370. if (hparams.rope_scaling_type_train == LLAMA_ROPE_SCALING_TYPE_LONGROPE) {
  2371. layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
  2372. layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
  2373. }
  2374. else {
  2375. layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
  2376. }
  2377. if (n_expert == 0) {
  2378. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  2379. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  2380. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  2381. // optional MLP bias
  2382. layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
  2383. layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2384. layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
  2385. } else {
  2386. layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
  2387. layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, TENSOR_NOT_REQUIRED);
  2388. layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert}, 0);
  2389. layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
  2390. // For Granite MoE Shared
  2391. if (hparams.n_ff_shexp > 0) {
  2392. layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, 0);
  2393. layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, 0);
  2394. layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {hparams.n_ff_shexp, n_embd}, 0);
  2395. }
  2396. }
  2397. }
  2398. } break;
  2399. case LLM_ARCH_LLADA:
  2400. {
  2401. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
  2402. // output
  2403. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
  2404. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED);
  2405. // if output is NULL, init from the input tok embed
  2406. if (output == NULL) {
  2407. output =
  2408. create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, TENSOR_DUPLICATED);
  2409. }
  2410. for (int i = 0; i < n_layer; ++i) {
  2411. auto & layer = layers[i];
  2412. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);
  2413. // Use separate Q, K, V projections without bias, matching LLaDALlamaBlock
  2414. layer.wq =
  2415. create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd_head_k * n_head }, 0);
  2416. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_k_gqa }, 0);
  2417. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_v_gqa }, 0);
  2418. // No bias for QKV projections as per config: include_bias=false, include_qkv_bias=false
  2419. layer.wo =
  2420. create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, 0);
  2421. layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), { n_embd }, TENSOR_NOT_REQUIRED);
  2422. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd }, 0);
  2423. layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), { n_rot / 2 },
  2424. TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
  2425. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), { n_embd, n_ff }, 0);
  2426. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd }, 0);
  2427. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, n_ff }, 0);
  2428. // optional MLP bias
  2429. layer.ffn_gate_b =
  2430. create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), { n_ff }, TENSOR_NOT_REQUIRED);
  2431. layer.ffn_down_b =
  2432. create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), { n_embd }, TENSOR_NOT_REQUIRED);
  2433. layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), { n_ff }, TENSOR_NOT_REQUIRED);
  2434. }
  2435. }
  2436. break;
  2437. case LLM_ARCH_LLADA_MOE:
  2438. {
  2439. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2440. // output
  2441. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2442. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  2443. GGML_ASSERT(n_expert > 0 && "n_expert must be > 0 for llada-moe");
  2444. GGML_ASSERT(n_expert_used > 0 && "n_expert_used must be > 0 for llada-moe");
  2445. for (int i = 0; i < n_layer; ++i) {
  2446. auto & layer = layers[i];
  2447. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2448. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  2449. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  2450. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  2451. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  2452. layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
  2453. layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
  2454. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  2455. layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
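// if the expert FFN width is not stored in the GGUF, assume the dense n_ff is split evenly across the active experts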
  2456. const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
  2457. layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
  2458. layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
  2459. layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
  2460. }
  2461. } break;
  2462. case LLM_ARCH_LLAMA4:
  2463. {
  2464. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2465. // output
  2466. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2467. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  2468. // if output is NULL, init from the input tok embed
  2469. if (output == NULL) {
  2470. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  2471. }
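// interleaved MoE: every n_moe_layer_step-th layer uses the routed experts plus a shared expert; the remaining layers use a dense FFN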
  2472. for (int i = 0; i < n_layer; ++i) {
  2473. bool is_moe_layer = hparams.n_moe_layer_step > 0 && (i + 1) % hparams.n_moe_layer_step == 0;
  2474. auto & layer = layers[i];
  2475. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2476. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
  2477. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  2478. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
  2479. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
  2480. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  2481. layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
  2482. if (is_moe_layer) {
  2483. int n_ff_exp = hparams.n_ff_exp;
  2484. layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
  2485. layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, 0);
  2486. layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff_exp, n_embd, n_expert}, 0);
  2487. layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, 0);
  2488. // Shared expert
  2489. const int64_t n_ff_shexp = n_ff_exp;
  2490. layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), { n_embd, n_ff_shexp}, 0);
  2491. layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp, n_embd }, 0);
  2492. layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), { n_embd, n_ff_shexp}, 0);
  2493. } else {
  2494. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  2495. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  2496. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  2497. }
  2498. }
  2499. } break;
  2500. case LLM_ARCH_DECI:
  2501. {
  2502. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2503. // output
  2504. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2505. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  2506. // if output is NULL, init from the input tok embed
  2507. if (output == NULL) {
  2508. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  2509. }
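// DeciLM prunes attention and FFN per layer, so the shapes below are read per layer; layers with n_head_kv == 0 keep only the output projection and layers with n_ff == 0 create no FFN tensors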
  2510. for (int i = 0; i < n_layer; ++i) {
  2511. auto & layer = layers[i];
  2512. const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(i);
  2513. const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(i);
  2514. const int64_t n_embd_gqa = hparams.n_embd_v_gqa(i);
  2515. const int64_t n_ff = hparams.n_ff(i);
  2516. const int64_t n_head = hparams.n_head(i);
  2517. const int64_t n_head_kv = hparams.n_head_kv(i);
  2518. if (n_head_kv == 0 && n_head > 0) {
  2519. // linear attention for DeciLMCausalModel
  2520. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2521. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  2522. }
  2523. else if (n_head_kv > 0) {
  2524. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2525. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
  2526. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  2527. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
  2528. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
  2529. }
  2530. // optional bias tensors
  2531. layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2532. layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
  2533. layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
  2534. layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2535. if (n_ff > 0) {
  2536. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  2537. }
  2538. if (hparams.rope_scaling_type_train == LLAMA_ROPE_SCALING_TYPE_LONGROPE) {
  2539. layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
  2540. layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
  2541. }
  2542. else {
  2543. layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
  2544. }
  2545. if (n_ff > 0) {
  2546. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  2547. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  2548. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  2549. }
  2550. // optional MLP bias
  2551. layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
  2552. layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2553. layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
  2554. }
  2555. } break;
  2556. case LLM_ARCH_MINICPM3:
  2557. {
  2558. const int64_t n_embd_head_qk_rope = hparams.n_rot;
  2559. const int64_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
  2560. const int64_t q_lora_rank = hparams.n_lora_q;
  2561. const int64_t kv_lora_rank = hparams.n_lora_kv;
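// multi-head latent attention: Q and KV go through low-rank projections (q_lora_rank, kv_lora_rank), and each head dimension is split into a RoPE part (n_rot) and a non-RoPE part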
  2562. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2563. // output
  2564. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2565. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  2566. // if output is NULL, init from the input tok embed
  2567. if (output == NULL) {
  2568. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  2569. }
  2570. for (int i = 0; i < n_layer; ++i) {
  2571. auto & layer = layers[i];
  2572. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2573. layer.attn_q_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_A_NORM, "weight", i), {q_lora_rank}, 0);
  2574. layer.attn_kv_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_NORM, "weight", i), {kv_lora_rank}, 0);
  2575. layer.wq_a = create_tensor(tn(LLM_TENSOR_ATTN_Q_A, "weight", i), {n_embd, q_lora_rank}, 0);
  2576. layer.wq_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_B, "weight", i), {q_lora_rank, n_head * n_embd_head_k}, 0);
  2577. layer.wkv_a_mqa = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + (n_embd_head_qk_rope)}, 0);
  2578. layer.wkv_b = create_tensor(tn(LLM_TENSOR_ATTN_KV_B, "weight", i), {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)}, 0);
  2579. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_head * ( n_embd_head_v), n_embd}, 0);
  2580. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  2581. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  2582. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  2583. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  2584. layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), { n_embd_head_qk_rope/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
  2585. layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), { n_embd_head_qk_rope/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
  2586. }
  2587. } break;
  2588. case LLM_ARCH_GROK:
  2589. {
  2590. if (n_expert == 0) {
  2591. throw std::runtime_error("Grok model cannot have zero experts");
  2592. }
  2593. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2594. // output
  2595. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2596. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  2597. // if output is NULL, init from the input tok embed
  2598. if (output == NULL) {
  2599. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  2600. }
  2601. const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff/* / n_expert_used*/; // grok-1 n_ff_exp == n_ff
  2602. for (int i = 0; i < n_layer; ++i) {
  2603. auto & layer = layers[i];
  2604. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2605. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  2606. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  2607. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  2608. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  2609. layer.attn_out_norm = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}, 0);
  2610. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  2611. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED);
  2612. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, TENSOR_NOT_REQUIRED);
  2613. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED);
  2614. layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
  2615. layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, TENSOR_NOT_REQUIRED);
  2616. layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
  2617. layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, 0);
  2618. layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2619. if (!layer.ffn_post_norm) {
  2620. layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
  2621. }
  2622. }
  2623. } break;
  2624. case LLM_ARCH_DBRX:
  2625. {
  2626. if (n_expert == 0) {
  2627. throw std::runtime_error("DBRX model cannot have zero experts");
  2628. }
  2629. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2630. // output
  2631. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2632. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  2633. for (int i = 0; i < n_layer; ++i) {
  2634. auto & layer = layers[i];
  2635. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2636. layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
  2637. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  2638. layer.attn_out_norm = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}, 0);
  2639. layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
  2640. layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
  2641. layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff, n_embd, n_expert}, 0);
  2642. layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
  2643. }
  2644. } break;
  2645. case LLM_ARCH_BAICHUAN:
  2646. {
  2647. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2648. {
  2649. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2650. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  2651. }
  2652. for (int i = 0; i < n_layer; ++i) {
  2653. auto & layer = layers[i];
  2654. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2655. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  2656. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  2657. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  2658. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  2659. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  2660. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  2661. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  2662. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  2663. }
  2664. } break;
  2665. case LLM_ARCH_FALCON:
  2666. {
  2667. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2668. // output
  2669. {
  2670. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2671. output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
  2672. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  2673. if (!output) {
  2674. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); // needs to be on GPU
  2675. }
  2676. }
  2677. for (int i = 0; i < n_layer; ++i) {
  2678. auto & layer = layers[i];
  2679. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2680. layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
  2681. layer.attn_norm_2 = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2682. layer.attn_norm_2_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2683. layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
  2684. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  2685. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  2686. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  2687. }
  2688. } break;
  2689. case LLM_ARCH_STARCODER:
  2690. {
  2691. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2692. pos_embd = create_tensor(tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, n_ctx_train}, 0);
  2693. // output
  2694. {
  2695. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2696. output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
  2697. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  2698. if (!output) {
  2699. // needs to be on GPU
  2700. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  2701. }
  2702. }
  2703. for (int i = 0; i < n_layer; ++i) {
  2704. auto & layer = layers[i];
  2705. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2706. layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
  2707. layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
  2708. layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
  2709. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  2710. layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
  2711. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  2712. layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
  2713. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
  2714. layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
  2715. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  2716. layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
  2717. }
  2718. } break;
  2719. case LLM_ARCH_BERT:
  2720. case LLM_ARCH_NOMIC_BERT:
  2721. case LLM_ARCH_NOMIC_BERT_MOE:
  2722. case LLM_ARCH_JINA_BERT_V3:
  2723. {
  2724. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2725. type_embd = create_tensor(tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_token_types}, TENSOR_NOT_REQUIRED);
  2726. if (arch == LLM_ARCH_BERT) {
  2727. pos_embd = create_tensor(tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, n_ctx_train}, 0);
  2728. cls = create_tensor(tn(LLM_TENSOR_CLS, "weight"), {n_embd, n_embd}, TENSOR_NOT_REQUIRED);
  2729. cls_b = create_tensor(tn(LLM_TENSOR_CLS, "bias"), {n_embd}, TENSOR_NOT_REQUIRED);
  2730. cls_out = create_tensor(tn(LLM_TENSOR_CLS_OUT, "weight"), {n_embd, hparams.n_cls_out}, TENSOR_NOT_REQUIRED);
  2731. cls_out_b = create_tensor(tn(LLM_TENSOR_CLS_OUT, "bias"), {hparams.n_cls_out}, TENSOR_NOT_REQUIRED);
  2732. }
  2733. tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0);
  2734. tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}, 0);
  2735. for (int i = 0; i < n_layer; ++i) {
  2736. auto & layer = layers[i];
  2737. layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
  2738. layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
  2739. if (!layer.wqkv) {
  2740. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  2741. layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, 0);
  2742. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  2743. layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, 0);
  2744. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  2745. layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, 0);
  2746. }
  2747. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  2748. layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2749. layer.attn_out_norm = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}, 0);
  2750. layer.attn_out_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "bias", i), {n_embd}, 0);
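// models with moe_every_n_layers set (e.g. NOMIC_BERT_MOE) place an expert FFN on every n-th layer starting at layer 1; all other layers use the dense FFN path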
  2751. if (hparams.moe_every_n_layers > 0 && i % hparams.moe_every_n_layers == 1) {
  2752. layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff, n_expert}, 0);
  2753. layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert}, 0);
  2754. layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
  2755. } else {
  2756. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  2757. layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
  2758. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
  2759. layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2760. if (arch == LLM_ARCH_NOMIC_BERT) {
  2761. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  2762. }
  2763. }
  2764. layer.layer_out_norm = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd}, 0);
  2765. layer.layer_out_norm_b = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "bias", i), {n_embd}, 0);
  2766. }
  2767. } break;
  2768. case LLM_ARCH_NEO_BERT:
  2769. {
  2770. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2771. cls = create_tensor(tn(LLM_TENSOR_CLS, "weight"), {n_embd, n_embd}, TENSOR_NOT_REQUIRED);
  2772. cls_b = create_tensor(tn(LLM_TENSOR_CLS, "bias"), {n_embd}, TENSOR_NOT_REQUIRED);
  2773. cls_out = create_tensor(tn(LLM_TENSOR_CLS_OUT, "weight"), {n_embd, hparams.n_cls_out}, TENSOR_NOT_REQUIRED);
  2774. cls_out_b = create_tensor(tn(LLM_TENSOR_CLS_OUT, "bias"), {hparams.n_cls_out}, TENSOR_NOT_REQUIRED);
  2775. output_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2776. for (int i = 0; i < n_layer; ++i) {
  2777. auto & layer = layers[i];
  2778. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2779. layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
  2780. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  2781. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  2782. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff*2}, 0);
  2783. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
  2784. }
  2785. } break;
  2786. case LLM_ARCH_JINA_BERT_V2:
  2787. {
  2788. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); // word_embeddings
  2789. type_embd = create_tensor(tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_token_types}, 0); // token_type_embeddings
  2790. tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0); // LayerNorm
  2791. tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}, 0); //LayerNorm bias
  2792. cls = create_tensor(tn(LLM_TENSOR_CLS, "weight"), {n_embd, 1}, TENSOR_NOT_REQUIRED);
  2793. cls_b = create_tensor(tn(LLM_TENSOR_CLS, "bias"), {1}, TENSOR_NOT_REQUIRED);
  2794. for (int i = 0; i < n_layer; ++i) {
  2795. auto & layer = layers[i]; // JinaBertLayer
  2796. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  2797. layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, 0);
  2798. layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2799. layer.attn_q_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2800. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  2801. layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, 0);
  2802. layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2803. layer.attn_k_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2804. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  2805. layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, 0);
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0); // output_dense
layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0); // output_dense
  2808. layer.attn_out_norm = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}, 0); //output_norm
  2809. layer.attn_out_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "bias", i), {n_embd}, 0);
  2810. layer.attn_norm_2 = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2811. layer.attn_norm_2_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2812. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED);
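// when there is no separate gate tensor, the up projection is stored double-width, presumably holding the fused gate and up halves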
  2813. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, layer.ffn_gate ? n_ff : n_ff * 2}, 0);
  2814. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
  2815. layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
  2816. layer.layer_out_norm = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd}, 0);
  2817. layer.layer_out_norm_b = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "bias", i), {n_embd}, 0);
  2818. }
  2819. } break;
  2820. case LLM_ARCH_BLOOM:
  2821. {
  2822. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2823. tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0);
  2824. tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}, 0);
  2825. // output
  2826. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2827. output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
  2828. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  2829. // if output is NULL, init from the input tok embed
  2830. if (output == NULL) {
  2831. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  2832. }
  2833. for (int i = 0; i < n_layer; ++i) {
  2834. auto & layer = layers[i];
  2835. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2836. layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
  2837. layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
  2838. layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
  2839. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  2840. layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
  2841. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  2842. layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
  2843. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
  2844. layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
  2845. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  2846. layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
  2847. }
  2848. } break;
  2849. case LLM_ARCH_MPT:
  2850. {
  2851. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2852. pos_embd = create_tensor(tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, n_ctx_train}, TENSOR_NOT_REQUIRED);
  2853. // output
  2854. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2855. output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, TENSOR_NOT_REQUIRED);
  2856. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  2857. if (!output) {
  2858. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); // needs to be on GPU
  2859. }
  2860. for (int i = 0; i < n_layer; ++i) {
  2861. auto & layer = layers[i];
  2862. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2863. layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2864. layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
  2865. layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
  2866. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  2867. layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2868. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  2869. layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2870. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
  2871. layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2872. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  2873. layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
  2874. layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2875. layer.attn_q_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2876. layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2877. layer.attn_k_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2878. // AWQ ScaleActivation layer
  2879. layer.ffn_act = create_tensor(tn(LLM_TENSOR_FFN_ACT, "scales", i), {n_ff}, TENSOR_NOT_REQUIRED);
  2880. }
  2881. } break;
  2882. case LLM_ARCH_STABLELM:
  2883. {
  2884. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2885. // output
  2886. output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
  2887. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2888. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  2889. for (int i = 0; i < n_layer; ++i) {
  2890. auto & layer = layers[i];
  2891. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2892. layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
  2893. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  2894. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  2895. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  2896. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  2897. // optional bias tensors, present in Stable LM 2 1.6B
  2898. layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2899. layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
  2900. layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
  2901. // optional q and k layernorms, present in StableLM 2 12B
  2902. layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k, n_head}, TENSOR_NOT_REQUIRED);
  2903. layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k, n_head_kv}, TENSOR_NOT_REQUIRED);
  2904. // optional FFN norm, not present in StableLM 2 12B which uses parallel residual
  2905. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2906. layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2907. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  2908. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  2909. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  2910. }
  2911. } break;
  2912. case LLM_ARCH_QWEN:
  2913. {
  2914. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2915. // output
  2916. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2917. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  2918. for (int i = 0; i < n_layer; ++i) {
  2919. auto & layer = layers[i];
  2920. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2921. layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd*3}, 0);
  2922. layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd*3}, 0);
  2923. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  2924. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
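// note: for this arch the stored n_ff appears to span both SwiGLU branches, so each FFN tensor below uses n_ff/2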
  2925. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff/2}, 0);
  2926. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff/2, n_embd}, 0);
  2927. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff/2}, 0);
  2928. }
  2929. } break;
  2930. case LLM_ARCH_QWEN2:
  2931. case LLM_ARCH_QWEN2VL:
  2932. case LLM_ARCH_DREAM:
  2933. {
  2934. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2935. // output
  2936. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2937. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  2938. output_b = create_tensor(tn(LLM_TENSOR_OUTPUT, "bias"), {n_vocab}, TENSOR_NOT_REQUIRED);
  2939. // if output is NULL, init from the input tok embed
  2940. if (output == NULL) {
  2941. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  2942. }
  2943. for (int i = 0; i < n_layer; ++i) {
  2944. auto & layer = layers[i];
  2945. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2946. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  2947. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  2948. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  2949. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  2950. // optional bias tensors
  2951. layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2952. layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
  2953. layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
  2954. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  2955. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  2956. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  2957. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  2958. }
  2959. } break;
  2960. case LLM_ARCH_QWEN2MOE:
  2961. {
  2962. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2963. // output
  2964. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2965. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  2966. for (int i = 0; i < n_layer; ++i) {
  2967. auto & layer = layers[i];
  2968. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2969. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  2970. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  2971. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  2972. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  2973. // optional bias tensors
  2974. layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2975. layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
  2976. layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
  2977. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  2978. layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
  2979. if (n_expert == 0) {
  2980. throw std::runtime_error("n_expert must be > 0 for QWEN2MOE");
  2981. }
  2982. if (n_expert_used == 0) {
  2983. throw std::runtime_error("n_expert_used must be > 0 for QWEN2MOE");
  2984. }
  2985. // MoE branch
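// per-expert FFN width: use n_ff_exp from the metadata when present, otherwise fall back to n_ff split across the active experts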
  2986. const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
  2987. layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
  2988. layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
  2989. layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
  2990. // Shared expert branch
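// the shared expert runs alongside the routed experts; ffn_gate_inp_shexp ({n_embd}) yields a per-token scalar gate for its output, and its width falls back to n_ff when n_ff_shexp is unset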
  2991. const int64_t n_ff_shexp = hparams.n_ff_shexp ? hparams.n_ff_shexp : n_ff;
  2992. layer.ffn_gate_inp_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP_SHEXP, "weight", i), {n_embd}, 0);
  2993. layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), { n_embd, n_ff_shexp}, 0);
  2994. layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp, n_embd}, 0);
  2995. layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), { n_embd, n_ff_shexp}, 0);
  2996. }
  2997. } break;
  2998. case LLM_ARCH_QWEN3:
  2999. case LLM_ARCH_QWEN3VL:
  3000. {
  3001. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  3002. // output
  3003. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  3004. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  3005. // if output is NULL, init from the input tok embed
  3006. if (output == NULL) {
  3007. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  3008. }
  3009. // output rerank head
  3010. cls_out = create_tensor(tn(LLM_TENSOR_CLS_OUT, "weight"), {n_embd, hparams.n_cls_out}, TENSOR_NOT_REQUIRED);
  3011. for (int i = 0; i < n_layer; ++i) {
  3012. auto & layer = layers[i];
  3013. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  3014. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
  3015. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  3016. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  3017. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
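// Qwen3 applies a per-head RMS norm to Q and K, hence the {n_embd_head_k} shapes below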
  3018. layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
  3019. layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
  3020. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  3021. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  3022. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  3023. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  3024. }
  3025. } break;
  3026. case LLM_ARCH_QWEN3MOE:
  3027. case LLM_ARCH_QWEN3VLMOE:
  3028. case LLM_ARCH_RND1:
  3029. {
  3030. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  3031. // output
  3032. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  3033. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  3034. // if output is NULL, init from the input tok embed
  3035. if (output == NULL) {
  3036. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  3037. }
  3038. for (int i = 0; i < n_layer; ++i) {
  3039. auto & layer = layers[i];
  3040. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  3041. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
  3042. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  3043. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  3044. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
  3045. layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
  3046. layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
  3047. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  3048. layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
  3049. if (n_expert == 0) {
  3050. throw std::runtime_error("n_expert must be > 0 for QWEN3MOE");
  3051. }
  3052. if (n_expert_used == 0) {
  3053. throw std::runtime_error("n_expert_used must be > 0 for QWEN3MOE");
  3054. }
  3055. // MoE branch
  3056. const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
  3057. layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
  3058. layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
  3059. layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
  3060. }
  3061. } break;
  3062. case LLM_ARCH_PHI2:
  3063. {
  3064. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  3065. // output
  3066. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  3067. output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
  3068. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  3069. output_b = create_tensor(tn(LLM_TENSOR_OUTPUT, "bias"), {n_vocab}, 0);
  3070. for (int i = 0; i < n_layer; ++i) {
  3071. auto & layer = layers[i];
  3072. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  3073. layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
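// Phi-2 GGUFs may store attention weights either as a fused QKV tensor or as separate Q/K/V; handle both layouts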
  3074. layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
  3075. layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
  3076. if (layer.wqkv == nullptr) {
  3077. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  3078. layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, 0);
  3079. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  3080. layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, 0);
  3081. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  3082. layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, 0);
  3083. }
  3084. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  3085. layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
  3086. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
  3087. layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
  3088. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  3089. layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
  3090. }
  3091. } break;
  3092. case LLM_ARCH_PHI3:
  3093. {
  3094. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
  3095. // output
  3096. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
  3097. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  3098. // if output is NULL, init from the input tok embed
  3099. if (output == NULL) {
  3100. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  3101. }
  3102. for (int i = 0; i < n_layer; ++i) {
  3103. auto & layer = layers[i];
  3104. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);
  3105. layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), { n_embd, n_embd + 2 * n_embd_gqa }, TENSOR_NOT_REQUIRED);
  3106. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd, n_embd }, 0);
  3107. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd }, 0);
  3108. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd }, 0);
  3109. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, 2 * n_ff }, 0);
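// the long/short RoPE factor tables are shared by every layer; layers after the first mark them TENSOR_DUPLICATED so they are not counted twice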
  3110. layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), { n_rot/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
  3111. layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), { n_rot/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
  3112. }
  3113. } break;
  3114. case LLM_ARCH_PHIMOE:
  3115. {
  3116. const int64_t n_embd_head = n_embd / n_head;
  3117. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
  3118. // output
  3119. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
  3120. output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
  3121. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab }, 0);
  3122. output_b = create_tensor(tn(LLM_TENSOR_OUTPUT, "bias"), { n_vocab }, 0);
  3123. for (int i = 0; i < n_layer; ++i) {
  3124. auto & layer = layers[i];
  3125. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);
  3126. layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), { n_embd }, 0);
  3127. layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), { n_embd, n_embd + 2 * n_embd_gqa }, TENSOR_NOT_REQUIRED);
  3128. if (layer.wqkv == nullptr) {
  3129. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  3130. layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, 0);
  3131. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  3132. layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, 0);
  3133. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  3134. layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, 0);
  3135. }
  3136. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd, n_embd }, 0);
  3137. layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), { n_embd }, 0);
  3138. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd }, 0);
  3139. layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), { n_embd }, 0);
  3140. layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
  3141. layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
  3142. layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff, n_embd, n_expert}, 0);
  3143. layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
  3144. layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), { n_embd_head/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
  3145. layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), { n_embd_head/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
  3146. }
  3147. } break;
  3148. case LLM_ARCH_PLAMO:
  3149. {
  3150. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  3151. // output
  3152. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  3153. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  3154. for (int i = 0; i < n_layer; ++i) {
  3155. auto & layer = layers[i];
  3156. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  3157. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  3158. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  3159. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  3160. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  3161. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  3162. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  3163. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  3164. }
  3165. } break;
  3166. case LLM_ARCH_PLAMO2:
  3167. {
  3168. // mamba parameters
  3169. const uint32_t d_conv = hparams.ssm_d_conv;
  3170. const uint32_t d_state = hparams.ssm_d_state;
  3171. const uint32_t num_heads = hparams.ssm_dt_rank;
  3172. const uint32_t intermediate_size = hparams.ssm_d_inner;
  3173. const int64_t dt_dim = std::max(64, int(hparams.n_embd / 16));
  3174. // attention parameters
  3175. const uint32_t qk_dim = hparams.n_embd_head_k;
  3176. const uint32_t v_dim = hparams.n_embd_head_v;
  3177. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  3178. // output
  3179. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  3180. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  3181. // if output is NULL, init from the input tok embed
  3182. if (output == NULL) {
  3183. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  3184. }
  3185. for (int i = 0; i < n_layer; ++i) {
  3186. auto & layer = layers[i];
  3187. bool is_mamba_layer = hparams.is_recurrent(i);
  3188. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
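// hybrid stack: recurrent (Mamba) layers and attention layers are interleaved according to hparams.is_recurrent(i)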
  3189. if (is_mamba_layer) {
  3190. layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), {n_embd, 2 * intermediate_size}, 0);
  3191. layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {d_conv, intermediate_size}, 0);
  3192. layer.ssm_x = create_tensor(tn(LLM_TENSOR_SSM_X, "weight", i), {intermediate_size, dt_dim + 2*d_state}, 0);
  3193. layer.ssm_dt = create_tensor(tn(LLM_TENSOR_SSM_DT, "weight", i), {dt_dim, num_heads}, 0);
  3194. layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {num_heads}, 0);
  3195. layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {num_heads}, 0);
  3196. layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {num_heads}, 0);
  3197. layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {intermediate_size, n_embd}, 0);
  3198. layer.ssm_dt_norm = create_tensor(tn(LLM_TENSOR_SSM_DT_NORM, i), {dt_dim}, 0);
  3199. layer.ssm_b_norm = create_tensor(tn(LLM_TENSOR_SSM_B_NORM, i), {d_state}, 0);
  3200. layer.ssm_c_norm = create_tensor(tn(LLM_TENSOR_SSM_C_NORM, i), {d_state}, 0);
  3201. } else {
  3202. const int64_t num_attention_heads = hparams.n_head(i);
  3203. const int64_t q_num_heads = num_attention_heads;
  3204. const int64_t num_key_value_heads = hparams.n_head_kv(i);
  3205. const int64_t k_num_heads = num_key_value_heads;
  3206. const int64_t v_num_heads = num_key_value_heads;
  3207. const int64_t q_proj_dim = q_num_heads * qk_dim;
  3208. const int64_t k_proj_dim = k_num_heads * qk_dim;
  3209. const int64_t v_proj_dim = v_num_heads * v_dim;
  3210. layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, q_proj_dim + k_proj_dim + v_proj_dim}, 0);
  3211. layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {qk_dim, num_attention_heads}, 0);
  3212. layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {qk_dim, k_num_heads}, 0);
  3213. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {q_num_heads * v_dim, n_embd}, 0);
  3214. }
  3215. // All layers have post-attention norm, FFN norm, and FFN tensors
  3216. layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, i), {n_embd}, 0);
  3217. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  3218. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
  3219. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff * 2}, 0);
  3220. layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, i), {n_embd}, 0);
  3221. }
  3222. } break;
  3223. case LLM_ARCH_GPT2:
  3224. {
  3225. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
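// learned absolute position embeddings, sized to the training context length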
  3226. pos_embd = create_tensor(tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, n_ctx_train}, 0);
  3227. // output
  3228. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  3229. output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
  3230. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  3231. // if output is NULL, init from the input tok embed
  3232. if (output == NULL) {
  3233. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  3234. }
  3235. for (int i = 0; i < n_layer; ++i) {
  3236. auto & layer = layers[i];
  3237. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  3238. layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
  3239. layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
  3240. layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
  3241. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  3242. layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
  3243. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  3244. layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
  3245. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
  3246. layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
  3247. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  3248. layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
  3249. }
  3250. } break;
  3251. case LLM_ARCH_CODESHELL:
  3252. {
  3253. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  3254. // if tok embd is NULL, init from output
  3255. if (tok_embd == NULL) {
  3256. tok_embd = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  3257. }
  3258. // output
  3259. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  3260. output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
  3261. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  3262. for (int i = 0; i < n_layer; ++i) {
  3263. auto & layer = layers[i];
  3264. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  3265. layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
  3266. layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
  3267. layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
  3268. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  3269. layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
  3270. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  3271. layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
  3272. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
  3273. layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
  3274. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  3275. layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
  3276. }
  3277. } break;
  3278. case LLM_ARCH_ORION:
  3279. {
  3280. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  3281. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  3282. output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
  3283. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  3284. for (int i = 0; i < n_layer; ++i) {
  3285. auto & layer = layers[i];
  3286. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  3287. layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
  3288. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  3289. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  3290. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  3291. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  3292. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  3293. layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
  3294. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  3295. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  3296. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  3297. }
  3298. } break;
  3299. case LLM_ARCH_INTERNLM2:
  3300. {
  3301. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  3302. // output
  3303. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  3304. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  3305. for (int i = 0; i < n_layer; ++i) {
  3306. auto & layer = layers[i];
  3307. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  3308. // layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
  3309. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  3310. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  3311. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  3312. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  3313. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  3314. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  3315. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  3316. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  3317. }
  3318. } break;
  3319. case LLM_ARCH_GEMMA:
  3320. {
  3321. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  3322. // output
  3323. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  3324. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); // same as tok_embd, duplicated to allow offloading
  3325. for (int i = 0; i < n_layer; ++i) {
  3326. auto & layer = layers[i];
  3327. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  3328. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
  3329. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  3330. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
  3331. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
  3332. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  3333. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  3334. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  3335. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  3336. }
  3337. } break;
  3338. case LLM_ARCH_GEMMA2:
  3339. {
  3340. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  3341. // output
  3342. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  3343. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); // same as tok_embd, duplicated to allow offloading
  3344. for (int i = 0; i < n_layer; ++i) {
  3345. auto & layer = layers[i];
  3346. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  3347. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
  3348. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  3349. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
  3350. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
  3351. layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
  3352. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  3353. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  3354. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  3355. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  3356. layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
  3357. }
  3358. } break;
  3359. case LLM_ARCH_GEMMA3:
  3360. case LLM_ARCH_GEMMA_EMBEDDING:
  3361. {
  3362. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  3363. // output
  3364. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  3365. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  3366. // if output is NULL, init from the input tok embed
  3367. if (output == NULL) {
  3368. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  3369. }
  3370. // Dense linear weights
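// optional projection layers; these appear in EmbeddingGemma-style checkpoints and are absent from plain Gemma 3 models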
  3371. dense_2_out_layers = create_tensor(tn(LLM_TENSOR_DENSE_2_OUT, "weight"), {n_embd, hparams.dense_2_feat_out}, TENSOR_NOT_REQUIRED);
  3372. dense_3_out_layers = create_tensor(tn(LLM_TENSOR_DENSE_3_OUT, "weight"), {hparams.dense_3_feat_in, n_embd}, TENSOR_NOT_REQUIRED);
  3373. for (int i = 0; i < n_layer; ++i) {
  3374. auto & layer = layers[i];
  3375. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  3376. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
  3377. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  3378. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
  3379. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
  3380. layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
  3381. layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
  3382. layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
  3383. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  3384. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  3385. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  3386. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  3387. layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
  3388. }
  3389. } break;
  3390. case LLM_ARCH_GEMMA3N:
  3391. {
  3392. const int64_t n_altup = hparams.n_altup;
  3393. const int64_t laurel_rank = hparams.laurel_rank;
  3394. const int64_t n_embd_altup = hparams.n_embd_altup;
  3395. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  3396. // if output is NULL, init from the input tok embed
  3397. if (output == NULL) {
  3398. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  3399. }
  3400. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  3401. tok_embd_per_layer = create_tensor(tn(LLM_TENSOR_PER_LAYER_TOKEN_EMBD, "weight"), {n_embd_altup * n_layer, n_vocab}, 0);
  3402. altup_proj = create_tensor(tn(LLM_TENSOR_ALTUP_PROJ, "weight"), {n_embd, n_embd, n_altup - 1}, 0);
  3403. altup_unembd_proj = create_tensor(tn(LLM_TENSOR_ALTUP_UNEMBD_PROJ, "weight"), {n_embd, n_embd, n_altup - 1}, 0);
  3404. per_layer_model_proj = create_tensor(tn(LLM_TENSOR_PER_LAYER_MODEL_PROJ, "weight"), {n_embd, n_embd_altup * n_layer}, 0);
  3405. per_layer_proj_norm = create_tensor(tn(LLM_TENSOR_PER_LAYER_PROJ_NORM, "weight"), {n_embd_altup}, 0);
  3406. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  3407. for (int i = 0; i < n_layer; ++i) {
  3408. auto & layer = layers[i];
  3409. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  3410. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
  3411. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  3412. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
  3413. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
  3414. layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
  3415. layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
  3416. layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
  3417. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  3418. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  3419. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  3420. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  3421. layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
  3422. // altup & laurel
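// AltUp (alternating updates) router/correction projections plus the LAuReL low-rank residual: laurel_l ({n_embd, laurel_rank}) and laurel_r ({laurel_rank, n_embd}) factor the learned bypass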
  3423. layer.per_layer_inp_gate = create_tensor(tn(LLM_TENSOR_PER_LAYER_INP_GATE, "weight", i), {n_embd, n_embd_altup}, 0);
  3424. layer.per_layer_proj = create_tensor(tn(LLM_TENSOR_PER_LAYER_PROJ, "weight", i), {n_embd_altup, n_embd}, 0);
  3425. layer.per_layer_post_norm = create_tensor(tn(LLM_TENSOR_PER_LAYER_POST_NORM, "weight", i), {n_embd}, 0);
  3426. layer.altup_correct_coef = create_tensor(tn(LLM_TENSOR_ALTUP_CORRECT_COEF, "weight", i), {n_altup, n_altup}, 0);
  3427. layer.altup_correct_scale = create_tensor(tn(LLM_TENSOR_ALTUP_CORRECT_SCALE, "weight", i), {n_embd}, 0);
  3428. layer.altup_predict_coef = create_tensor(tn(LLM_TENSOR_ALTUP_PREDICT_COEF, "weight", i), {n_altup, n_altup * n_altup}, 0);
  3429. layer.altup_router = create_tensor(tn(LLM_TENSOR_ALTUP_ROUTER, "weight", i), {n_embd, n_altup}, 0);
  3430. layer.altup_router_norm = create_tensor(tn(LLM_TENSOR_ALTUP_ROUTER_NORM, "weight", i), {n_embd}, 0);
  3431. layer.laurel_l = create_tensor(tn(LLM_TENSOR_LAUREL_L, "weight", i), {n_embd, laurel_rank}, 0);
  3432. layer.laurel_r = create_tensor(tn(LLM_TENSOR_LAUREL_R, "weight", i), {laurel_rank, n_embd}, 0);
  3433. layer.laurel_post_norm = create_tensor(tn(LLM_TENSOR_LAUREL_POST_NORM, "weight", i), {n_embd}, 0);
  3434. }
  3435. } break;
  3436. case LLM_ARCH_STARCODER2:
  3437. {
  3438. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  3439. // output
  3440. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  3441. output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
  3442. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  3443. // if output is NULL, init from the input tok embed
  3444. if (output == NULL) {
  3445. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  3446. }
  3447. for (int i = 0; i < n_layer; ++i) {
  3448. auto & layer = layers[i];
  3449. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  3450. layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
  3451. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  3452. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  3453. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  3454. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
3455. // bias tensors (loaded as required for this architecture)
  3456. layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, 0);
  3457. layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, 0);
  3458. layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, 0);
  3459. layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
  3460. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  3461. layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
  3462. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  3463. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
3464. // FFN bias tensors (loaded as required for this architecture)
  3465. layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
  3466. layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP , "bias", i), { n_ff}, 0);
  3467. }
  3468. } break;
  3469. case LLM_ARCH_MAMBA:
  3470. {
  3471. const int64_t d_conv = hparams.ssm_d_conv;
  3472. const int64_t d_inner = hparams.ssm_d_inner;
  3473. const int64_t d_state = hparams.ssm_d_state;
  3474. const int64_t dt_rank = hparams.ssm_dt_rank;
  3475. // only an expansion factor of 2 is supported for now
  3476. if (2 * n_embd != d_inner) {
  3477. throw std::runtime_error("only an expansion factor of 2 is supported for now");
  3478. }
  3479. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  3480. // output
  3481. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  3482. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  3483. // if output is NULL, init from the input tok embed, duplicated to allow offloading
  3484. if (output == NULL) {
  3485. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  3486. }
  3487. for (int i = 0; i < n_layer; ++i) {
  3488. auto & layer = layers[i];
  3489. // norm
  3490. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  3491. layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), {n_embd, 2*d_inner}, 0);
  3492. layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {d_conv, d_inner}, 0);
  3493. layer.ssm_conv1d_b = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "bias", i), {d_inner}, 0);
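// x_proj: maps the inner activations to the dt (dt_rank) and B/C (d_state each) selective-SSM inputs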
  3494. layer.ssm_x = create_tensor(tn(LLM_TENSOR_SSM_X, "weight", i), {d_inner, dt_rank + 2*d_state}, 0);
  3495. layer.ssm_dt = create_tensor(tn(LLM_TENSOR_SSM_DT, "weight", i), {dt_rank, d_inner}, 0);
  3496. layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {d_inner}, 0);
  3497. // no "weight" suffix for these
  3498. layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {d_state, d_inner}, 0);
  3499. layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {d_inner}, 0);
  3500. // out_proj
  3501. layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {d_inner, n_embd}, 0);
  3502. }
  3503. } break;
  3504. case LLM_ARCH_MAMBA2:
  3505. {
  3506. const int64_t d_conv = hparams.ssm_d_conv;
  3507. const int64_t d_inner = hparams.ssm_d_inner;
  3508. const int64_t d_state = hparams.ssm_d_state;
  3509. const int64_t n_head = hparams.ssm_dt_rank;
  3510. const int64_t n_group = hparams.ssm_n_group;
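// the fused in_proj packs z and x (d_inner each), the grouped B and C (n_group*d_state each), and a per-head dt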
  3511. const int64_t d_in_proj = 2*d_inner + 2*n_group*d_state + n_head;
  3512. // only an expansion factor of 2 is supported for now
  3513. GGML_ASSERT(2 * n_embd == d_inner);
  3514. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  3515. // output
  3516. {
  3517. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  3518. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  3519. // if output is NULL, init from the input tok embed, duplicated to allow offloading
  3520. if (output == NULL) {
  3521. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  3522. }
  3523. }
  3524. for (int i = 0; i < n_layer; ++i) {
  3525. auto & layer = layers[i];
  3526. // norm
  3527. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  3528. layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), {n_embd, d_in_proj}, 0);
  3529. layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {d_conv, d_inner + 2*n_group*d_state}, 0);
  3530. layer.ssm_conv1d_b = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "bias", i), {d_inner + 2*n_group*d_state}, 0);
  3531. layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {n_head}, 0);
  3532. // no "weight" suffix for these
  3533. layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {1, n_head}, 0);
  3534. layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {1, n_head}, 0);
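// the SSM output norm is stored group-wise: {d_inner / n_group, n_group}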
  3535. layer.ssm_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", i), {d_inner / n_group, n_group}, 0);
  3536. // out_proj
  3537. layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {d_inner, n_embd}, 0);
  3538. }
  3539. } break;
  3540. case LLM_ARCH_JAMBA:
  3541. {
  3542. const int64_t d_conv = hparams.ssm_d_conv;
  3543. const int64_t d_inner = hparams.ssm_d_inner;
  3544. const int64_t d_state = hparams.ssm_d_state;
  3545. const int64_t dt_rank = hparams.ssm_dt_rank;
  3546. // only an expansion factor of 2 is supported for now
  3547. GGML_ASSERT(2 * n_embd == d_inner);
  3548. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  3549. // output
  3550. {
  3551. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  3552. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  3553. // if output is NULL, init from the input tok embed, duplicated to allow offloading
  3554. if (output == NULL) {
  3555. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  3556. }
  3557. }
  3558. for (int i = 0; i < n_layer; ++i) {
  3559. const int64_t n_head_kv = hparams.n_head_kv(i);
  3560. const int64_t n_embd_gqa = hparams.n_embd_v_gqa(i);
  3561. auto & layer = layers[i];
  3562. // norm
  3563. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
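// Jamba interleaves Mamba and attention blocks; a layer with zero KV heads is a Mamba layer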
  3564. if (n_head_kv == 0) {
  3565. // Mamba layer
  3566. layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), {n_embd, 2*d_inner}, 0);
  3567. layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {d_conv, d_inner}, 0);
  3568. layer.ssm_conv1d_b = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "bias", i), {d_inner}, 0);
  3569. layer.ssm_x = create_tensor(tn(LLM_TENSOR_SSM_X, "weight", i), {d_inner, dt_rank + 2*d_state}, 0);
  3570. layer.ssm_dt_norm = create_tensor(tn(LLM_TENSOR_SSM_DT_NORM, "weight", i), {dt_rank}, 0);
  3571. layer.ssm_dt = create_tensor(tn(LLM_TENSOR_SSM_DT, "weight", i), {dt_rank, d_inner}, 0);
  3572. layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {d_inner}, 0);
  3573. layer.ssm_b_norm = create_tensor(tn(LLM_TENSOR_SSM_B_NORM, "weight", i), {d_state}, 0);
  3574. layer.ssm_c_norm = create_tensor(tn(LLM_TENSOR_SSM_C_NORM, "weight", i), {d_state}, 0);
  3575. // no "weight" suffix for these
  3576. layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {d_state, d_inner}, 0);
  3577. layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {d_inner}, 0);
  3578. // out_proj
  3579. layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {d_inner, n_embd}, 0);
  3580. } else {
  3581. // Attention layers
  3582. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  3583. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  3584. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  3585. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  3586. }
  3587. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  3588. layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, TENSOR_NOT_REQUIRED);
  3589. if (layer.ffn_gate_inp) {
  3590. // MoE
  3591. layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
  3592. layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff, n_embd, n_expert}, 0);
  3593. layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
  3594. } else {
  3595. // FFN (no MoE)
  3596. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  3597. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
  3598. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  3599. }
  3600. }
  3601. } break;
  3602. case LLM_ARCH_GRANITE_HYBRID:
  3603. {
  3604. // mamba2 Mixer SSM params
  3605. // NOTE: int64_t for tensor dimensions
  3606. const int64_t d_conv = hparams.ssm_d_conv;
  3607. const int64_t d_inner = hparams.ssm_d_inner;
  3608. const int64_t d_state = hparams.ssm_d_state;
  3609. const int64_t n_ssm_head = hparams.ssm_dt_rank;
  3610. const int64_t n_group = hparams.ssm_n_group;
  3611. const int64_t d_in_proj = 2*d_inner + 2*n_group*d_state + n_ssm_head;
  3612. // only an expansion factor of 2 is supported for now
  3613. GGML_ASSERT(2 * n_embd == d_inner);
  3614. // embeddings
  3615. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  3616. // output
  3617. {
  3618. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  3619. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  3620. // if output is NULL, init from the input tok embed, duplicated to allow offloading
  3621. if (output == NULL) {
  3622. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  3623. }
  3624. }
                for (int i = 0; i < n_layer; ++i) {
                    auto & layer = layers[i];
                    // norm
                    layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
                    if (hparams.is_recurrent(i)) {
                        // ssm layers
                        layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), {n_embd, d_in_proj}, 0);
                        layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {d_conv, d_inner + 2*n_group*d_state}, 0);
                        layer.ssm_conv1d_b = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "bias", i), {d_inner + 2*n_group*d_state}, TENSOR_NOT_REQUIRED);
                        layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {n_ssm_head}, 0);
                        // no "weight" suffix for these
                        layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {1, n_ssm_head}, 0);
                        layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {1, n_ssm_head}, 0);
                        layer.ssm_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", i), {d_inner / n_group, n_group}, 0);
                        // out_proj
                        layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {d_inner, n_embd}, 0);
                    } else {
                        // attention layers (with optional bias)
                        const int64_t n_head_i = hparams.n_head(i);
                        const int64_t n_embd_k_gqa_i = hparams.n_embd_k_gqa(i);
                        const int64_t n_embd_v_gqa_i = hparams.n_embd_v_gqa(i);
                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head_i}, 0);
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa_i}, 0);
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa_i}, 0);
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head_i, n_embd}, 0);
                        layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
                        layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_k_gqa_i}, TENSOR_NOT_REQUIRED);
                        layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_v_gqa_i}, TENSOR_NOT_REQUIRED);
                        layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
                    }
                    // feed forward (w/ optional biases)
                    if (n_expert > 0) {
                        // MoE FFN
                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
                        layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
                        layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
                        layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, TENSOR_NOT_REQUIRED);
                        layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert}, 0);
                        layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
                        // For Granite MoE Shared
                        if (hparams.n_ff_shexp > 0) {
                            layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, 0);
                            layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, 0);
                            layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {hparams.n_ff_shexp, n_embd}, 0);
                        }
                    } else {
                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
                        layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
                        layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
                        layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
                        layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
                        layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
                    }
                }
            } break;
        case LLM_ARCH_XVERSE:
            {
                tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
                output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
                output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
                for (int i = 0; i < n_layer; ++i) {
                    auto & layer = layers[i];
                    layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
                    layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
                    layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
                    layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
                    layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
                    layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
                    layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
                    layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
                    layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
                }
            } break;
        case LLM_ARCH_COMMAND_R:
            {
                tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
                // output
                output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
                // init output from the input tok embed
                output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
                for (int i = 0; i < n_layer; ++i) {
                    auto & layer = layers[i];
                    layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
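                    // per-head Q/K norms only exist in the deeper Command R checkpoints; the
                    // n_layer >= 64 check below is how those variants are distinguished here, and
                    // the norm weights are stored per head ({n_embd_head_k, n_head})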
                    if (n_layer >= 64) {
                        layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k, n_head}, 0);
                        layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k, n_head_kv}, 0);
                    }
                    layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
                    layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
                    layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
                    layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
                    layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
                    layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
                    layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
                }
            } break;
        case LLM_ARCH_COHERE2:
            {
                tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
                // output
                output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
                // init output from the input tok embed
                output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, TENSOR_DUPLICATED);
                for (int i = 0; i < n_layer; ++i) {
                    auto & layer = layers[i];
                    layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);
                    layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd }, 0);
                    layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_gqa }, 0);
                    layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_gqa }, 0);
                    layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd, n_embd }, 0);
                    layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), { n_embd, n_ff }, 0);
                    layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd }, 0);
                    layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, n_ff }, 0);
                }
            } break;
        case LLM_ARCH_OLMO:  // adapted from LLM_ARCH_LLAMA with norm params removed
            {
                tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
                // output
                output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
                // if output is NULL, init from the input tok embed
                if (output == NULL) {
                    output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
                }
                for (int i = 0; i < n_layer; ++i) {
                    auto & layer = layers[i];
                    layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
                    layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
                    layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
                    layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
                    layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
                    layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
                    layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
                }
            } break;
        case LLM_ARCH_OLMO2:
            {
                const int64_t n_embd_head = n_embd / n_head;
                tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
                // output
                output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
                output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
                for (int i = 0; i < n_layer; ++i) {
                    auto & layer = layers[i];
                    layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
                    layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
                    layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
                    layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
                    layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, 0);
                    layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_head_kv * n_embd_head}, 0);
                    layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
                    layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
                    layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
                    layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
                    layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
                }
            } break;
        case LLM_ARCH_SEED_OSS:
            {
                const uint32_t head_dim = hparams.n_embd_head_k;
                const int64_t n_qo_dim = n_head * head_dim;
                const int64_t n_kv_dim = n_head_kv * head_dim;
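                // head_dim comes from hparams rather than being derived from n_embd, so the Q/O width
                // (n_head * head_dim) and the K/V width (n_head_kv * head_dim) used below may differ
                // from n_embd / n_embd_gqa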
                tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
                // output
                output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
                output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
                // if output is NULL, init from the input tok embed
                if (output == NULL) {
                    output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
                }
                for (int i = 0; i < n_layer; ++i) {
                    auto & layer = layers[i];
                    layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_qo_dim}, 0);
                    layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_kv_dim}, 0);
                    layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_kv_dim}, 0);
                    layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_qo_dim, n_embd}, 0);
                    layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_qo_dim}, TENSOR_NOT_REQUIRED);
                    layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_kv_dim}, TENSOR_NOT_REQUIRED);
                    layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_kv_dim}, TENSOR_NOT_REQUIRED);
                    layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
                    layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
                    layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
                    layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
                    layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
                }
            } break;
        case LLM_ARCH_OLMOE:
            {
                tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
                // output
                output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
                output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
                for (int i = 0; i < n_layer; ++i) {
                    auto & layer = layers[i];
                    layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
                    layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
                    layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
                    layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
                    layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
                    layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, 0);
                    layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, 0);
                    layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
                    layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
                    if (n_expert == 0) {
                        throw std::runtime_error("n_expert must be > 0");
                    }
                    if (n_expert_used == 0) {
                        throw std::runtime_error("n_expert_used must be > 0");
                    }
                    // MoE branch
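                    // (each expert weight is stored as a single 3D tensor with the expert index as the
                    // outermost dimension, e.g. {n_embd, n_ff, n_expert}, rather than one tensor per expert)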
                    layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
                    layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff, n_embd, n_expert}, 0);
                    layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
                }
            } break;
        case LLM_ARCH_OPENELM:
            {
                tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
                // output
                output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
                // init output from the input tok embed
                output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
                for (int i = 0; i < n_layer; ++i) {
                    const int64_t n_head      = hparams.n_head(i);
                    const int64_t n_head_qkv  = 2*hparams.n_head_kv(i) + n_head;
                    const int64_t n_ff        = hparams.n_ff(i);
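                    // OpenELM varies the head and FFN counts per layer, hence the per-layer hparams
                    // lookups above; the fused QKV width below is n_head_qkv * n_embd_head_k for this layer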
                    auto & layer = layers[i];
                    layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
                    layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_head_qkv*n_embd_head_k}, 0);
                    layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
                    layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
                    layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_head*n_embd_head_k, n_embd}, 0);
                    layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
                    layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
                    layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
                    layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
                }
            } break;
        case LLM_ARCH_GPTNEOX:
            {
                tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
                // output
                output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
                output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
                output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
                for (int i = 0; i < n_layer; ++i) {
                    auto & layer = layers[i];
                    layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
                    layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
                    layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
                    layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
                    layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
                    layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
                    layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
                    layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
                    layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
                    layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
                    layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
                    layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
                }
            } break;
        case LLM_ARCH_ARCTIC:
            {
                tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
                // output
                output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
                output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
                // if output is NULL, init from the input tok embed
                if (output == NULL) {
                    output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
                }
                for (int i = 0; i < n_layer; ++i) {
                    auto & layer = layers[i];
                    layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
                    layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
                    layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
                    layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
                    layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
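                    // in addition to the routed experts below, every Arctic layer also loads a dense
                    // gated MLP whose intermediate width equals n_embd (hence the {n_embd, n_embd}
                    // shapes) plus a separate norm (ffn_norm_exps) for the expert branch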
                    layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
                    layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_embd}, 0);
                    layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_embd, n_embd}, 0);
                    layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_embd}, 0);
                    layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
                    layer.ffn_norm_exps = create_tensor(tn(LLM_TENSOR_FFN_NORM_EXPS, "weight", i), {n_embd}, 0);
                    layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, false);
                    layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert}, 0);
                    layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
                }
            } break;
        case LLM_ARCH_DEEPSEEK:
            {
                const int64_t n_ff_exp        = hparams.n_ff_exp;
                const int64_t n_expert_shared = hparams.n_expert_shared;
                tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
                // output
                output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
                output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
                for (int i = 0; i < n_layer; ++i) {
                    auto & layer = layers[i];
                    layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
                    layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
                    layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
                    layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
                    layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
                    layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
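                    // the first hparams.n_layer_dense_lead layers use a plain dense FFN; the remaining
                    // layers are MoE, with the shared experts fused into single tensors of width
                    // n_ff_exp * n_expert_shared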
                    if (i < (int) hparams.n_layer_dense_lead) {
                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
                        layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
                    } else {
                        layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
                        if (n_expert == 0) {
                            throw std::runtime_error("n_expert must be > 0");
                        }
                        if (n_expert_used == 0) {
                            throw std::runtime_error("n_expert_used must be > 0");
                        }
                        // MoE branch
                        layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
                        layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
                        layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
                        // Shared expert branch
                        layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
                        layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_exp * n_expert_shared, n_embd}, 0);
                        layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
                    }
                }
            } break;
        case LLM_ARCH_DEEPSEEK2:
            {
                // lite variants include DeepSeek-V2-Lite, GigaChat3-10B-A1.8B
                const bool is_lite = (hparams.n_layer == 27 || hparams.n_layer == 26);
                const bool is_mla = (hparams.n_embd_head_k_mla != 0 && hparams.n_embd_head_v_mla != 0);
                // note: these are the actual head sizes you get when treating as MHA or after "decompression" using wv_b for MLA
                const int64_t n_embd_head_k_mla = is_mla ? hparams.n_embd_head_k_mla : hparams.n_embd_head_k;
                const int64_t n_embd_head_v_mla = is_mla ? hparams.n_embd_head_v_mla : hparams.n_embd_head_v;
                const int64_t n_embd_head_qk_rope = hparams.n_rot;
                const int64_t n_embd_head_qk_nope = n_embd_head_k_mla - n_embd_head_qk_rope;
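                // each attention head is split into a rotary part (n_embd_head_qk_rope = n_rot) and a
                // non-rotary part (n_embd_head_qk_nope); with MLA the K/V projections are additionally
                // factored through the kv_lora_rank bottleneck via the wk_b/wv_b tensors below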
                const int64_t q_lora_rank  = hparams.n_lora_q;
                const int64_t kv_lora_rank = hparams.n_lora_kv;
                const int64_t n_ff_exp        = hparams.n_ff_exp;
                const int64_t n_expert_shared = hparams.n_expert_shared;
                tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
                // output
                output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
                output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
                for (int i = 0; i < n_layer; ++i) {
                    auto & layer = layers[i];
                    layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
                    if (!is_lite) {
                        layer.attn_q_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_A_NORM, "weight", i), {q_lora_rank}, 0);
                    }
                    layer.attn_kv_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_NORM, "weight", i), {kv_lora_rank}, 0);
                    if (!is_lite) {
                        layer.wq_a = create_tensor(tn(LLM_TENSOR_ATTN_Q_A, "weight", i), {n_embd, q_lora_rank}, 0);
                        layer.wq_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_B, "weight", i), {q_lora_rank, n_head * n_embd_head_k_mla}, 0);
                    } else {
                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_head * n_embd_head_k_mla}, 0);
                    }
                    layer.wkv_a_mqa = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + n_embd_head_qk_rope}, 0);
                    // note: only old legacy GGUF files will have the unsplit wkv_b tensor in
                    if (is_mla) {
                        layer.wk_b = create_tensor(tn(LLM_TENSOR_ATTN_K_B, "weight", i), {n_embd_head_qk_nope, kv_lora_rank, n_head}, 0);
                        layer.wv_b = create_tensor(tn(LLM_TENSOR_ATTN_V_B, "weight", i), {kv_lora_rank, n_embd_head_v_mla, n_head}, 0);
                    } else {
                        layer.wkv_b = create_tensor(tn(LLM_TENSOR_ATTN_KV_B, "weight", i), {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v_mla)}, 0);
                    }
                    layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_head * n_embd_head_v_mla, n_embd}, 0);
                    layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
                    if (i < (int) hparams.n_layer_dense_lead) {
                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
                        layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
                    } else {
                        layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
                        layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, TENSOR_NOT_REQUIRED);
                        if (n_expert == 0) {
                            throw std::runtime_error("n_expert must be > 0");
                        }
                        if (n_expert_used == 0) {
                            throw std::runtime_error("n_expert_used must be > 0");
                        }
                        // MoE branch
                        layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
                        layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
                        layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
                        // Shared expert branch
                        layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
                        layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_exp * n_expert_shared, n_embd}, 0);
                        layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
                    }
                }
            } break;
        case LLM_ARCH_PLM:
            {
                const int64_t n_embd_head_qk_rope = hparams.n_rot;
                const int64_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
                const int64_t kv_lora_rank = hparams.n_lora_kv;
                tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
                // output
                output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
                // output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
                output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
                for (int i = 0; i < n_layer; ++i) {
                    auto & layer = layers[i];
                    layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
                    layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
                    layer.wkv_a_mqa = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + (n_embd_head_qk_rope)}, 0);
                    layer.attn_kv_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_NORM, "weight", i), {kv_lora_rank}, 0);
                    layer.wkv_b = create_tensor(tn(LLM_TENSOR_ATTN_KV_B, "weight", i), {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)}, 0);
                    layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_head * ( n_embd_head_v), n_embd}, 0);
                    layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
                    layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
                    layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
                }
            } break;
        case LLM_ARCH_BITNET:
            {
                tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
                // output
                output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
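                // BitNet checkpoints may carry a single-element "scale" tensor next to each projection;
                // these are optional below (TENSOR_NOT_REQUIRED), so exports without explicit scales still load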
                for (int i = 0; i < n_layer; ++i) {
                    auto & layer = layers[i];
                    layer.attn_norm     = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
                    layer.attn_sub_norm = create_tensor(tn(LLM_TENSOR_ATTN_SUB_NORM, "weight", i), {n_embd}, 0);
                    layer.wq       = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
                    layer.wq_scale = create_tensor(tn(LLM_TENSOR_ATTN_Q, "scale", i), {1}, TENSOR_NOT_REQUIRED);
                    layer.wk       = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
                    layer.wk_scale = create_tensor(tn(LLM_TENSOR_ATTN_K, "scale", i), {1}, TENSOR_NOT_REQUIRED);
                    layer.wv       = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
                    layer.wv_scale = create_tensor(tn(LLM_TENSOR_ATTN_V, "scale", i), {1}, TENSOR_NOT_REQUIRED);
                    layer.wo       = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
                    layer.wo_scale = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "scale", i), {1}, TENSOR_NOT_REQUIRED);
                    layer.ffn_norm     = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
                    layer.ffn_sub_norm = create_tensor(tn(LLM_TENSOR_FFN_SUB_NORM, "weight", i), {n_ff}, 0);
                    layer.ffn_gate       = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
                    layer.ffn_gate_scale = create_tensor(tn(LLM_TENSOR_FFN_GATE, "scale", i), {1}, TENSOR_NOT_REQUIRED);
                    layer.ffn_down       = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
                    layer.ffn_down_scale = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "scale", i), {1}, TENSOR_NOT_REQUIRED);
                    layer.ffn_up       = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
                    layer.ffn_up_scale = create_tensor(tn(LLM_TENSOR_FFN_UP, "scale", i), {1}, TENSOR_NOT_REQUIRED);
                }
            } break;
        case LLM_ARCH_T5:
            {
                const auto n_rel_attn_bkts = hparams.n_rel_attn_bkts;
                tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
                // output
                output_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_OUTPUT_NORM, "weight"), {n_embd}, 0);
                output_norm     = create_tensor(tn(LLM_TENSOR_DEC_OUTPUT_NORM, "weight"), {n_embd}, 0);
                output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
                // if output is NULL, init from the input tok embed
                if (output == NULL) {
                    output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
                }
                // n_layer:     number of encoder_layers
                // dec_n_layer: number of decoder_layers
                const int dec_n_layer = hparams.dec_n_layer;
                if (dec_n_layer > n_layer) {
                    layers.resize(dec_n_layer);
                }
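                // encoder and decoder share the layers vector: when the decoder is deeper, the vector
                // is grown above so that decoder layer i reuses slot i alongside the encoder tensors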
                // load encoder layers
                for (int i = 0; i < n_layer; ++i) {
                    auto & layer = layers[i];
                    layer.attn_norm_enc  = create_tensor(tn(LLM_TENSOR_ENC_ATTN_NORM,  "weight", i), {n_embd}, 0);
                    layer.attn_rel_b_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_REL_B, "weight", i), {n_head, n_rel_attn_bkts}, TENSOR_NOT_REQUIRED);
                    layer.wq_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_Q,   "weight", i), {n_embd, n_embd_k_gqa}, 0);
                    layer.wk_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa}, 0);
                    layer.wv_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa}, 0);
                    layer.wo_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_OUT, "weight", i), {n_embd_v_gqa, n_embd}, 0);
                    layer.ffn_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_NORM, "weight", i), {n_embd}, 0);
                    layer.ffn_gate_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_GATE, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED);
                    layer.ffn_down_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
                    layer.ffn_up_enc   = create_tensor(tn(LLM_TENSOR_ENC_FFN_UP,   "weight", i), {n_embd, n_ff}, 0);
                }
                // load decoder layers
                for (int i = 0; i < dec_n_layer; ++i) {
                    auto & layer = layers[i];
                    layer.attn_norm  = create_tensor(tn(LLM_TENSOR_DEC_ATTN_NORM,  "weight", i), {n_embd}, 0);
                    layer.attn_rel_b = create_tensor(tn(LLM_TENSOR_DEC_ATTN_REL_B, "weight", i), {n_head, n_rel_attn_bkts}, TENSOR_NOT_REQUIRED);
                    layer.wq = create_tensor(tn(LLM_TENSOR_DEC_ATTN_Q,   "weight", i), {n_embd, n_embd_k_gqa}, 0);
                    layer.wk = create_tensor(tn(LLM_TENSOR_DEC_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa}, 0);
                    layer.wv = create_tensor(tn(LLM_TENSOR_DEC_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa}, 0);
                    layer.wo = create_tensor(tn(LLM_TENSOR_DEC_ATTN_OUT, "weight", i), {n_embd_v_gqa, n_embd}, 0);
                    layer.attn_norm_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_NORM, "weight", i), {n_embd}, 0);
                    // this tensor seems to be unused in HF transformers implementation
                    layer.attn_rel_b_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_REL_B, "weight", i), {n_head, n_rel_attn_bkts}, TENSOR_NOT_REQUIRED);
                    layer.wq_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_Q,   "weight", i), {n_embd, n_embd_k_gqa}, 0);
                    layer.wk_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa}, 0);
                    layer.wv_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa}, 0);
                    layer.wo_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_OUT, "weight", i), {n_embd_v_gqa, n_embd}, 0);
                    layer.ffn_norm = create_tensor(tn(LLM_TENSOR_DEC_FFN_NORM, "weight", i), {n_embd}, 0);
                    layer.ffn_gate = create_tensor(tn(LLM_TENSOR_DEC_FFN_GATE, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED);
                    layer.ffn_down = create_tensor(tn(LLM_TENSOR_DEC_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
                    layer.ffn_up   = create_tensor(tn(LLM_TENSOR_DEC_FFN_UP,   "weight", i), {n_embd, n_ff}, 0);
                }
            } break;
        case LLM_ARCH_T5ENCODER:
            {
                const auto n_rel_attn_bkts = hparams.n_rel_attn_bkts;
                tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
                // output
                output_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_OUTPUT_NORM, "weight"), {n_embd}, 0);
                output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
                // if output is NULL, init from the input tok embed
                if (output == NULL) {
                    output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
                }
                for (int i = 0; i < n_layer; ++i) {
                    auto & layer = layers[i];
                    layer.attn_norm_enc  = create_tensor(tn(LLM_TENSOR_ENC_ATTN_NORM,  "weight", i), {n_embd}, 0);
                    layer.attn_rel_b_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_REL_B, "weight", i), {n_head, n_rel_attn_bkts}, TENSOR_NOT_REQUIRED);
                    layer.wq_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_Q,   "weight", i), {n_embd, n_embd_k_gqa}, 0);
                    layer.wk_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa}, 0);
                    layer.wv_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa}, 0);
                    layer.wo_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_OUT, "weight", i), {n_embd_v_gqa, n_embd}, 0);
                    layer.ffn_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_NORM, "weight", i), {n_embd}, 0);
                    layer.ffn_gate_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_GATE, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED);
                    layer.ffn_down_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
                    layer.ffn_up_enc   = create_tensor(tn(LLM_TENSOR_ENC_FFN_UP,   "weight", i), {n_embd, n_ff}, 0);
                }
            } break;
        case LLM_ARCH_JAIS:
            {
                tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
                // output
                output_norm   = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
                output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd}, 0);
                output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
                for (int i = 0; i < n_layer; ++i) {
                    auto & layer = layers[i];
                    layer.attn_norm   = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
                    layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias",   i), {n_embd}, 0);
                    layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
                    layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias",   i), {n_embd + 2*n_embd_gqa}, 0);
                    layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
                    layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias",   i), {n_embd}, 0);
                    layer.ffn_norm   = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
                    layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias",   i), {n_embd}, 0);
                    layer.ffn_down   = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
                    layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias",   i), {n_embd}, 0);
                    layer.ffn_gate   = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
                    layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias",   i), {n_ff}, 0);
                    layer.ffn_up     = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff}, 0);
                    layer.ffn_up_b   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "bias",   i), {n_ff}, 0);
                }
            } break;
        case LLM_ARCH_CHATGLM:
            {
                tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
                // output
                output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
                output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
                // if output is NULL, init from the input tok embed
                if (output == NULL) {
                    output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
                }
                for (int i = 0; i < n_layer; ++i) {
                    auto & layer = layers[i];
                    layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
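                    // some ChatGLM exports fuse QKV into a single tensor while others ship split
                    // Q/K/V projections; the fused form is tried first and the split form is the fallback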
                    layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
                    layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias",   i), {n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
                    if (layer.wqkv == nullptr) {
                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
                        layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
                        layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
                        layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
                    }
                    layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
                    layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
                    layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff * 2}, 0);
                    layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
                }
            } break;
        case LLM_ARCH_GLM4:
            {
                tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
                // output
                output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
                output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
                // if output is NULL, init from the input tok embed
                if (output == NULL) {
                    output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
                }
                for (int i = 0; i < n_layer; ++i) {
                    auto & layer = layers[i];
                    layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
                    layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
                    layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias",   i), {n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
                    if (layer.wqkv == nullptr) {
                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
                        layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
                        layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
                        layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
                    }
                    layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
                    layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
                    layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
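                    // note: ffn_up is twice n_ff wide - the gate and up halves of the gated MLP appear
                    // to be packed into this single tensor, so no separate ffn_gate is loaded here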
                    layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
                    layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff * 2}, 0);
                    layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
                }
            } break;
        case LLM_ARCH_GLM4_MOE:
            {
                const int64_t n_expert        = hparams.n_expert;
                const int64_t n_expert_used   = hparams.n_expert_used;
                const int64_t n_expert_shared = hparams.n_expert_shared;
                GGML_ASSERT(hparams.n_expert > 0 && "n_expert must be > 0 for GLM4_MOE MoE layers");
                GGML_ASSERT(hparams.n_expert_used > 0 && "n_expert_used must be > 0 for GLM4_MOE MoE layers");
                tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
                // output
                output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
                output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED);
                // if output is NULL, init from the input tok embed
                if (output == NULL) {
                    output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, TENSOR_DUPLICATED);
                }
                // Load ALL tensors including NextN layer to satisfy total tensor count
                // but only PROCESS up to last layer (skipping final NextN layer) in forward pass
                for (int i = 0; i < n_layer; ++i) {
                    int flags = 0;
                    if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
                        // skip all tensors in the NextN layers
                        flags |= TENSOR_SKIP;
                    }
                    auto & layer = layers[i];
                    layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, flags);
                    // GLM-style attention with bias terms
                    layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd_head_k * n_head }, flags);
                    layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_k_gqa }, flags);
                    layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_v_gqa }, flags);
                    layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), { n_embd_head_k * n_head }, flags);
                    layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), { n_embd_k_gqa }, flags);
                    layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), { n_embd_v_gqa }, flags);
                    layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, flags);
                    // K/Q norm tensors (optional for GLM-4.5 355B variant)
                    layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), { n_embd_head_k }, TENSOR_NOT_REQUIRED | flags);
                    layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), { n_embd_head_k }, TENSOR_NOT_REQUIRED | flags);
                    layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), { n_embd }, flags);
                    // Check if this layer uses MoE or dense FFN based on n_layer_dense_lead
                    // GLM 4.5 uses hybrid architecture: layer 0 is dense, layers 1+ are MoE
                    const bool use_moe = (static_cast<uint32_t>(i) >= hparams.n_layer_dense_lead);
                    if (use_moe) {
                        // MoE layers
                        layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), { n_embd, n_expert }, flags);
                        layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), { n_expert }, flags);
                        // MoE branch
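                        // (if the GGUF does not provide an expert FFN size, fall back to splitting n_ff
                        // evenly across the experts used per token)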
                        const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
                        layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, flags);
                        layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff_exp, n_embd, n_expert }, flags);
                        layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), { n_embd, n_ff_exp, n_expert }, flags);
                        // Shared expert
                        if (n_expert_shared > 0) {
                            const int64_t n_ff_shexp = n_ff_exp * n_expert_shared;
                            layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), { n_embd, n_ff_shexp }, flags);
                            layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_shexp, n_embd }, flags);
                            layer.ffn_up_shexp   = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP,   "weight", i), { n_embd, n_ff_shexp }, flags);
                        }
                    } else {
                        // Dense layers (first k layers) - GLM uses separate gate/up projections
                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), { n_embd, n_ff }, flags);
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd }, flags);
                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), { n_embd, n_ff }, flags);
                    }
                    // NextN/MTP tensors (preserved but unused) - conditionally load for last nextn_predict_layers
                    if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
                        layer.nextn.eh_proj = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "weight", i), { 2 * n_embd, n_embd }, flags);
                        layer.nextn.enorm   = create_tensor(tn(LLM_TENSOR_NEXTN_ENORM,   "weight", i), { n_embd }, flags);
                        layer.nextn.hnorm   = create_tensor(tn(LLM_TENSOR_NEXTN_HNORM,   "weight", i), { n_embd }, flags);
                        // Optional tensors
                        layer.nextn.embed_tokens     = create_tensor(tn(LLM_TENSOR_NEXTN_EMBED_TOKENS,     "weight", i), { n_embd, n_vocab }, flags | TENSOR_NOT_REQUIRED);
                        layer.nextn.shared_head_head = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "weight", i), { n_embd, n_vocab }, flags | TENSOR_NOT_REQUIRED);
                        layer.nextn.shared_head_norm = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "weight", i), { n_embd }, flags | TENSOR_NOT_REQUIRED);
                    }
                }
            } break;
        case LLM_ARCH_NEMOTRON:
            {
                tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
                // output
                output_norm   = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
                output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd}, 0);
                output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
                for (int i = 0; i < n_layer; ++i) {
                    auto & layer = layers[i];
                    layer.attn_norm   = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
                    layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias",   i), {n_embd}, 0);
                    layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
                    layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
                    layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
                    layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
                    // optional bias tensors
                    layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
                    layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
                    layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
                    layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
                    layer.ffn_norm   = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
                    layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias",   i), {n_embd}, 0);
                    layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
                    layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff}, 0);
                    // optional MLP bias
                    layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
                    layer.ffn_up_b   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
                }
            } break;
        case LLM_ARCH_NEMOTRON_H:
            {
                // mamba2 Mixer SSM params
                // NOTE: int64_t for tensor dimensions
                const int64_t d_conv     = hparams.ssm_d_conv;
                const int64_t d_inner    = hparams.ssm_d_inner;
                const int64_t d_state    = hparams.ssm_d_state;
                const int64_t n_ssm_head = hparams.ssm_dt_rank;
                const int64_t n_group    = hparams.ssm_n_group;
                const int64_t d_in_proj  = 2*d_inner + 2*n_group*d_state + n_ssm_head;
                // embeddings
                tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
                // output
                {
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
                    // if output is NULL, init from the input tok embed, duplicated to allow offloading
                    if (output == NULL) {
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
                    }
                }
                for (int i = 0; i < n_layer; ++i) {
                    auto & layer = layers[i];
                    // all blocks use the attn norm
                    layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
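                    // three block types share this loop: recurrent layers get Mamba2 SSM tensors,
                    // layers with n_ff(i) == 0 get attention tensors, and everything else gets a
                    // plain (gateless) MLP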
                    if (hparams.is_recurrent(i)) {
                        // ssm layers
                        layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), {n_embd, d_in_proj}, 0);
                        layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {d_conv, d_inner + 2*n_group*d_state}, 0);
                        layer.ssm_conv1d_b = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "bias", i), {d_inner + 2*n_group*d_state}, TENSOR_NOT_REQUIRED);
                        layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {n_ssm_head}, 0);
                        // no "weight" suffix for these
                        layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {1, n_ssm_head}, 0);
                        layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {1, n_ssm_head}, 0);
                        layer.ssm_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", i), {d_inner / n_group, n_group}, 0);
                        // out_proj
                        layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {d_inner, n_embd}, 0);
                    } else if (hparams.n_ff(i) == 0) {
                        // attention layers (with optional bias)
                        const int64_t n_head_i = hparams.n_head(i);
                        const int64_t n_embd_k_gqa_i = hparams.n_embd_k_gqa(i);
                        const int64_t n_embd_v_gqa_i = hparams.n_embd_v_gqa(i);
                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head_i}, 0);
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa_i}, 0);
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa_i}, 0);
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head_i, n_embd}, 0);
                        layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
                        layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_k_gqa_i}, TENSOR_NOT_REQUIRED);
                        layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_v_gqa_i}, TENSOR_NOT_REQUIRED);
                        layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
                    } else {
                        // mlp layers
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { hparams.n_ff(i), n_embd}, 0);
                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, hparams.n_ff(i)}, 0);
                        layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
                        layer.ffn_up_b   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "bias", i), {hparams.n_ff(i)}, TENSOR_NOT_REQUIRED);
                    }
                }
            } break;
        case LLM_ARCH_EXAONE:
            {
                tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
                // output
                output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
                output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
                // if output is NULL, init from the input tok embed
                if (output == NULL) {
                    output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
                }
                for (int i = 0; i < n_layer; ++i) {
                    auto & layer = layers[i];
                    layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
                    layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
                    layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
                    layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
                    layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
                    layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
                    layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
                    layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
                    layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
                    layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff}, 0);
                }
            } break;
        case LLM_ARCH_EXAONE4:
            {
                tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
                // output
                output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
                output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
                // if output is NULL, init from the input tok embed
                if (output == NULL) {
                    output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
                }
                for (int i = 0; i < n_layer; ++i) {
                    auto & layer = layers[i];
                    layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
                    layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
                    layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
                    layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
                    layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
                    layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
                    layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
                    layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
                    layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
                    layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
                    layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff}, 0);
                    layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
                }
            } break;
        case LLM_ARCH_RWKV6:
            {
                tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
                // Block 0, LN0
                tok_norm   = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0);
                tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"),   {n_embd}, 0);
                // output
                output_norm   = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
                output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd}, 0);
                output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
                const int time_mix_extra_dim   = hparams.time_mix_extra_dim;
                const int time_decay_extra_dim = hparams.time_decay_extra_dim;
                const int head_size        = hparams.wkv_head_size;
                const int attn_hidden_size = n_embd;
                const int ffn_size         = hparams.n_ff_arr[0];
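                // each RWKV6 block loads two tensor groups: the "time mix" group (attention-like state
                // update, including the low-rank lerp/decay adapters of rank time_mix_extra_dim and
                // time_decay_extra_dim) and the "channel mix" group (FFN-like, of width ffn_size)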
  4473. for (int i = 0; i < n_layer; ++i) {
  4474. auto & layer = layers[i];
  4475. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  4476. layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
  4477. layer.attn_norm_2 = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, 0);
  4478. layer.attn_norm_2_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, 0);
  4479. layer.time_mix_w1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W1, "weight", i), {n_embd, time_mix_extra_dim * 5}, 0);
  4480. layer.time_mix_w2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W2, "weight", i), {time_mix_extra_dim, n_embd, 5}, 0);
  4481. layer.time_mix_lerp_x = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_X, "weight", i), {n_embd, 1, 1}, 0);
  4482. layer.time_mix_lerp_w = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_W, "weight", i), {n_embd, 1, 1}, TENSOR_NOT_REQUIRED);
  4483. layer.time_mix_lerp_k = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_K, "weight", i), {n_embd, 1, 1}, TENSOR_NOT_REQUIRED);
  4484. layer.time_mix_lerp_v = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_V, "weight", i), {n_embd, 1, 1}, TENSOR_NOT_REQUIRED);
  4485. layer.time_mix_lerp_r = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_R, "weight", i), {n_embd, 1, 1}, TENSOR_NOT_REQUIRED);
  4486. layer.time_mix_lerp_g = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_G, "weight", i), {n_embd, 1, 1}, TENSOR_NOT_REQUIRED);
  4487. layer.time_mix_lerp_fused = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_FUSED, "weight", i), {n_embd, 1, 1, 5}, TENSOR_NOT_REQUIRED);
  4488. GGML_ASSERT(!(layer.time_mix_lerp_fused == NULL && layer.time_mix_lerp_w == NULL));
  4489. layer.time_mix_first = create_tensor(tn(LLM_TENSOR_TIME_MIX_FIRST, "weight", i), {head_size, n_embd / head_size}, 0);
  4490. layer.time_mix_decay = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY, "weight", i), {n_embd}, 0);
  4491. layer.time_mix_decay_w1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY_W1, "weight", i), {n_embd, time_decay_extra_dim}, 0);
  4492. layer.time_mix_decay_w2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY_W2, "weight", i), {time_decay_extra_dim, attn_hidden_size}, 0);
  4493. layer.time_mix_key = create_tensor(tn(LLM_TENSOR_TIME_MIX_KEY, "weight", i), {attn_hidden_size, n_embd}, 0);
  4494. layer.time_mix_value = create_tensor(tn(LLM_TENSOR_TIME_MIX_VALUE, "weight", i), {attn_hidden_size, n_embd}, 0);
  4495. layer.time_mix_receptance = create_tensor(tn(LLM_TENSOR_TIME_MIX_RECEPTANCE, "weight", i), {attn_hidden_size, n_embd}, 0);
  4496. layer.time_mix_gate = create_tensor(tn(LLM_TENSOR_TIME_MIX_GATE, "weight", i), {attn_hidden_size, n_embd}, 0);
  4497. layer.time_mix_ln = create_tensor(tn(LLM_TENSOR_TIME_MIX_LN, "weight", i), {n_embd}, 0);
  4498. layer.time_mix_ln_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_LN, "bias", i), {n_embd}, 0);
  4499. layer.time_mix_output = create_tensor(tn(LLM_TENSOR_TIME_MIX_OUTPUT, "weight", i), {n_embd, attn_hidden_size}, 0);
  4500. layer.channel_mix_lerp_k = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_LERP_K, "weight", i), {n_embd, 1, 1}, 0);
  4501. layer.channel_mix_lerp_r = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_LERP_R, "weight", i), {n_embd, 1, 1}, 0);
  4502. layer.channel_mix_key = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_KEY, "weight", i), {n_embd, ffn_size}, 0);
  4503. layer.channel_mix_value = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_VALUE, "weight", i), {ffn_size, n_embd}, 0);
  4504. layer.channel_mix_receptance = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_RECEPTANCE, "weight", i), {n_embd, n_embd}, 0);
  4505. }
  4506. } break;
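    // RWKV6-Qwen2 hybrid: RWKV-6 time mixing combined with a gated FFN (ffn_gate/ffn_up/ffn_down).
    // The key/value projections may be narrower than n_embd when the source model uses grouped
    // KV heads, hence the attn_key_value_size computation below.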
    case LLM_ARCH_RWKV6QWEN2:
        {
            tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
            output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
            output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, TENSOR_NOT_REQUIRED);
            output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
            const int time_mix_extra_dim = hparams.time_mix_extra_dim;
            const int time_decay_extra_dim = hparams.time_decay_extra_dim;
            const int head_size = hparams.wkv_head_size;
            const int attn_hidden_size = n_embd;
            const int n_head_kv = hparams.n_head_kv();
            int attn_key_value_size;
            if (n_head_kv == 0 || attn_hidden_size / head_size == n_head_kv) {
                attn_key_value_size = attn_hidden_size;
            } else {
                attn_key_value_size = n_head_kv * head_size;
            }
            for (int i = 0; i < n_layer; ++i) {
                auto & layer = layers[i];
                layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
                layer.time_mix_w1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W1, "weight", i), {n_embd, time_mix_extra_dim * 5}, 0);
                layer.time_mix_w2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W2, "weight", i), {time_mix_extra_dim, n_embd, 5}, 0);
                layer.time_mix_lerp_x = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_X, "weight", i), {n_embd, 1, 1}, 0);
                layer.time_mix_lerp_fused = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_FUSED, "weight", i), {n_embd, 1, 1, 5}, 0);
                layer.time_mix_first = create_tensor(tn(LLM_TENSOR_TIME_MIX_FIRST, "weight", i), {head_size, n_embd / head_size}, TENSOR_NOT_REQUIRED);
                layer.time_mix_decay = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY, "weight", i), {n_embd}, 0);
                layer.time_mix_decay_w1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY_W1, "weight", i), {n_embd, time_decay_extra_dim}, 0);
                layer.time_mix_decay_w2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY_W2, "weight", i), {time_decay_extra_dim, attn_hidden_size}, 0);
                layer.time_mix_key = create_tensor(tn(LLM_TENSOR_TIME_MIX_KEY, "weight", i), {n_embd, attn_key_value_size}, 0);
                layer.time_mix_value = create_tensor(tn(LLM_TENSOR_TIME_MIX_VALUE, "weight", i), {n_embd, attn_key_value_size}, 0);
                layer.time_mix_receptance = create_tensor(tn(LLM_TENSOR_TIME_MIX_RECEPTANCE, "weight", i), {attn_hidden_size, n_embd}, 0);
                layer.time_mix_gate = create_tensor(tn(LLM_TENSOR_TIME_MIX_GATE, "weight", i), {attn_hidden_size, n_embd}, 0);
                // optional bias tensors
                layer.time_mix_key_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_KEY, "bias", i), {attn_key_value_size}, TENSOR_NOT_REQUIRED);
                layer.time_mix_value_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_VALUE, "bias", i), {attn_key_value_size}, TENSOR_NOT_REQUIRED);
                layer.time_mix_receptance_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_RECEPTANCE, "bias", i), {attn_hidden_size}, TENSOR_NOT_REQUIRED);
                layer.time_mix_output = create_tensor(tn(LLM_TENSOR_TIME_MIX_OUTPUT, "weight", i), {n_embd, attn_hidden_size}, 0);
                layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
                layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
                layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
                layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
            }
        } break;
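    // RWKV-7: low-rank components for the decay (w0/w1/w2), in-context learning rate (a0/a1/a2),
    // value residual mix (v0/v1/v2) and gate (g1/g2). Layer 0 loads its v* tensors with the iclr
    // rank only to match the checkpoint layout ("actually not used"); the token-shift lerps are
    // always fused here ({n_embd, 1, 1, 6}).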
    case LLM_ARCH_RWKV7:
        {
            tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
            // Block 0, LN0
            tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0);
            tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}, 0);
            // output
            output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
            output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
            output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
            const int n_lora_decay = hparams.n_lora_decay;
            const int n_lora_iclr = hparams.n_lora_iclr;
            const int n_lora_value_res_mix = hparams.n_lora_value_res_mix;
            const int n_lora_gate = hparams.n_lora_gate;
            const int attn_hidden_size = n_embd;
            const int ffn_size = hparams.n_ff_arr[0];
            for (int i = 0; i < n_layer; ++i) {
                auto & layer = layers[i];
                layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
                layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
                layer.attn_norm_2 = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, 0);
                layer.attn_norm_2_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, 0);
                layer.time_mix_w0 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W0, "weight", i), {n_embd}, 0);
                layer.time_mix_w1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W1, "weight", i), {n_embd, n_lora_decay}, 0);
                layer.time_mix_w2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W2, "weight", i), {n_lora_decay, n_embd}, 0);
                layer.time_mix_a0 = create_tensor(tn(LLM_TENSOR_TIME_MIX_A0, "weight", i), {n_embd}, 0);
                layer.time_mix_a1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_A1, "weight", i), {n_embd, n_lora_iclr}, 0);
                layer.time_mix_a2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_A2, "weight", i), {n_lora_iclr, n_embd}, 0);
                if (i == 0) {
                    // actually not used
                    layer.time_mix_v0 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V0, "weight", i), {n_embd}, 0);
                    layer.time_mix_v1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V1, "weight", i), {n_embd, n_lora_iclr}, 0);
                    layer.time_mix_v2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V2, "weight", i), {n_lora_iclr, n_embd}, 0);
                } else {
                    layer.time_mix_v0 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V0, "weight", i), {n_embd}, 0);
                    layer.time_mix_v1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V1, "weight", i), {n_embd, n_lora_value_res_mix}, 0);
                    layer.time_mix_v2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V2, "weight", i), {n_lora_value_res_mix, n_embd}, 0);
                }
                layer.time_mix_g1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_G1, "weight", i), {n_embd, n_lora_gate}, 0);
                layer.time_mix_g2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_G2, "weight", i), {n_lora_gate, n_embd}, 0);
                layer.time_mix_lerp_fused = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_FUSED, "weight", i), {n_embd, 1, 1, 6}, 0);
                layer.time_mix_k_k = create_tensor(tn(LLM_TENSOR_TIME_MIX_K_K, "weight", i), {attn_hidden_size}, 0);
                layer.time_mix_k_a = create_tensor(tn(LLM_TENSOR_TIME_MIX_K_A, "weight", i), {attn_hidden_size}, 0);
                layer.time_mix_r_k = create_tensor(tn(LLM_TENSOR_TIME_MIX_R_K, "weight", i), {attn_hidden_size}, 0);
                layer.time_mix_key = create_tensor(tn(LLM_TENSOR_TIME_MIX_KEY, "weight", i), {attn_hidden_size, n_embd}, 0);
                layer.time_mix_value = create_tensor(tn(LLM_TENSOR_TIME_MIX_VALUE, "weight", i), {attn_hidden_size, n_embd}, 0);
                layer.time_mix_receptance = create_tensor(tn(LLM_TENSOR_TIME_MIX_RECEPTANCE, "weight", i), {attn_hidden_size, n_embd}, 0);
                layer.time_mix_ln = create_tensor(tn(LLM_TENSOR_TIME_MIX_LN, "weight", i), {n_embd}, 0);
                layer.time_mix_ln_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_LN, "bias", i), {n_embd}, 0);
                layer.time_mix_output = create_tensor(tn(LLM_TENSOR_TIME_MIX_OUTPUT, "weight", i), {n_embd, attn_hidden_size}, 0);
                layer.channel_mix_lerp_k = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_LERP_K, "weight", i), {n_embd, 1, 1}, 0);
                layer.channel_mix_key = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_KEY, "weight", i), {n_embd, ffn_size}, 0);
                layer.channel_mix_value = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_VALUE, "weight", i), {ffn_size, n_embd}, 0);
            }
        } break;
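    // ARWKV-7: RWKV-7 style time mixing paired with a gated FFN. The gate (g1/g2) and the
    // time-mix layer norm are optional; the fused lerp tensor is probed with 6 components
    // first and falls back to 5 when the gate lerp is missing.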
    case LLM_ARCH_ARWKV7:
        {
            tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
            // output
            output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
            output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
            const int n_lora_decay = hparams.n_lora_decay;
            const int n_lora_iclr = hparams.n_lora_iclr;
            const int n_lora_value_res_mix = hparams.n_lora_value_res_mix;
            const int n_lora_gate = hparams.n_lora_gate;
            const int attn_hidden_size = n_embd;
            for (int i = 0; i < n_layer; ++i) {
                auto & layer = layers[i];
                layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
                layer.time_mix_w0 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W0, "weight", i), {n_embd}, 0);
                layer.time_mix_w1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W1, "weight", i), {n_embd, n_lora_decay}, 0);
                layer.time_mix_w2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W2, "weight", i), {n_lora_decay, n_embd}, 0);
                layer.time_mix_a0 = create_tensor(tn(LLM_TENSOR_TIME_MIX_A0, "weight", i), {n_embd}, 0);
                layer.time_mix_a1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_A1, "weight", i), {n_embd, n_lora_iclr}, 0);
                layer.time_mix_a2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_A2, "weight", i), {n_lora_iclr, n_embd}, 0);
                if (i == 0) {
                    // actually not used
                    layer.time_mix_v0 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V0, "weight", i), {n_embd}, 0);
                    layer.time_mix_v1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V1, "weight", i), {n_embd, n_lora_iclr}, 0);
                    layer.time_mix_v2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V2, "weight", i), {n_lora_iclr, n_embd}, 0);
                } else {
                    layer.time_mix_v0 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V0, "weight", i), {n_embd}, 0);
                    layer.time_mix_v1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V1, "weight", i), {n_embd, n_lora_value_res_mix}, 0);
                    layer.time_mix_v2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V2, "weight", i), {n_lora_value_res_mix, n_embd}, 0);
                }
                layer.time_mix_g1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_G1, "weight", i), {n_embd, n_lora_gate}, TENSOR_NOT_REQUIRED);
                layer.time_mix_g2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_G2, "weight", i), {n_lora_gate, n_embd}, TENSOR_NOT_REQUIRED);
                try {
                    layer.time_mix_lerp_fused = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_FUSED, "weight", i), {n_embd, 1, 1, 6}, 0);
                } catch(std::runtime_error & e) {
                    // ARWKV models may not have gate tensors
                    layer.time_mix_lerp_fused = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_FUSED, "weight", i), {n_embd, 1, 1, 5}, 0);
                }
                layer.time_mix_k_k = create_tensor(tn(LLM_TENSOR_TIME_MIX_K_K, "weight", i), {attn_hidden_size}, 0);
                layer.time_mix_k_a = create_tensor(tn(LLM_TENSOR_TIME_MIX_K_A, "weight", i), {attn_hidden_size}, 0);
                layer.time_mix_r_k = create_tensor(tn(LLM_TENSOR_TIME_MIX_R_K, "weight", i), {attn_hidden_size}, 0);
                layer.time_mix_key = create_tensor(tn(LLM_TENSOR_TIME_MIX_KEY, "weight", i), {attn_hidden_size, n_embd}, 0);
                layer.time_mix_value = create_tensor(tn(LLM_TENSOR_TIME_MIX_VALUE, "weight", i), {attn_hidden_size, n_embd}, 0);
                layer.time_mix_receptance = create_tensor(tn(LLM_TENSOR_TIME_MIX_RECEPTANCE, "weight", i), {attn_hidden_size, n_embd}, 0);
                layer.time_mix_ln = create_tensor(tn(LLM_TENSOR_TIME_MIX_LN, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
                layer.time_mix_ln_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_LN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
                layer.time_mix_output = create_tensor(tn(LLM_TENSOR_TIME_MIX_OUTPUT, "weight", i), {n_embd, attn_hidden_size}, 0);
                layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
                layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
                layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
                layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
            }
        } break;
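    // Chameleon: LLaMA-like layout, except the Q/K norms are stored per attention head
    // ({n_embd_head_k, n_head} / {n_embd_head_k, n_head_kv}) with optional biases.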
    case LLM_ARCH_CHAMELEON:
        {
            tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
            // output
            output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
            output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
            // if output is NULL, init from the input tok embed
            if (output == NULL) {
                output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
            }
            for (int i = 0; i < n_layer; ++i) {
                auto & layer = layers[i];
                layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
                layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k, n_head}, 0);
                layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k, n_head_kv}, 0);
                layer.attn_q_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {n_embd_head_k, n_head}, TENSOR_NOT_REQUIRED);
                layer.attn_k_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {n_embd_head_k, n_head_kv}, TENSOR_NOT_REQUIRED);
                layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
                layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
                layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
                layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
                layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
                layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
                layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
                layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
            }
        } break;
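    // WavTokenizer decoder (audio): an input conv1d over the feature embeddings, a six-layer
    // "posnet" stack (resnet, resnet, attention, resnet, resnet, norm), then ConvNeXt blocks
    // (depthwise conv, norm, pointwise MLP, gamma) and a final linear output head with bias.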
    case LLM_ARCH_WAVTOKENIZER_DEC:
        {
            tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {hparams.n_embd_features, n_vocab}, 0);
            conv1d = create_tensor(tn(LLM_TENSOR_CONV1D, "weight"), {7, hparams.n_embd_features, hparams.posnet.n_embd}, 0);
            conv1d_b = create_tensor(tn(LLM_TENSOR_CONV1D, "bias"), {1, hparams.posnet.n_embd}, 0);
            // posnet
            {
                const int64_t n_embd = hparams.posnet.n_embd;
                for (uint32_t i = 0; i < hparams.posnet.n_layer; ++i) {
                    auto & layer = layers[i].posnet;
                    // posnet:
                    //
                    // - resnet
                    // - resnet
                    // - attn
                    // - resnet
                    // - resnet
                    // - norm
                    //
                    switch (i) {
                        case 0:
                        case 1:
                        case 3:
                        case 4:
                            {
                                layer.norm1 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "weight", i), {1, n_embd}, 0);
                                layer.norm1_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "bias", i), {1, n_embd}, 0);
                                layer.conv1 = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "weight", i), {3, n_embd, n_embd}, 0);
                                layer.conv1_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "bias", i), {1, n_embd}, 0);
                                layer.norm2 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "weight", i), {1, n_embd}, 0);
                                layer.norm2_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "bias", i), {1, n_embd}, 0);
                                layer.conv2 = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "weight", i), {3, n_embd, n_embd}, 0);
                                layer.conv2_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "bias", i), {1, n_embd}, 0);
                            } break;
                        case 2:
                            {
                                layer.attn_norm = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "weight", i), {1, n_embd}, 0);
                                layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "bias", i), {1, n_embd}, 0);
                                layer.attn_q = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_Q, "weight", i), {1, n_embd, n_embd}, 0);
                                layer.attn_q_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_Q, "bias", i), {1, n_embd}, 0);
                                layer.attn_k = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_K, "weight", i), {1, n_embd, n_embd}, 0);
                                layer.attn_k_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_K, "bias", i), {1, n_embd}, 0);
                                layer.attn_v = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_V, "weight", i), {1, n_embd, n_embd}, 0);
                                layer.attn_v_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_V, "bias", i), {1, n_embd}, 0);
                                layer.attn_o = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_OUT, "weight", i), {1, n_embd, n_embd}, 0);
                                layer.attn_o_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_OUT, "bias", i), {1, n_embd}, 0);
                            } break;
                        case 5:
                            {
                                layer.norm = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "weight", i), {1, n_embd}, 0);
                                layer.norm_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "bias", i), {1, n_embd}, 0);
                            } break;
                        default: GGML_ABORT("unknown posnet layer");
                    };
                }
            }
            GGML_ASSERT(hparams.posnet.n_embd == hparams.convnext.n_embd);
            tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {hparams.posnet.n_embd}, 0);
            tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {hparams.posnet.n_embd}, 0);
            // convnext
            {
                const int64_t n_embd = hparams.convnext.n_embd;
                for (uint32_t i = 0; i < hparams.convnext.n_layer; ++i) {
                    auto & layer = layers[i].convnext;
                    layer.dw = create_tensor(tn(LLM_TENSOR_CONVNEXT_DW, "weight", i), {7, 1, n_embd}, 0);
                    layer.dw_b = create_tensor(tn(LLM_TENSOR_CONVNEXT_DW, "bias", i), {1, n_embd}, 0);
                    layer.norm = create_tensor(tn(LLM_TENSOR_CONVNEXT_NORM, "weight", i), {n_embd}, 0);
                    layer.norm_b = create_tensor(tn(LLM_TENSOR_CONVNEXT_NORM, "bias", i), {n_embd}, 0);
                    layer.pw1 = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW1, "weight", i), {n_embd, n_ff}, 0);
                    layer.pw1_b = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW1, "bias", i), {n_ff}, 0);
                    layer.pw2 = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW2, "weight", i), {n_ff, n_embd}, 0);
                    layer.pw2_b = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW2, "bias", i), {n_embd}, 0);
                    layer.gamma = create_tensor(tn(LLM_TENSOR_CONVNEXT_GAMMA, "weight", i), {n_embd}, 0);
                }
                // output
                output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
                output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
            }
            output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {hparams.convnext.n_embd, n_embd}, 0);
            output_b = create_tensor(tn(LLM_TENSOR_OUTPUT, "bias"), {n_embd}, 0);
        } break;
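    // BailingMoE (Ling): every layer is MoE, with n_expert routed experts of width n_ff_exp
    // and a shared-expert branch of width n_ff_exp * n_expert_shared; head dimensions are
    // expressed through n_rot.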
    case LLM_ARCH_BAILINGMOE:
        {
            const int64_t n_ff_exp = hparams.n_ff_exp;
            const int64_t n_expert_shared = hparams.n_expert_shared;
            tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
            // output
            output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
            output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
            for (int i = 0; i < n_layer; ++i) {
                auto & layer = layers[i];
                layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
                layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_head * n_rot}, 0);
                layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_head_kv * n_rot}, 0);
                layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_head_kv * n_rot}, 0);
                layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_head * n_rot, n_embd}, 0);
                layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
                layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
                if (n_expert == 0) {
                    throw std::runtime_error("n_expert must be > 0");
                }
                if (n_expert_used == 0) {
                    throw std::runtime_error("n_expert_used must be > 0");
                }
                layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
                layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
                layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
                layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
                layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_exp * n_expert_shared, n_embd}, 0);
                layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
            }
        } break;
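    // BailingMoE2: fused QKV projection with per-head Q/K norms. The first n_layer_dense_lead
    // layers are dense, the rest MoE with an optional expert-probability bias and shared experts.
    // The trailing nextn_predict_layers carry NextN/MTP tensors that are created with TENSOR_SKIP
    // so they are preserved in the layout but not used for inference.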
    case LLM_ARCH_BAILINGMOE2:
        {
            const int64_t n_ff_exp = hparams.n_ff_exp;
            const int64_t n_expert_shared = hparams.n_expert_shared;
            tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
            // output
            output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
            output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
            GGML_ASSERT(n_expert > 0 && "n_expert must be > 0 for bailingmoe2");
            GGML_ASSERT(n_expert_used > 0 && "n_expert_used must be > 0 for bailingmoe2");
            for (int i = 0; i < n_layer; ++i) {
                int flags = 0;
                if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
                    // skip all tensors in the NextN layers
                    flags |= TENSOR_SKIP;
                }
                auto & layer = layers[i];
                layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, flags);
                layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, flags);
                layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, flags);
                layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, flags);
                layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, flags);
                layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, flags);
                if (static_cast<uint32_t>(i) >= hparams.n_layer_dense_lead) { // MoE layers
                    const int64_t n_ff_shexp = (hparams.n_ff_shexp ? hparams.n_ff_shexp : n_ff_exp) * n_expert_shared;
                    layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, flags);
                    layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, TENSOR_NOT_REQUIRED | flags);
                    layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, flags);
                    layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, flags);
                    layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, flags);
                    layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_shexp}, flags);
                    layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp, n_embd}, flags);
                    layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_shexp}, flags);
                } else { // Dense layers
                    layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, flags);
                    layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, flags);
                    layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, flags);
                }
                // NextN/MTP tensors (preserved but unused) - conditionally load for last nextn_predict_layers
                if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
                    layer.nextn.eh_proj = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "weight", i), { 2 * n_embd, n_embd }, flags);
                    layer.nextn.embed_tokens = create_tensor(tn(LLM_TENSOR_NEXTN_EMBED_TOKENS, "weight", i), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED | flags);
                    layer.nextn.enorm = create_tensor(tn(LLM_TENSOR_NEXTN_ENORM, "weight", i), { n_embd }, flags);
                    layer.nextn.hnorm = create_tensor(tn(LLM_TENSOR_NEXTN_HNORM, "weight", i), { n_embd }, flags);
                    layer.nextn.shared_head_head = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "weight", i), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED | flags);
                    layer.nextn.shared_head_norm = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "weight", i), { n_embd }, TENSOR_NOT_REQUIRED | flags);
                    layer.layer_out_norm = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd}, flags);
                }
            }
        } break;
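    // dots.llm1: per-head Q/K norms; dense FFN for the first n_layer_dense_lead layers, MoE
    // afterwards with routed experts of width n_ff_exp, an optional expert-probability bias
    // and a shared-expert branch of width n_ff_exp * n_expert_shared.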
    case LLM_ARCH_DOTS1:
        {
            const int64_t n_ff_exp = hparams.n_ff_exp;
            const int64_t n_expert_shared = hparams.n_expert_shared;
            tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
            output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
            output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
            for (int i = 0; i < n_layer; ++i) {
                auto & layer = layers[i];
                layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
                layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
                layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
                layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
                layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
                layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
                layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
                layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
                if (i < (int) hparams.n_layer_dense_lead) {
                    layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
                    layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
                    layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
                } else {
                    layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
                    layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, TENSOR_NOT_REQUIRED);
                    if (n_expert == 0) {
                        throw std::runtime_error("n_expert must be > 0");
                    }
                    if (n_expert_used == 0) {
                        throw std::runtime_error("n_expert_used must be > 0");
                    }
                    // MoE branch
                    layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
                    layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
                    layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
                    // Shared expert branch
                    layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
                    layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_exp * n_expert_shared, n_embd}, 0);
                    layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
                }
            }
        } break;
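    // Arcee (AFM): LLaMA-style attention, but the FFN loads no gate tensor, only ffn_up and
    // ffn_down (consistent with a gateless activation such as ReLU^2 in the reference model).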
    case LLM_ARCH_ARCEE:
        {
            tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
            // output
            output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
            output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
            // if output is NULL, init from the input tok embed
            if (output == NULL) {
                output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
            }
            for (int i = 0; i < n_layer; ++i) {
                auto & layer = layers[i];
                layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
                layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
                layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
                layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
                layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
                layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
                layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
                layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
                layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
            }
        } break;
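    // AFMoE: pre- and post-norms around both attention and FFN, per-head Q/K norms and an extra
    // attention gate projection (wqkv_gate). Layers past n_layer_dense_lead are MoE with a
    // required expert-probability bias and, when n_expert_shared > 0, a shared expert.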
    case LLM_ARCH_AFMOE:
        {
            tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
            // output
            output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
            output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
            // if output is NULL, init from the input tok embed
            if (output == NULL) {
                output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
            }
            const int64_t n_ff_exp = hparams.n_ff_exp;
            const int64_t n_expert_shared = hparams.n_expert_shared;
            for (int i = 0; i < n_layer; ++i) {
                auto & layer = layers[i];
                // dual attention normalization
                layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
                layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
                // attention projections
                layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
                layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
                layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
                layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
                // Q/K normalization
                layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
                layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
                // attention gating
                layer.wqkv_gate = create_tensor(tn(LLM_TENSOR_ATTN_GATE, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
                // dual ffn normalization
                layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
                layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
                if (static_cast<uint32_t>(i) >= hparams.n_layer_dense_lead) {
                    // MoE layers
                    layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
                    layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, 0);
                    // grouped expert weights
                    layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, 0);
                    layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
                    layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, 0);
                    // shared expert
                    if (n_expert_shared > 0) {
                        const int64_t n_ff_shexp = n_ff_exp * n_expert_shared;
                        layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_shexp}, 0);
                        layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp, n_embd}, 0);
                        layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_shexp}, 0);
                    }
                } else {
                    // Dense layers
                    layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
                    layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
                    layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
                }
            }
        } break;
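    // ERNIE 4.5 (dense and MoE variants): optional attention biases. For the MoE variant,
    // layers past n_layer_dense_lead use routed experts of width n_ff_exp plus an optional
    // shared expert of width hparams.n_ff_shexp.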
    case LLM_ARCH_ERNIE4_5:
    case LLM_ARCH_ERNIE4_5_MOE:
        {
            tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
            // output
            output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
            output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
            // if output is NULL, init from the input tok embed
            if (output == NULL) {
                output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
            }
            for (int i = 0; i < n_layer; ++i) {
                auto & layer = layers[i];
                layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
                layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
                layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
                layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
                layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
                // optional bias tensors
                layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
                layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
                layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
                layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
                layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
                if (arch == LLM_ARCH_ERNIE4_5_MOE && static_cast<uint32_t>(i) >= hparams.n_layer_dense_lead) { // MoE layers
                    int n_ff_exp = hparams.n_ff_exp;
                    layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
                    layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, TENSOR_NOT_REQUIRED);
                    layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, TENSOR_NOT_REQUIRED);
                    layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff_exp, n_embd, n_expert}, 0);
                    layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, 0);
                    // Shared expert (if present)
                    if (hparams.n_ff_shexp > 0) {
                        layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), { n_embd, hparams.n_ff_shexp}, 0);
                        layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {hparams.n_ff_shexp, n_embd }, 0);
                        layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), { n_embd, hparams.n_ff_shexp}, 0);
                    }
                } else { // Dense layers
                    layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
                    layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
                    layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
                }
            }
        } break;
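    // Falcon-H1: hybrid layers that each carry both a Mamba-2 style SSM path (ssm_in projection,
    // depthwise conv1d, dt bias, per-head A/D, optional group norm, ssm_out) and an attention
    // path with optional biases, plus a gated FFN with optional biases.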
    case LLM_ARCH_FALCON_H1:
        {
            // Common
            const int64_t hidden_size = hparams.n_embd; // hidden_size
            // mamba2 Mixer SSM params
            const int64_t ssm_conv_kernel_size = hparams.ssm_d_conv; // ssm_conv_kernel_size
            const int64_t ssm_n_groups = hparams.ssm_n_group; // ssm_n_groups
            const int64_t ssm_state_size = hparams.ssm_d_state; // ssm_state_size
            const int64_t ssm_intermediate_size = hparams.ssm_d_inner; // TODO expand
            const int64_t ssm_num_heads = hparams.ssm_dt_rank; // ssm_num_heads
            const int64_t ssm_conv_dim = ssm_intermediate_size + 2 * ssm_n_groups * ssm_state_size;
            const int64_t ssm_projection_size = ssm_intermediate_size + ssm_conv_dim + ssm_num_heads;
            // attn params
            const int64_t attn_num_attention_head = hparams.n_head(0); // rename to: attn_num_attention_head
            const int64_t attn_num_key_value_head = hparams.n_head_kv(0);
            // ffn params
            const int64_t ffn_intermediate_size = hparams.n_ff(0);
            // embeddings
            tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {hidden_size, n_vocab}, 0);
            // output
            output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {hidden_size, n_vocab}, TENSOR_NOT_REQUIRED);
            output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {hidden_size}, 0);
            // if output is NULL, init from the input tok embed
            if (output == NULL) {
                output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {hidden_size, n_vocab}, TENSOR_DUPLICATED);
            }
            for (int i = 0; i < n_layer; ++i) {
                auto & layer = layers[i];
                /*SSM LAYERS*/
                // ssm in
                layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), {hidden_size, ssm_projection_size}, 0);
                // ssm 1d conv
                layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {ssm_conv_kernel_size, ssm_conv_dim}, 0);
                layer.ssm_conv1d_b = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "bias", i), {ssm_conv_dim}, TENSOR_NOT_REQUIRED);
                // ssm_dt
                layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {ssm_num_heads}, 0);
                // no "weight" suffix for these
                layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {1, ssm_num_heads}, 0);
                layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {1, ssm_num_heads}, 0);
                // ssm_norm
                layer.ssm_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", i), {ssm_intermediate_size / ssm_n_groups, ssm_n_groups}, TENSOR_NOT_REQUIRED);
                // out_proj
                layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {ssm_intermediate_size, hidden_size}, 0);
                /*ATTENTION LAYERS*/
                // attention layers (with optional bias)
                layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {hidden_size, n_embd_head_k * attn_num_attention_head}, 0);
                layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {hidden_size, attn_num_key_value_head * n_embd_head_k}, 0);
                layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {hidden_size, attn_num_key_value_head * n_embd_head_v}, 0);
                layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * attn_num_attention_head, hidden_size}, 0);
                layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {hidden_size}, TENSOR_NOT_REQUIRED);
                layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {attn_num_key_value_head * n_embd_head_k}, TENSOR_NOT_REQUIRED);
                layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {attn_num_key_value_head * n_embd_head_v}, TENSOR_NOT_REQUIRED);
                layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {hidden_size}, TENSOR_NOT_REQUIRED);
                layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {hidden_size}, 0);
                // feed forward (w/ optional biases)
                layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, i), {hidden_size}, 0);
                layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
                layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {hidden_size, ffn_intermediate_size}, 0);
                layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { ffn_intermediate_size, hidden_size}, 0);
                layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {hidden_size, ffn_intermediate_size}, 0);
                layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {ffn_intermediate_size}, TENSOR_NOT_REQUIRED);
                layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {hidden_size}, TENSOR_NOT_REQUIRED);
                layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {ffn_intermediate_size}, TENSOR_NOT_REQUIRED);
            }
        } break;
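    // Hunyuan MoE: per-head Q/K norms; routed experts use width n_ff (no separate n_ff_exp)
    // and the shared-expert branch uses hparams.n_ff_shexp.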
    case LLM_ARCH_HUNYUAN_MOE:
        {
            tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
            // output
            output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
            output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
            // if output is NULL, init from the input tok embed
            if (output == NULL) {
                output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
            }
            for (int i = 0; i < n_layer; ++i) {
                auto & layer = layers[i];
                layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
                layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
                layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
                layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
                layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
                layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
                layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
                layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
                layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
                layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
                layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert}, 0);
                layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
                layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, 0);
                layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, 0);
                layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {hparams.n_ff_shexp, n_embd}, 0);
            }
        } break;
    case LLM_ARCH_HUNYUAN_DENSE:
        {
            tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
            // output
            output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
            output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
            // if output is NULL, init from the input tok embed
            if (output == NULL) {
                output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
            }
            for (int i = 0; i < n_layer; ++i) {
                auto & layer = layers[i];
                layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
                layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
                layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
                layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
                layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
                layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
                layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
                layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
                layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
                layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
                layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
            }
        } break;
    case LLM_ARCH_SMOLLM3:
        {
            tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
            // output
            output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
            output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
            // if output is NULL, init from the input tok embed
            if (output == NULL) {
                output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
            }
            for (int i = 0; i < n_layer; ++i) {
                auto & layer = layers[i];
                layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
                layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
                layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
                layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
                layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
                layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
                layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
                layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
                layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
            }
        } break;
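    // OpenAI MoE (gpt-oss): attention sink values (one per head), biases on all attention
    // projections, and per-expert biases for the gate/up/down expert weights; expert FFN
    // width is n_ff_exp.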
    case LLM_ARCH_OPENAI_MOE:
        {
            const int64_t n_ff_exp = hparams.n_ff_exp;
            tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
            // output
            output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
            output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
            for (int i = 0; i < n_layer; ++i) {
                auto & layer = layers[i];
                layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
                layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
                layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_head * n_rot}, 0);
                layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_head_kv * n_rot}, 0);
                layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_head_kv * n_rot}, 0);
                layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_head * n_rot, n_embd}, 0);
                layer.attn_sinks = create_tensor(tn(LLM_TENSOR_ATTN_SINKS, "weight", i), {n_head}, 0);
                layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), { n_embd, n_expert}, 0);
                layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
                layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
                layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
                // bias
                layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_head * n_rot}, 0);
                layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_head_kv * n_rot}, 0);
                layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_head_kv * n_rot}, 0);
                layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
                layer.ffn_gate_inp_b = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "bias", i), {n_expert}, 0);
                layer.ffn_gate_exps_b = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "bias", i), {n_ff_exp, n_expert}, 0);
                layer.ffn_down_exps_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "bias", i), { n_embd, n_expert}, 0);
                layer.ffn_up_exps_b = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "bias", i), {n_ff_exp, n_expert}, 0);
            }
        } break;
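    // LFM2 / LFM2-MoE: hybrid stack where hparams.is_recurrent(i) selects between short
    // convolution layers (shortconv conv/in_proj/out_proj) and attention layers with Q/K
    // norms; the FFN is dense up to n_layer_dense_lead and MoE afterwards (LFM2-MoE).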
    case LLM_ARCH_LFM2:
    case LLM_ARCH_LFM2MOE:
        {
            tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
            output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
            output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
            if (output == NULL) {
                output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
            }
            for (int i = 0; i < n_layer; ++i) {
                auto & layer = layers[i];
                const bool is_moe_layer = i >= static_cast<int>(hparams.n_layer_dense_lead);
                // ffn/moe is same for transformer and conv layers
                layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
                if (is_moe_layer) {
                    GGML_ASSERT(n_expert && n_expert_used);
                    layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
                    layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, hparams.n_ff_exp, n_expert}, 0);
                    layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {hparams.n_ff_exp, n_embd, n_expert}, 0);
                    layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, hparams.n_ff_exp, n_expert}, 0);
                    layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, 0);
                } else { // dense
                    layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
                    layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
                    layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
                }
                // for operator_norm
                layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
                if (!hparams.is_recurrent(i)) {
                    layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
                    layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
                    GGML_ASSERT(n_embd_v_gqa == n_embd_k_gqa);
                    layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
                    layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, hparams.n_embd_k_gqa(i)}, 0);
                    layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, hparams.n_embd_v_gqa(i)}, 0);
                    layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
                } else {
                    layer.shortconv.conv = create_tensor(tn(LLM_TENSOR_SHORTCONV_CONV, "weight", i), {hparams.n_shortconv_l_cache, n_embd}, 0);
                    layer.shortconv.in_proj = create_tensor(tn(LLM_TENSOR_SHORTCONV_INPROJ, "weight", i), {n_embd, 3 * n_embd}, 0);
                    layer.shortconv.out_proj = create_tensor(tn(LLM_TENSOR_SHORTCONV_OUTPROJ, "weight", i), {n_embd, n_embd}, 0);
                }
            }
        } break;
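    // SmallThinker: every layer is MoE; routed experts only (no shared expert), width n_ff_exp.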
    case LLM_ARCH_SMALLTHINKER:
        {
            tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
            // output
            output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
            output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
            // if output is NULL, init from the input tok embed
            if (output == NULL) {
                output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
            }
            for (int i = 0; i < n_layer; ++i) {
                auto & layer = layers[i];
                layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);
                layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd_head_k * n_head }, 0);
                layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_gqa }, 0);
                layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_gqa }, 0);
                layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, 0);
                layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd }, 0);
                GGML_ASSERT(n_expert > 0 && "n_expert must be > 0 for SMALLTHINKER");
                GGML_ASSERT(n_expert_used > 0 && "n_expert_used must be > 0 for SMALLTHINKER");
                // MoE branch
                const int64_t n_ff_exp = hparams.n_ff_exp;
                layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), { n_embd, n_expert }, 0);
                layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, 0);
                layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff_exp, n_embd, n_expert }, 0);
                layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, 0);
            }
        } break;
  5252. case LLM_ARCH_GROVEMOE:
  5253. {
  5254. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  5255. // output
  5256. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  5257. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  5258. // if output is NULL, init from the input tok embed
  5259. if (output == NULL) {
  5260. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  5261. }
  5262. GGML_ASSERT(n_expert > 0 && "n_expert must be > 0 for GROVEMOE");
  5263. GGML_ASSERT(n_expert_used > 0 && "n_expert_used must be > 0 for GROVEMOE");
  5264. GGML_ASSERT(hparams.n_group_experts > 0 && "n_group_experts must be > 0 for GROVEMOE");
  5265. for (int i = 0; i < n_layer; ++i) {
  5266. auto & layer = layers[i];
  5267. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  5268. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
  5269. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  5270. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  5271. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
  5272. layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
  5273. layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
  5274. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  5275. layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
  5276. // MoE branch
  5277. const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
  5278. const int64_t n_ff_chexp = hparams.n_ff_chexp ? hparams.n_ff_chexp : n_embd_head_k;
  5279. const int64_t n_chunk_expert = n_expert / hparams.n_group_experts;
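// note on the shapes below (an interpretation of this code, not of the model family):
// on top of the n_expert routed experts there is one smaller "chunk" expert per
// group of n_group_experts, so n_chunk_expert = n_expert / n_group_experts and the
// *_chexps tensors use their own FFN width n_ff_chexp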
  5280. layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
  5281. layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
  5282. layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
  5283. layer.ffn_gate_chexps = create_tensor(tn(LLM_TENSOR_FFN_GATE_CHEXPS, "weight", i), { n_embd, n_ff_chexp, n_chunk_expert}, 0);
  5284. layer.ffn_down_chexps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_CHEXPS, "weight", i), {n_ff_chexp, n_embd, n_chunk_expert}, 0);
  5285. layer.ffn_up_chexps = create_tensor(tn(LLM_TENSOR_FFN_UP_CHEXPS, "weight", i), { n_embd, n_ff_chexp, n_chunk_expert}, 0);
  5286. }
  5287. } break;
  5288. case LLM_ARCH_APERTUS:
  5289. {
  5290. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
  5291. // output
  5292. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
  5293. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab }, 0);
  5294. for (int i = 0; i < n_layer; ++i) {
  5295. auto & layer = layers[i];
  5296. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);
  5297. if (hparams.rope_scaling_type_train == LLAMA_ROPE_SCALING_TYPE_LONGROPE) {
  5298. layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), { n_rot/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
  5299. layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), { n_rot/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
  5300. } else {
  5301. layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), { n_rot/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
  5302. }
  5303. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd_head_k * n_head }, 0);
  5304. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_gqa }, 0);
  5305. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_gqa }, 0);
  5306. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, 0);
  5307. // optional bias tensors
  5308. layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), { n_embd }, TENSOR_NOT_REQUIRED);
  5309. layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), { n_embd_gqa }, TENSOR_NOT_REQUIRED);
  5310. layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), { n_embd_gqa }, TENSOR_NOT_REQUIRED);
  5311. layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), { n_embd }, TENSOR_NOT_REQUIRED);
  5312. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd }, 0);
  5313. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd }, 0);
  5314. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, n_ff }, 0);
  5315. // Q and K layernorms for Apertus
  5316. layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), { n_embd_head_k }, 0);
  5317. layer.attn_q_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), { n_embd_head_k }, TENSOR_NOT_REQUIRED);
  5318. layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), { n_embd_head_k }, 0);
  5319. layer.attn_k_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), { n_embd_head_k }, TENSOR_NOT_REQUIRED);
  5320. }
  5321. } break;
  5322. case LLM_ARCH_MINIMAX_M2:
  5323. {
  5324. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  5325. // output
  5326. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  5327. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  5328. for (int i = 0; i < n_layer; ++i) {
  5329. auto & layer = layers[i];
  5330. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd_head_k * n_head }, 0);
  5331. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_gqa }, 0);
  5332. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_gqa }, 0);
  5333. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, 0);
  5334. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  5335. layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k * n_head}, 0);
  5336. layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_k_gqa}, 0);
  5337. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  5338. layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
  5339. layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
  5340. layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff, n_embd, n_expert}, 0);
  5341. layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
  5342. layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, 0);
  5343. }
  5344. } break;
  5345. case LLM_ARCH_COGVLM:
  5346. {
  5347. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  5348. // output
  5349. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  5350. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  5351. // if output is NULL, init from the input tok embed
  5352. if (output == NULL) {
  5353. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  5354. }
  5355. for (int i = 0; i < n_layer; ++i) {
  5356. auto & layer = layers[i];
  5357. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  5358. layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd_head_k * n_head * 3}, 0);
  5359. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
  5360. layer.visexp_attn_wqkv = create_tensor(tn(LLM_TENSOR_VISEXP_ATTN_QKV, "weight", i), {n_embd, n_embd_head_k * n_head * 3}, 0);
  5361. layer.visexp_attn_wo = create_tensor(tn(LLM_TENSOR_VISEXP_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
  5362. layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
  5363. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  5364. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  5365. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  5366. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  5367. layer.visexp_ffn_gate = create_tensor(tn(LLM_TENSOR_VISEXP_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  5368. layer.visexp_ffn_down = create_tensor(tn(LLM_TENSOR_VISEXP_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  5369. layer.visexp_ffn_up = create_tensor(tn(LLM_TENSOR_VISEXP_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  5370. }
  5371. } break;
  5372. case LLM_ARCH_PANGU_EMBED:
  5373. {
  5374. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  5375. // output
  5376. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  5377. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  5378. // if output is NULL, init from the input tok embed
  5379. if (output == NULL) {
  5380. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  5381. }
  5382. for (int i = 0; i < n_layer; ++i) {
  5383. auto & layer = layers[i];
  5384. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  5385. // weight tensors
  5386. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
  5387. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  5388. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
  5389. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
  5390. // bias tensors
  5391. layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd_head_k * n_head}, 0);
  5392. layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, 0);
  5393. layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, 0);
  5394. layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
  5395. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  5396. if (hparams.rope_scaling_type_train == LLAMA_ROPE_SCALING_TYPE_LONGROPE) {
  5397. layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
  5398. layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
  5399. } else {
  5400. layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
  5401. }
  5402. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  5403. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  5404. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  5405. }
  5406. } break;
  5407. case LLM_ARCH_QWEN3NEXT:
  5408. {
  5409. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
  5410. // output
  5411. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
  5412. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED);
  5413. // if output is NULL, init from the input tok embed
  5414. if (output == NULL) {
  5415. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, TENSOR_DUPLICATED);
  5416. }
  5417. const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
  5418. // Calculate dimensions from hyperparameters
  5419. const int64_t head_k_dim = hparams.ssm_d_state;
  5420. const int64_t head_v_dim = hparams.ssm_d_state;
  5421. const int64_t n_k_heads = hparams.ssm_n_group;
  5422. const int64_t n_v_heads = hparams.ssm_dt_rank;
  5423. const int64_t key_dim = head_k_dim * n_k_heads;
  5424. const int64_t value_dim = head_v_dim * n_v_heads;
  5425. const int64_t conv_dim = key_dim * 2 + value_dim;
  5426. // Calculate projection sizes
  5427. const int64_t qkvz_dim = key_dim * 2 + value_dim * 2;
  5428. const int64_t ba_dim = n_v_heads * 2;
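// illustrative sizing with hypothetical hparams (not taken from any checkpoint):
// ssm_d_state = 128, ssm_n_group = 16, ssm_dt_rank = 32 gives
//   key_dim = 2048, value_dim = 4096, conv_dim = 2*2048 + 4096 = 8192,
//   qkvz_dim = 2*2048 + 2*4096 = 12288, ba_dim = 64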
  5429. for (int i = 0; i < n_layer; ++i) {
  5430. auto & layer = layers[i];
  5431. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);
  5432. layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), { n_embd }, 0);
  5433. if (!hparams.is_recurrent(i)) {
  5434. // Attention layers
  5435. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd_head_k * n_head * 2 }, 0);
  5436. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_k_gqa }, 0);
  5437. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_v_gqa }, 0);
  5438. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, 0);
  5439. // Q/K normalization for attention layers
  5440. layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), { n_embd_head_k }, 0);
  5441. layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), { n_embd_head_k }, 0);
  5442. } else {
5443. // linear attention (gated delta net) tensors
5444. // sized using the head/group dimensions derived above
  5445. layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), { n_embd, qkvz_dim }, 0);
  5446. layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), { hparams.ssm_d_conv, conv_dim }, 0);
  5447. layer.ssm_dt = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), { hparams.ssm_dt_rank }, 0);
  5448. layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A_NOSCAN, i), { hparams.ssm_dt_rank }, 0);
  5449. layer.ssm_beta_alpha = create_tensor(tn(LLM_TENSOR_SSM_BETA_ALPHA, "weight", i), { n_embd, ba_dim }, 0);
  5450. layer.ssm_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", i), { head_v_dim }, 0);
  5451. layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), { value_dim, n_embd }, 0);
  5452. }
  5453. layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), { n_embd, n_expert }, 0);
  5454. layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, 0);
  5455. layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff_exp, n_embd, n_expert }, 0);
  5456. layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, 0);
  5457. // Shared experts
  5458. layer.ffn_gate_inp_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP_SHEXP, "weight", i), { n_embd }, 0);
  5459. layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), { n_embd, hparams.n_ff_shexp }, 0);
  5460. layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), { n_embd, hparams.n_ff_shexp }, 0);
  5461. layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { hparams.n_ff_shexp, n_embd }, 0);
  5462. }
  5463. } break;
  5464. default:
  5465. throw std::runtime_error("unknown architecture");
  5466. }
  5467. if (n_moved_tensors > 0) {
  5468. LLAMA_LOG_DEBUG("%s: tensor '%s' (%s) (and %d others) cannot be used with preferred buffer type %s, using %s instead\n",
  5469. __func__, first_moved_tensor->name, ggml_type_name(first_moved_tensor->type), n_moved_tensors - 1,
  5470. ggml_backend_buft_name(first_moved_from_buft), ggml_backend_buft_name(first_moved_to_buft));
  5471. }
  5472. }
  5473. ml.done_getting_tensors();
  5474. ml.init_mappings(true, use_mlock ? &pimpl->mlock_mmaps : nullptr);
  5475. pimpl->mappings.reserve(ml.mappings.size());
  5476. // create the backend buffers
  5477. std::vector<std::pair<ggml_context *, llama_buf_map>> ctx_buf_maps;
  5478. ctx_buf_maps.reserve(ctx_map.size());
5479. // ensure there is enough capacity for the maximum number of backend buffers we might create
  5480. const size_t n_max_backend_buffer = ctx_map.size() * ml.files.size();
  5481. pimpl->ctxs_bufs.reserve(n_max_backend_buffer);
  5482. for (auto & [buft, ctx_ptr] : ctx_map) {
  5483. ggml_context * ctx = ctx_ptr.get();
  5484. // skip contexts without tensors
  5485. if (ggml_get_first_tensor(ctx) == nullptr) {
  5486. continue;
  5487. }
  5488. llama_buf_map buf_map;
  5489. buf_map.reserve(n_max_backend_buffer);
  5490. // check if it is possible to use buffer_from_host_ptr with this buffer type
  5491. ggml_backend_dev_t dev = ggml_backend_buft_get_device(buft);
  5492. if (!dev) {
  5493. // FIXME: workaround for CPU backend buft having a NULL device
  5494. dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
  5495. if (!dev) {
  5496. throw std::runtime_error(format("%s: no CPU backend found", __func__));
  5497. }
  5498. }
  5499. ggml_backend_dev_props props;
  5500. ggml_backend_dev_get_props(dev, &props);
  5501. bool buffer_from_host_ptr_supported = props.caps.buffer_from_host_ptr;
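// buffer_from_host_ptr means the backend can wrap an existing host allocation
// (here: the mmap-ed file region) as a backend buffer without copying the data;
// the mmap path below uses it only when the device's default buffer type was requested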
  5502. bool is_default_buft = buft == ggml_backend_dev_buffer_type(dev);
  5503. std::vector<ggml_backend_buffer_ptr> bufs;
  5504. if (ml.use_mmap && use_mmap_buffer && buffer_from_host_ptr_supported && is_default_buft) {
  5505. GGML_ASSERT(!ml.no_alloc);
  5506. for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
  5507. // only the mmap region containing the tensors in the model is mapped to the backend buffer
  5508. // this is important for metal with apple silicon: if the entire model could be mapped to a metal buffer,
  5509. // then we could just use metal for all layers
  5510. // this allows using partial offloading when the model size exceeds the metal buffer size, but not the RAM size
  5511. void * addr = nullptr;
  5512. size_t first, last; // NOLINT
  5513. ml.get_mapping_range(&first, &last, &addr, idx, ctx);
  5514. if (first >= last) {
  5515. continue;
  5516. }
  5517. const size_t max_size = ggml_get_max_tensor_size(ctx);
  5518. ggml_backend_buffer_t buf = ggml_backend_dev_buffer_from_host_ptr(dev, (char *) addr + first, last - first, max_size);
  5519. if (buf == nullptr) {
  5520. throw std::runtime_error(format("unable to allocate %s buffer", ggml_backend_buft_name(buft)));
  5521. }
  5522. bufs.emplace_back(buf);
  5523. buf_map.emplace(idx, buf);
  5524. }
  5525. } else {
  5526. ggml_backend_buffer_t buf;
  5527. if (ml.no_alloc) {
  5528. buf = ggml_backend_buft_alloc_buffer(buft, /*size =*/ 0); // dummy buffer
  5529. for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) {
  5530. t->buffer = buf; // set dummy buffer for weights so that the backend scheduler won't try to allocate them
  5531. }
  5532. } else {
  5533. buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft); // real buffer
  5534. }
  5535. if (buf == nullptr) {
  5536. throw std::runtime_error(format("unable to allocate %s buffer", ggml_backend_buft_name(buft)));
  5537. }
  5538. if (use_mlock && ggml_backend_buffer_is_host(buf)) {
  5539. pimpl->mlock_bufs.emplace_back(new llama_mlock);
  5540. auto & mlock_buf = pimpl->mlock_bufs.back();
  5541. mlock_buf->init (ggml_backend_buffer_get_base(buf));
  5542. mlock_buf->grow_to(ggml_backend_buffer_get_size(buf));
  5543. }
  5544. bufs.emplace_back(buf);
  5545. for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
  5546. buf_map.emplace(idx, buf);
  5547. }
  5548. }
  5549. pimpl->ctxs_bufs.emplace_back(std::move(ctx_ptr), std::move(bufs));
  5550. for (auto & buf : buf_map) {
  5551. // indicate that this buffer contains weights
  5552. // this is used by ggml_backend_sched to improve op scheduling: ops that use a weight are preferably scheduled to the backend that contains the weight
  5553. ggml_backend_buffer_set_usage(buf.second, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
  5554. }
  5555. ctx_buf_maps.emplace_back(ctx, buf_map);
  5556. }
  5557. if (llama_supports_gpu_offload()) {
  5558. const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
  5559. LLAMA_LOG_INFO("%s: offloading %d repeating layers to GPU\n", __func__, n_gpu);
  5560. if (n_gpu_layers > (int) hparams.n_layer) {
  5561. LLAMA_LOG_INFO("%s: offloading output layer to GPU\n", __func__);
  5562. }
  5563. const int max_backend_supported_layers = hparams.n_layer + 1;
  5564. const int max_offloadable_layers = hparams.n_layer + 1;
  5565. LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n", __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers);
  5566. }
  5567. // print memory requirements per buffer type
  5568. for (auto & [_, bufs] : pimpl->ctxs_bufs) {
  5569. for (auto & buf: bufs) {
  5570. LLAMA_LOG_INFO("%s: %12s model buffer size = %8.2f MiB\n",
  5571. __func__, ggml_backend_buffer_name(buf.get()), ggml_backend_buffer_get_size(buf.get()) / 1024.0 / 1024.0);
  5572. }
  5573. }
  5574. // populate tensors_by_name
  5575. for (auto & [ctx, _] : pimpl->ctxs_bufs) {
  5576. for (auto * cur = ggml_get_first_tensor(ctx.get()); cur != NULL; cur = ggml_get_next_tensor(ctx.get(), cur)) {
  5577. tensors_by_name.emplace_back(ggml_get_name(cur), cur);
  5578. }
  5579. }
  5580. if (ml.no_alloc) {
  5581. return true;
  5582. }
  5583. // load tensor data
  5584. for (auto & [ctx, buf_map] : ctx_buf_maps) {
  5585. if (!ml.load_all_data(ctx, buf_map, use_mlock ? &pimpl->mlock_mmaps : NULL, params.progress_callback, params.progress_callback_user_data)) {
  5586. return false;
  5587. }
  5588. }
  5589. if (use_mmap_buffer) {
  5590. for (auto & mapping : ml.mappings) {
  5591. pimpl->mappings.emplace_back(std::move(mapping));
  5592. }
  5593. }
  5594. return true;
  5595. }
  5596. std::string llama_model::arch_name() const {
  5597. return llm_arch_name(arch);
  5598. }
  5599. std::string llama_model::type_name() const {
  5600. return llm_type_name(type);
  5601. }
  5602. std::string llama_model::desc() const {
  5603. return pimpl->desc_str;
  5604. }
  5605. size_t llama_model::size() const {
  5606. return pimpl->n_bytes;
  5607. }
  5608. size_t llama_model::n_tensors() const {
  5609. return tensors_by_name.size();
  5610. }
  5611. size_t llama_model::n_devices() const {
  5612. return devices.size();
  5613. }
  5614. std::map<ggml_backend_buffer_type_t, size_t> llama_model::memory_breakdown() const {
  5615. std::map<ggml_backend_buffer_type_t, size_t> ret;
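// when no_alloc is set the buffers are dummies with no base pointer, so the size
// that *would* be needed is computed from the context; otherwise the sizes of the
// actually allocated buffers are summed per buffer type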
  5616. for (const auto & [ctx, bufs] : pimpl->ctxs_bufs) {
  5617. if (hparams.no_alloc) {
  5618. GGML_ASSERT(bufs.size() == 1);
  5619. ggml_backend_buffer_t buf = bufs[0].get();
  5620. GGML_ASSERT(ggml_backend_buffer_get_base(buf) == nullptr);
  5621. ggml_backend_buffer_type_t buft = ggml_backend_buffer_get_type(buf);
  5622. ret[buft] += ggml_backend_alloc_ctx_tensors_from_buft_size(ctx.get(), buft);
  5623. } else {
  5624. for (const auto & buf : bufs) {
  5625. // GGML_ASSERT(ggml_backend_buffer_get_base(buf.get()) != nullptr); // multi_buffer does not have a defined base
  5626. ret[ggml_backend_buffer_get_type(buf.get())] += ggml_backend_buffer_get_size(buf.get());
  5627. }
  5628. }
  5629. }
  5630. return ret;
  5631. }
  5632. uint64_t llama_model::n_elements() const {
  5633. return pimpl->n_elements;
  5634. }
  5635. void llama_model::print_info() const {
  5636. const std::string rope_scaling_type = llama_rope_scaling_type_name(hparams.rope_scaling_type_train);
  5637. auto print_f = [](const std::function<uint32_t(uint32_t)> & f, uint32_t n) {
  5638. bool is_var = false;
  5639. std::vector<uint32_t> v;
  5640. for (uint32_t i = 0; i < n; ++i) {
  5641. v.push_back(f(i));
  5642. if (v[i] != v[0]) {
  5643. is_var = true;
  5644. }
  5645. }
  5646. std::stringstream ss;
  5647. if (is_var) {
  5648. ss << "[";
  5649. for (uint32_t i = 0; i < n; ++i) {
  5650. ss << v[i];
  5651. if (i < n - 1) {
  5652. ss << ", ";
  5653. }
  5654. }
  5655. ss << "]";
  5656. } else {
  5657. ss << v[0];
  5658. }
  5659. return ss.str();
  5660. };
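// print_f renders a per-layer hyperparameter compactly: a single value when it is
// uniform across all layers (e.g. "32"), or the full per-layer list (e.g. "[32, 32, 16]")
// as soon as any layer differs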
  5661. // hparams
  5662. LLAMA_LOG_INFO("%s: arch = %s\n", __func__, arch_name().c_str());
  5663. LLAMA_LOG_INFO("%s: vocab_only = %d\n", __func__, hparams.vocab_only);
  5664. LLAMA_LOG_INFO("%s: no_alloc = %d\n", __func__, hparams.no_alloc);
  5665. if (!hparams.vocab_only) {
  5666. LLAMA_LOG_INFO("%s: n_ctx_train = %u\n", __func__, hparams.n_ctx_train);
  5667. LLAMA_LOG_INFO("%s: n_embd = %u\n", __func__, hparams.n_embd);
  5668. LLAMA_LOG_INFO("%s: n_embd_inp = %u\n", __func__, hparams.n_embd_inp());
  5669. LLAMA_LOG_INFO("%s: n_layer = %u\n", __func__, hparams.n_layer);
  5670. LLAMA_LOG_INFO("%s: n_head = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_head(il); }, hparams.n_layer).c_str());
  5671. LLAMA_LOG_INFO("%s: n_head_kv = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_head_kv(il); }, hparams.n_layer).c_str());
  5672. LLAMA_LOG_INFO("%s: n_rot = %u\n", __func__, hparams.n_rot);
  5673. LLAMA_LOG_INFO("%s: n_swa = %u\n", __func__, hparams.n_swa);
  5674. LLAMA_LOG_INFO("%s: is_swa_any = %u\n", __func__, hparams.is_swa_any());
  5675. LLAMA_LOG_INFO("%s: n_embd_head_k = %u\n", __func__, hparams.n_embd_head_k);
  5676. LLAMA_LOG_INFO("%s: n_embd_head_v = %u\n", __func__, hparams.n_embd_head_v);
  5677. LLAMA_LOG_INFO("%s: n_gqa = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_gqa(il); }, hparams.n_layer).c_str());
  5678. LLAMA_LOG_INFO("%s: n_embd_k_gqa = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_embd_k_gqa(il); }, hparams.n_layer).c_str());
  5679. LLAMA_LOG_INFO("%s: n_embd_v_gqa = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_embd_v_gqa(il); }, hparams.n_layer).c_str());
  5680. LLAMA_LOG_INFO("%s: f_norm_eps = %.1e\n", __func__, hparams.f_norm_eps);
  5681. LLAMA_LOG_INFO("%s: f_norm_rms_eps = %.1e\n", __func__, hparams.f_norm_rms_eps);
  5682. LLAMA_LOG_INFO("%s: f_clamp_kqv = %.1e\n", __func__, hparams.f_clamp_kqv);
  5683. LLAMA_LOG_INFO("%s: f_max_alibi_bias = %.1e\n", __func__, hparams.f_max_alibi_bias);
  5684. LLAMA_LOG_INFO("%s: f_logit_scale = %.1e\n", __func__, hparams.f_logit_scale);
  5685. LLAMA_LOG_INFO("%s: f_attn_scale = %.1e\n", __func__, hparams.f_attention_scale);
  5686. LLAMA_LOG_INFO("%s: n_ff = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_ff(il); }, hparams.n_layer).c_str());
  5687. LLAMA_LOG_INFO("%s: n_expert = %u\n", __func__, hparams.n_expert);
  5688. LLAMA_LOG_INFO("%s: n_expert_used = %u\n", __func__, hparams.n_expert_used);
  5689. LLAMA_LOG_INFO("%s: n_expert_groups = %d\n", __func__, hparams.n_expert_groups);
  5690. LLAMA_LOG_INFO("%s: n_group_used = %d\n", __func__, hparams.n_group_used);
  5691. LLAMA_LOG_INFO("%s: causal attn = %d\n", __func__, hparams.causal_attn);
  5692. LLAMA_LOG_INFO("%s: pooling type = %d\n", __func__, hparams.pooling_type);
  5693. LLAMA_LOG_INFO("%s: rope type = %d\n", __func__, hparams.rope_type);
  5694. LLAMA_LOG_INFO("%s: rope scaling = %s\n", __func__, rope_scaling_type.c_str());
  5695. LLAMA_LOG_INFO("%s: freq_base_train = %.1f\n", __func__, hparams.rope_freq_base_train);
  5696. LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train);
  5697. LLAMA_LOG_INFO("%s: n_ctx_orig_yarn = %u\n", __func__, hparams.n_ctx_orig_yarn);
  5698. LLAMA_LOG_INFO("%s: rope_yarn_log_mul= %.4f\n", __func__, hparams.rope_yarn_log_mul);
  5699. LLAMA_LOG_INFO("%s: rope_finetuned = %s\n", __func__, hparams.rope_finetuned ? "yes" : "unknown");
5700. // MRoPE (Multimodal Rotary Position Embedding) sections
  5701. if (const auto & s = hparams.rope_sections; s[0] || s[1] || s[2] || s[3]) {
  5702. LLAMA_LOG_INFO("%s: mrope sections = [%d, %d, %d, %d]\n", __func__, s[0], s[1], s[2], s[3]);
  5703. }
  5704. if (!classifier_labels.empty()) {
  5705. LLAMA_LOG_INFO("%s: n_cls_out = %u\n", __func__, hparams.n_cls_out);
  5706. size_t i = 0;
  5707. for (auto label : classifier_labels) {
  5708. LLAMA_LOG_INFO("%s: cls_label[%2zu] = %s\n", __func__, i++, label.c_str());
  5709. }
  5710. }
  5711. }
  5712. if (arch == LLM_ARCH_MAMBA ||
  5713. arch == LLM_ARCH_MAMBA2 ||
  5714. arch == LLM_ARCH_JAMBA ||
  5715. arch == LLM_ARCH_FALCON_H1 ||
  5716. arch == LLM_ARCH_PLAMO2 ||
  5717. arch == LLM_ARCH_GRANITE_HYBRID ||
  5718. arch == LLM_ARCH_QWEN3NEXT ||
  5719. arch == LLM_ARCH_NEMOTRON_H) {
  5720. LLAMA_LOG_INFO("%s: ssm_d_conv = %u\n", __func__, hparams.ssm_d_conv);
  5721. LLAMA_LOG_INFO("%s: ssm_d_inner = %u\n", __func__, hparams.ssm_d_inner);
  5722. LLAMA_LOG_INFO("%s: ssm_d_state = %u\n", __func__, hparams.ssm_d_state);
  5723. LLAMA_LOG_INFO("%s: ssm_dt_rank = %u\n", __func__, hparams.ssm_dt_rank);
  5724. LLAMA_LOG_INFO("%s: ssm_n_group = %u\n", __func__, hparams.ssm_n_group);
  5725. LLAMA_LOG_INFO("%s: ssm_dt_b_c_rms = %d\n", __func__, hparams.ssm_dt_b_c_rms);
  5726. }
  5727. LLAMA_LOG_INFO("%s: model type = %s\n", __func__, type_name().c_str());
  5728. if (pimpl->n_elements >= 1e12) {
  5729. LLAMA_LOG_INFO("%s: model params = %.2f T\n", __func__, pimpl->n_elements*1e-12);
  5730. } else if (pimpl->n_elements >= 1e9) {
  5731. LLAMA_LOG_INFO("%s: model params = %.2f B\n", __func__, pimpl->n_elements*1e-9);
  5732. } else if (pimpl->n_elements >= 1e6) {
  5733. LLAMA_LOG_INFO("%s: model params = %.2f M\n", __func__, pimpl->n_elements*1e-6);
  5734. } else {
  5735. LLAMA_LOG_INFO("%s: model params = %.2f K\n", __func__, pimpl->n_elements*1e-3);
  5736. }
  5737. // general kv
  5738. LLAMA_LOG_INFO("%s: general.name = %s\n", __func__, name.c_str());
  5739. if (arch == LLM_ARCH_DEEPSEEK) {
  5740. LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
  5741. LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
  5742. LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
  5743. LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
  5744. }
  5745. if (arch == LLM_ARCH_DEEPSEEK2) {
  5746. LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
  5747. LLAMA_LOG_INFO("%s: n_lora_q = %d\n", __func__, hparams.n_lora_q);
  5748. LLAMA_LOG_INFO("%s: n_lora_kv = %d\n", __func__, hparams.n_lora_kv);
  5749. LLAMA_LOG_INFO("%s: n_embd_head_k_mla = %d\n", __func__, hparams.n_embd_head_k_mla);
  5750. LLAMA_LOG_INFO("%s: n_embd_head_v_mla = %d\n", __func__, hparams.n_embd_head_v_mla);
  5751. LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
  5752. LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
  5753. LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
  5754. LLAMA_LOG_INFO("%s: expert_weights_norm = %d\n", __func__, hparams.expert_weights_norm);
  5755. LLAMA_LOG_INFO("%s: expert_gating_func = %s\n", __func__, llama_expert_gating_func_name((llama_expert_gating_func_type) hparams.expert_gating_func));
  5756. }
  5757. if (arch == LLM_ARCH_QWEN2MOE) {
  5758. LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
  5759. LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
  5760. }
  5761. if (arch == LLM_ARCH_QWEN3MOE || arch == LLM_ARCH_OPENAI_MOE || arch == LLM_ARCH_QWEN3VLMOE || arch == LLM_ARCH_RND1) {
  5762. LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
  5763. }
  5764. if (arch == LLM_ARCH_MINICPM ||
  5765. arch == LLM_ARCH_GRANITE ||
  5766. arch == LLM_ARCH_GRANITE_MOE ||
  5767. arch == LLM_ARCH_GRANITE_HYBRID) {
  5768. LLAMA_LOG_INFO("%s: f_embedding_scale = %f\n", __func__, hparams.f_embedding_scale);
  5769. LLAMA_LOG_INFO("%s: f_residual_scale = %f\n", __func__, hparams.f_residual_scale);
  5770. LLAMA_LOG_INFO("%s: f_attention_scale = %f\n", __func__, hparams.f_attention_scale);
  5771. LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
  5772. }
  5773. if (arch == LLM_ARCH_BAILINGMOE) {
  5774. LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
  5775. LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
  5776. LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
  5777. LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
  5778. LLAMA_LOG_INFO("%s: expert_weights_norm = %d\n", __func__, hparams.expert_weights_norm);
  5779. }
  5780. if (arch == LLM_ARCH_BAILINGMOE2) {
  5781. LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
  5782. LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
  5783. LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
  5784. LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
  5785. LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
  5786. LLAMA_LOG_INFO("%s: expert_weights_norm = %d\n", __func__, hparams.expert_weights_norm);
  5787. LLAMA_LOG_INFO("%s: expert_gating_func = %s\n", __func__, llama_expert_gating_func_name((llama_expert_gating_func_type) hparams.expert_gating_func));
  5788. LLAMA_LOG_INFO("%s: nextn_predict_layers = %d\n", __func__, hparams.nextn_predict_layers);
  5789. }
  5790. if (arch == LLM_ARCH_SMALLTHINKER || arch == LLM_ARCH_LFM2MOE) {
  5791. LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
  5792. LLAMA_LOG_INFO("%s: expert_gating_func = %s\n", __func__, llama_expert_gating_func_name((llama_expert_gating_func_type) hparams.expert_gating_func));
  5793. }
  5794. if (arch == LLM_ARCH_GROVEMOE) {
  5795. LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
  5796. LLAMA_LOG_INFO("%s: n_ff_chexp = %d\n", __func__, hparams.n_ff_chexp);
  5797. LLAMA_LOG_INFO("%s: n_group_experts = %d\n", __func__, hparams.n_group_experts);
  5798. LLAMA_LOG_INFO("%s: expert_group_scale = %.2f\n", __func__, hparams.expert_group_scale);
  5799. }
  5800. vocab.print_info();
  5801. }
  5802. ggml_backend_dev_t llama_model::dev_layer(int il) const {
  5803. return pimpl->dev_layer.at(il).dev;
  5804. }
  5805. ggml_backend_dev_t llama_model::dev_output() const {
  5806. return pimpl->dev_output.dev;
  5807. }
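// capability probe used by select_buft below: build a throwaway op in a no-alloc
// context, point its sources at a zero-sized buffer of the candidate type, and ask
// the device whether it supports the op — no real allocation takes place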
  5808. template<typename F>
  5809. static bool buft_supported(ggml_backend_buffer_type_t buft, ggml_backend_dev_t dev, F & fn) {
  5810. ggml_init_params params = {
  5811. /*.mem_size =*/ ggml_tensor_overhead()*8,
  5812. /*.mem_buffer =*/ NULL,
  5813. /*.no_alloc =*/ true,
  5814. };
  5815. ggml_context_ptr ctx { ggml_init(params) };
  5816. if (!ctx) {
  5817. throw std::runtime_error(format("failed to create ggml context"));
  5818. }
  5819. ggml_backend_buffer_ptr buf { ggml_backend_buft_alloc_buffer(buft, 0) };
  5820. ggml_tensor * op_tensor = fn(ctx.get());
  5821. for (int i = 0; i < GGML_MAX_SRC; i++) {
  5822. if (op_tensor->src[i] != nullptr) {
  5823. assert(op_tensor->src[i]->buffer == nullptr);
  5824. op_tensor->src[i]->buffer = buf.get();
  5825. }
  5826. }
  5827. bool op_supported = ggml_backend_dev_supports_op(dev, op_tensor);
  5828. return op_supported;
  5829. }
  5830. template<typename F>
  5831. static ggml_backend_buffer_type_t select_buft(const buft_list_t & buft_list, const F & fn) {
  5832. for (const auto & cur : buft_list) {
  5833. ggml_backend_dev_t cur_dev = cur.first;
  5834. ggml_backend_buffer_type_t cur_buft = cur.second;
  5835. if (buft_supported(cur_buft, cur_dev, fn)) {
  5836. return cur_buft;
  5837. }
  5838. }
  5839. throw std::runtime_error(format("no suitable buffer type found"));
  5840. }
  5841. ggml_backend_buffer_type_t llama_model::select_buft(int il) const {
  5842. return ::select_buft(
  5843. *pimpl->dev_layer.at(il).buft_list,
  5844. [&](ggml_context * ctx) {
  5845. ggml_tensor * cur = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hparams.n_embd);
  5846. ggml_tensor * layer_dir = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hparams.n_embd);
  5847. return ggml_add(ctx, cur, layer_dir);
  5848. });
  5849. }
  5850. bool llama_model::has_tensor_overrides() const {
  5851. return pimpl->has_tensor_overrides;
  5852. }
  5853. const ggml_tensor * llama_model::get_tensor(const char * name) const {
  5854. auto it = std::find_if(tensors_by_name.begin(), tensors_by_name.end(),
  5855. [name](const std::pair<std::string, ggml_tensor *> & it) {
  5856. return it.first == name;
  5857. });
  5858. if (it == tensors_by_name.end()) {
  5859. return nullptr;
  5860. }
  5861. return it->second;
  5862. }
  5863. float llama_model::get_rope_freq_base (const llama_cparams & cparams, int il) const {
  5864. return hparams.is_swa(il) ? hparams.rope_freq_base_train_swa : cparams.rope_freq_base;
  5865. }
  5866. float llama_model::get_rope_freq_scale(const llama_cparams & cparams, int il) const {
  5867. return hparams.is_swa(il) ? hparams.rope_freq_scale_train_swa : cparams.rope_freq_scale;
  5868. }
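// note on the selection below: models with LongRoPE-style scaling ship separate
// long/short frequency-factor tensors; when rope_freqs is absent, the long factors
// are used once the per-sequence context exceeds the original YaRN training context,
// otherwise the short ones (this describes the code below, not the upstream models)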
  5869. ggml_tensor * llama_model::get_rope_factors(const llama_cparams & cparams, int il) const {
  5870. const uint32_t n_ctx_seq = cparams.n_ctx_seq;
  5871. // choose long/short freq factors based on the context size
  5872. if (layers[il].rope_freqs != nullptr) {
  5873. return layers[il].rope_freqs;
  5874. }
  5875. if (n_ctx_seq > hparams.n_ctx_orig_yarn) {
  5876. return layers[il].rope_long;
  5877. }
  5878. return layers[il].rope_short;
  5879. }
  5880. llama_memory_i * llama_model::create_memory(const llama_memory_params & params, const llama_cparams & cparams) const {
  5881. llama_memory_i * res;
  5882. switch (arch) {
  5883. // Models that need specific instantiation should be handled in the
  5884. // switch statement
  5885. case LLM_ARCH_BERT:
  5886. case LLM_ARCH_JINA_BERT_V2:
  5887. case LLM_ARCH_JINA_BERT_V3:
  5888. case LLM_ARCH_NOMIC_BERT:
  5889. case LLM_ARCH_NOMIC_BERT_MOE:
  5890. case LLM_ARCH_NEO_BERT:
  5891. case LLM_ARCH_WAVTOKENIZER_DEC:
  5892. case LLM_ARCH_GEMMA_EMBEDDING:
  5893. case LLM_ARCH_DREAM:
  5894. case LLM_ARCH_LLADA:
  5895. case LLM_ARCH_LLADA_MOE:
  5896. case LLM_ARCH_RND1:
  5897. {
  5898. res = nullptr;
  5899. } break;
  5900. // Models that need standard caching should rely on recurrent/hybrid
  5901. // checks
  5902. default:
  5903. {
  5904. if (llm_arch_is_recurrent(arch)) {
  5905. res = new llama_memory_recurrent(
  5906. *this,
  5907. GGML_TYPE_F32,
  5908. GGML_TYPE_F32,
  5909. cparams.offload_kqv,
  5910. std::max((uint32_t) 1, cparams.n_seq_max),
  5911. cparams.n_seq_max,
  5912. nullptr);
  5913. } else if (llm_arch_is_hybrid(arch)) {
  5914. // The main difference between hybrid architectures is the
  5915. // layer filters, so pick the right one here
  5916. llama_memory_hybrid::layer_filter_cb filter_attn = nullptr;
  5917. llama_memory_hybrid::layer_filter_cb filter_recr = nullptr;
  5918. if (arch == LLM_ARCH_FALCON_H1) {
  5919. filter_attn = [&](int32_t) { return true; };
  5920. filter_recr = [&](int32_t) { return true; };
  5921. } else if (arch == LLM_ARCH_NEMOTRON_H) {
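// reading of the filters below: Nemotron-H interleaves attention, recurrent and
// FFN-only blocks, and only layers without an FFN (n_ff(il) == 0) get a slot in the
// corresponding attention/recurrent cache — an interpretation of these checks, not
// a statement about the architecture itself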
  5922. filter_attn = [&](int32_t il) {
  5923. return !hparams.is_recurrent(il) && hparams.n_ff(il) == 0;
  5924. };
  5925. filter_recr = [&](int32_t il) {
  5926. return hparams.is_recurrent(il) && hparams.n_ff(il) == 0;
  5927. };
  5928. }
  5929. res = new llama_memory_hybrid(
  5930. /* model */ *this,
  5931. /* attn_type_k */ params.type_k,
  5932. /* attn_type_v */ params.type_v,
  5933. /* attn_v_trans */ !cparams.flash_attn,
  5934. /* attn_kv_size */ cparams.n_ctx,
  5935. /* attn_n_pad */ 1,
  5936. /* attn_n_swa */ hparams.n_swa,
  5937. /* attn_swa_type */ hparams.swa_type,
  5938. /* recurrent_type_k */ GGML_TYPE_F32,
  5939. /* recurrent_type_v */ GGML_TYPE_F32,
  5940. /* recurrent_kv_size */ std::max((uint32_t) 1, cparams.n_seq_max),
  5941. /* n_seq_max */ cparams.n_seq_max,
  5942. /* offload */ cparams.offload_kqv,
  5943. /* unified */ cparams.kv_unified,
  5944. /* filter_attn */ std::move(filter_attn),
  5945. /* filter_recr */ std::move(filter_recr));
  5946. } else {
  5947. llama_memory_i::layer_reuse_cb reuse = nullptr;
  5948. if (arch == LLM_ARCH_GEMMA3N) {
  5949. reuse = [&](int32_t il) {
  5950. if (il >= (int32_t) hparams.n_layer_kv_from_start) {
  5951. return (int32_t) hparams.n_layer_kv_from_start - (hparams.is_swa(il) ? 2 : 1);
  5952. }
  5953. return -1;
  5954. };
  5955. }
  5956. if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
  5957. GGML_ASSERT(hparams.is_swa_any());
  5958. res = new llama_kv_cache_iswa(
  5959. *this,
  5960. params.type_k,
  5961. params.type_v,
  5962. !cparams.flash_attn,
  5963. cparams.offload_kqv,
  5964. params.swa_full,
  5965. cparams.kv_unified,
  5966. cparams.n_ctx_seq,
  5967. cparams.n_seq_max,
  5968. cparams.n_ubatch,
  5969. 1,
  5970. nullptr,
  5971. reuse);
  5972. } else {
  5973. GGML_ASSERT(!hparams.is_swa_any());
  5974. res = new llama_kv_cache(
  5975. *this,
  5976. params.type_k,
  5977. params.type_v,
  5978. !cparams.flash_attn,
  5979. cparams.offload_kqv,
  5980. cparams.kv_unified,
  5981. cparams.n_ctx_seq,
  5982. cparams.n_seq_max,
  5983. 1,
  5984. hparams.n_swa,
  5985. hparams.swa_type,
  5986. nullptr,
  5987. nullptr);
  5988. }
  5989. }
  5990. }
  5991. }
  5992. return res;
  5993. }
  5994. ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
  5995. std::unique_ptr<llm_graph_context> llm;
  5996. switch (arch) {
  5997. case LLM_ARCH_LLAMA:
  5998. {
  5999. llm = std::make_unique<llm_build_llama>(*this, params);
  6000. } break;
  6001. case LLM_ARCH_LLAMA4:
  6002. {
  6003. if (hparams.swa_type == LLAMA_SWA_TYPE_NONE) {
  6004. llm = std::make_unique<llm_build_llama>(*this, params);
  6005. } else {
  6006. llm = std::make_unique<llm_build_llama_iswa>(*this, params);
  6007. }
  6008. } break;
  6009. case LLM_ARCH_DECI:
  6010. {
  6011. llm = std::make_unique<llm_build_deci>(*this, params);
  6012. } break;
  6013. case LLM_ARCH_BAICHUAN:
  6014. {
  6015. llm = std::make_unique<llm_build_baichuan>(*this, params);
  6016. } break;
  6017. case LLM_ARCH_FALCON:
  6018. {
  6019. llm = std::make_unique<llm_build_falcon>(*this, params);
  6020. } break;
  6021. case LLM_ARCH_GROK:
  6022. {
  6023. llm = std::make_unique<llm_build_grok>(*this, params);
  6024. } break;
  6025. case LLM_ARCH_STARCODER:
  6026. {
  6027. llm = std::make_unique<llm_build_starcoder>(*this, params);
  6028. } break;
  6029. case LLM_ARCH_REFACT:
  6030. {
  6031. llm = std::make_unique<llm_build_refact>(*this, params);
  6032. } break;
  6033. case LLM_ARCH_BERT:
  6034. case LLM_ARCH_JINA_BERT_V2:
  6035. case LLM_ARCH_JINA_BERT_V3:
  6036. case LLM_ARCH_NOMIC_BERT:
  6037. case LLM_ARCH_NOMIC_BERT_MOE:
  6038. {
  6039. llm = std::make_unique<llm_build_bert>(*this, params);
  6040. } break;
  6041. case LLM_ARCH_NEO_BERT:
  6042. {
  6043. llm = std::make_unique<llm_build_neo_bert>(*this, params);
  6044. } break;
  6045. case LLM_ARCH_BLOOM:
  6046. {
  6047. llm = std::make_unique<llm_build_bloom>(*this, params);
  6048. } break;
  6049. case LLM_ARCH_MPT:
  6050. {
  6051. llm = std::make_unique<llm_build_mpt>(*this, params);
  6052. } break;
  6053. case LLM_ARCH_STABLELM:
  6054. {
  6055. llm = std::make_unique<llm_build_stablelm>(*this, params);
  6056. } break;
  6057. case LLM_ARCH_QWEN:
  6058. {
  6059. llm = std::make_unique<llm_build_qwen>(*this, params);
  6060. } break;
  6061. case LLM_ARCH_QWEN2:
  6062. {
  6063. llm = std::make_unique<llm_build_qwen2>(*this, params);
  6064. } break;
  6065. case LLM_ARCH_DREAM:
  6066. {
  6067. llm = std::make_unique<llm_build_dream>(*this, params);
  6068. }
  6069. break;
  6070. case LLM_ARCH_LLADA:
  6071. {
  6072. llm = std::make_unique<llm_build_llada>(*this, params);
  6073. }
  6074. break;
  6075. case LLM_ARCH_LLADA_MOE:
  6076. {
  6077. llm = std::make_unique<llm_build_llada_moe>(*this, params);
  6078. }
  6079. break;
  6080. case LLM_ARCH_RND1:
  6081. {
  6082. llm = std::make_unique<llm_build_rnd1>(*this, params);
  6083. }
  6084. break;
  6085. case LLM_ARCH_QWEN2VL:
  6086. {
  6087. llm = std::make_unique<llm_build_qwen2vl>(*this, params);
  6088. } break;
  6089. case LLM_ARCH_QWEN2MOE:
  6090. {
  6091. llm = std::make_unique<llm_build_qwen2moe>(*this, params);
  6092. } break;
  6093. case LLM_ARCH_QWEN3:
  6094. {
  6095. llm = std::make_unique<llm_build_qwen3>(*this, params);
  6096. } break;
  6097. case LLM_ARCH_QWEN3MOE:
  6098. {
  6099. llm = std::make_unique<llm_build_qwen3moe>(*this, params);
  6100. } break;
  6101. case LLM_ARCH_QWEN3VL:
  6102. {
  6103. llm = std::make_unique<llm_build_qwen3vl>(*this, params);
  6104. } break;
  6105. case LLM_ARCH_QWEN3VLMOE:
  6106. {
  6107. llm = std::make_unique<llm_build_qwen3vlmoe>(*this, params);
  6108. } break;
  6109. case LLM_ARCH_PHI2:
  6110. {
  6111. llm = std::make_unique<llm_build_phi2>(*this, params);
  6112. } break;
  6113. case LLM_ARCH_PHI3:
  6114. case LLM_ARCH_PHIMOE:
  6115. {
  6116. if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
  6117. llm = std::make_unique<llm_build_phi3<true>> (*this, params);
  6118. } else {
  6119. llm = std::make_unique<llm_build_phi3<false>>(*this, params);
  6120. }
  6121. } break;
  6122. case LLM_ARCH_PLAMO:
  6123. {
  6124. llm = std::make_unique<llm_build_plamo>(*this, params);
  6125. } break;
  6126. case LLM_ARCH_PLAMO2:
  6127. {
  6128. llm = std::make_unique<llm_build_plamo2>(*this, params);
  6129. } break;
  6130. case LLM_ARCH_GPT2:
  6131. {
  6132. llm = std::make_unique<llm_build_gpt2>(*this, params);
  6133. } break;
  6134. case LLM_ARCH_CODESHELL:
  6135. {
  6136. llm = std::make_unique<llm_build_codeshell>(*this, params);
  6137. } break;
  6138. case LLM_ARCH_ORION:
  6139. {
  6140. llm = std::make_unique<llm_build_orion>(*this, params);
  6141. } break;
  6142. case LLM_ARCH_INTERNLM2:
  6143. {
  6144. llm = std::make_unique<llm_build_internlm2>(*this, params);
  6145. } break;
  6146. case LLM_ARCH_MINICPM3:
  6147. {
  6148. llm = std::make_unique<llm_build_minicpm3>(*this, params);
  6149. } break;
  6150. case LLM_ARCH_GEMMA:
  6151. {
  6152. llm = std::make_unique<llm_build_gemma>(*this, params);
  6153. } break;
  6154. case LLM_ARCH_GEMMA2:
  6155. {
  6156. llm = std::make_unique<llm_build_gemma2_iswa>(*this, params);
  6157. } break;
  6158. case LLM_ARCH_GEMMA3:
  6159. {
  6160. if (hparams.swa_type == LLAMA_SWA_TYPE_STANDARD) {
  6161. llm = std::make_unique<llm_build_gemma3<true>>(*this, params);
  6162. } else {
  6163. llm = std::make_unique<llm_build_gemma3<false>>(*this, params);
  6164. }
  6165. } break;
  6166. case LLM_ARCH_GEMMA3N:
  6167. {
  6168. llm = std::make_unique<llm_build_gemma3n_iswa>(*this, params);
  6169. } break;
  6170. case LLM_ARCH_GEMMA_EMBEDDING:
  6171. {
  6172. llm = std::make_unique<llm_build_gemma_embedding>(*this, params);
  6173. } break;
  6174. case LLM_ARCH_STARCODER2:
  6175. {
  6176. llm = std::make_unique<llm_build_starcoder2>(*this, params);
  6177. } break;
  6178. case LLM_ARCH_MAMBA:
  6179. case LLM_ARCH_MAMBA2:
  6180. {
  6181. llm = std::make_unique<llm_build_mamba>(*this, params);
  6182. } break;
  6183. case LLM_ARCH_JAMBA:
  6184. {
  6185. llm = std::make_unique<llm_build_jamba>(*this, params);
  6186. } break;
  6187. case LLM_ARCH_XVERSE:
  6188. {
  6189. llm = std::make_unique<llm_build_xverse>(*this, params);
  6190. } break;
  6191. case LLM_ARCH_COMMAND_R:
  6192. {
  6193. llm = std::make_unique<llm_build_command_r>(*this, params);
  6194. } break;
  6195. case LLM_ARCH_COHERE2:
  6196. {
  6197. llm = std::make_unique<llm_build_cohere2_iswa>(*this, params);
  6198. } break;
  6199. case LLM_ARCH_DBRX:
  6200. {
  6201. llm = std::make_unique<llm_build_dbrx>(*this, params);
  6202. } break;
  6203. case LLM_ARCH_OLMO:
  6204. {
  6205. llm = std::make_unique<llm_build_olmo>(*this, params);
  6206. } break;
  6207. case LLM_ARCH_OLMO2:
  6208. {
  6209. if (hparams.swa_type == LLAMA_SWA_TYPE_STANDARD) {
  6210. llm = std::make_unique<llm_build_olmo2<true>>(*this, params);
  6211. } else {
  6212. llm = std::make_unique<llm_build_olmo2<false>>(*this, params);
  6213. }
  6214. } break;
  6215. case LLM_ARCH_OLMOE:
  6216. {
  6217. llm = std::make_unique<llm_build_olmoe>(*this, params);
  6218. } break;
  6219. case LLM_ARCH_OPENELM:
  6220. {
  6221. llm = std::make_unique<llm_build_openelm>(*this, params);
  6222. } break;
  6223. case LLM_ARCH_GPTNEOX:
  6224. {
  6225. llm = std::make_unique<llm_build_gptneox>(*this, params);
  6226. } break;
  6227. case LLM_ARCH_ARCTIC:
  6228. {
  6229. llm = std::make_unique<llm_build_arctic>(*this, params);
  6230. } break;
  6231. case LLM_ARCH_DEEPSEEK:
  6232. {
  6233. llm = std::make_unique<llm_build_deepseek>(*this, params);
  6234. } break;
  6235. case LLM_ARCH_DEEPSEEK2:
  6236. {
  6237. llm = std::make_unique<llm_build_deepseek2>(*this, params);
  6238. } break;
  6239. case LLM_ARCH_CHATGLM:
  6240. {
  6241. llm = std::make_unique<llm_build_chatglm>(*this, params);
  6242. } break;
  6243. case LLM_ARCH_GLM4:
  6244. {
  6245. llm = std::make_unique<llm_build_glm4>(*this, params);
  6246. } break;
  6247. case LLM_ARCH_GLM4_MOE:
  6248. {
  6249. llm = std::make_unique<llm_build_glm4_moe>(*this, params);
  6250. } break;
  6251. case LLM_ARCH_BITNET:
  6252. {
  6253. llm = std::make_unique<llm_build_bitnet>(*this, params);
  6254. } break;
  6255. case LLM_ARCH_T5:
  6256. {
  6257. switch (params.gtype) {
  6258. case LLM_GRAPH_TYPE_ENCODER:
  6259. llm = std::make_unique<llm_build_t5_enc>(*this, params);
  6260. break;
  6261. case LLM_GRAPH_TYPE_DEFAULT:
  6262. case LLM_GRAPH_TYPE_DECODER:
  6263. llm = std::make_unique<llm_build_t5_dec>(*this, params);
  6264. break;
  6265. default:
  6266. GGML_ABORT("invalid graph type");
  6267. };
  6268. } break;
  6269. case LLM_ARCH_T5ENCODER:
  6270. {
  6271. llm = std::make_unique<llm_build_t5_enc>(*this, params);
  6272. }
  6273. break;
  6274. case LLM_ARCH_JAIS:
  6275. {
  6276. llm = std::make_unique<llm_build_jais>(*this, params);
  6277. } break;
  6278. case LLM_ARCH_NEMOTRON:
  6279. {
  6280. llm = std::make_unique<llm_build_nemotron>(*this, params);
  6281. } break;
  6282. case LLM_ARCH_NEMOTRON_H:
  6283. {
  6284. llm = std::make_unique<llm_build_nemotron_h>(*this, params);
  6285. } break;
  6286. case LLM_ARCH_EXAONE:
  6287. {
  6288. llm = std::make_unique<llm_build_exaone>(*this, params);
  6289. } break;
  6290. case LLM_ARCH_EXAONE4:
  6291. {
  6292. if (hparams.swa_type == LLAMA_SWA_TYPE_STANDARD) {
  6293. llm = std::make_unique<llm_build_exaone4<true>>(*this, params);
  6294. } else {
  6295. llm = std::make_unique<llm_build_exaone4<false>>(*this, params);
  6296. }
  6297. } break;
  6298. case LLM_ARCH_RWKV6:
  6299. {
  6300. llm = std::make_unique<llm_build_rwkv6>(*this, params);
  6301. } break;
  6302. case LLM_ARCH_RWKV6QWEN2:
  6303. {
  6304. llm = std::make_unique<llm_build_rwkv6qwen2>(*this, params);
  6305. } break;
  6306. case LLM_ARCH_RWKV7:
  6307. {
  6308. llm = std::make_unique<llm_build_rwkv7>(*this, params);
  6309. } break;
  6310. case LLM_ARCH_ARWKV7:
  6311. {
                llm = std::make_unique<llm_build_arwkv7>(*this, params);
            } break;
        case LLM_ARCH_GRANITE:
        case LLM_ARCH_GRANITE_MOE:
        case LLM_ARCH_MINICPM:
            {
                llm = std::make_unique<llm_build_granite>(*this, params);
            } break;
        case LLM_ARCH_GRANITE_HYBRID:
            {
                llm = std::make_unique<llm_build_granite_hybrid>(*this, params);
            } break;
        case LLM_ARCH_CHAMELEON:
            {
                llm = std::make_unique<llm_build_chameleon>(*this, params);
            } break;
        case LLM_ARCH_WAVTOKENIZER_DEC:
            {
                llm = std::make_unique<llm_build_wavtokenizer_dec>(*this, params);
            } break;
        case LLM_ARCH_PLM:
            {
                llm = std::make_unique<llm_build_plm>(*this, params);
            } break;
        case LLM_ARCH_BAILINGMOE:
            {
                llm = std::make_unique<llm_build_bailingmoe>(*this, params);
            } break;
        case LLM_ARCH_BAILINGMOE2:
            {
                llm = std::make_unique<llm_build_bailingmoe2>(*this, params);
            } break;
        case LLM_ARCH_SEED_OSS:
            {
                llm = std::make_unique<llm_build_seed_oss>(*this, params);
            } break;
        case LLM_ARCH_DOTS1:
            {
                llm = std::make_unique<llm_build_dots1>(*this, params);
            } break;
        case LLM_ARCH_ARCEE:
            {
                llm = std::make_unique<llm_build_arcee>(*this, params);
            } break;
        case LLM_ARCH_AFMOE:
            {
                llm = std::make_unique<llm_build_afmoe>(*this, params);
            } break;
        case LLM_ARCH_ERNIE4_5:
            {
                llm = std::make_unique<llm_build_ernie4_5>(*this, params);
            } break;
        case LLM_ARCH_ERNIE4_5_MOE:
            {
                llm = std::make_unique<llm_build_ernie4_5_moe>(*this, params);
            } break;
        case LLM_ARCH_HUNYUAN_MOE:
            {
                llm = std::make_unique<llm_build_hunyuan_moe>(*this, params);
            } break;
        case LLM_ARCH_HUNYUAN_DENSE:
            {
                llm = std::make_unique<llm_build_hunyuan_dense>(*this, params);
            } break;
        case LLM_ARCH_SMOLLM3:
            {
                llm = std::make_unique<llm_build_smollm3>(*this, params);
            } break;
        case LLM_ARCH_OPENAI_MOE:
            {
                llm = std::make_unique<llm_build_openai_moe_iswa>(*this, params);
            } break;
        case LLM_ARCH_FALCON_H1:
            {
                llm = std::make_unique<llm_build_falcon_h1>(*this, params);
            } break;
        case LLM_ARCH_LFM2:
        case LLM_ARCH_LFM2MOE:
            {
                llm = std::make_unique<llm_build_lfm2>(*this, params);
            } break;
        case LLM_ARCH_SMALLTHINKER:
            {
                if (hparams.swa_type == LLAMA_SWA_TYPE_STANDARD) {
                    llm = std::make_unique<llm_build_smallthinker<true>> (*this, params);
                } else {
                    llm = std::make_unique<llm_build_smallthinker<false>>(*this, params);
                }
            } break;
        case LLM_ARCH_GROVEMOE:
            {
                llm = std::make_unique<llm_build_grovemoe>(*this, params);
            } break;
        case LLM_ARCH_APERTUS:
            {
                llm = std::make_unique<llm_build_apertus>(*this, params);
            } break;
        case LLM_ARCH_MINIMAX_M2:
            {
                llm = std::make_unique<llm_build_minimax_m2>(*this, params);
            } break;
        case LLM_ARCH_COGVLM:
            {
                llm = std::make_unique<llm_build_cogvlm>(*this, params);
            } break;
        case LLM_ARCH_PANGU_EMBED:
            {
                llm = std::make_unique<llm_build_pangu_embedded>(*this, params);
            } break;
        case LLM_ARCH_QWEN3NEXT:
            {
                llm = std::make_unique<llm_build_qwen3next>(*this, params);
            } break;
        case LLM_ARCH_MISTRAL3:
            {
                llm = std::make_unique<llm_build_mistral3>(*this, params);
            } break;
        default:
            GGML_ABORT("fatal error");
    }

    // add on pooling layer
    llm->build_pooling(cls, cls_b, cls_out, cls_out_b);

    // if the gguf model was converted with --sentence-transformers-dense-modules
    // there will be two additional dense projection layers
    // dense linear projections are applied after pooling
    // TODO: move reranking logic here and generalize
    llm->build_dense_out(dense_2_out_layers, dense_3_out_layers);

    return llm->res->get_gf();
}
//
// interface implementation
//

llama_model_params llama_model_default_params() {
    llama_model_params result = {
        /*.devices                     =*/ nullptr,
        /*.tensor_buft_overrides       =*/ nullptr,
        /*.n_gpu_layers                =*/ 999,
        /*.split_mode                  =*/ LLAMA_SPLIT_MODE_LAYER,
        /*.main_gpu                    =*/ 0,
        /*.tensor_split                =*/ nullptr,
        /*.progress_callback           =*/ nullptr,
        /*.progress_callback_user_data =*/ nullptr,
        /*.kv_overrides                =*/ nullptr,
        /*.vocab_only                  =*/ false,
        /*.use_mmap                    =*/ true,
        /*.use_mlock                   =*/ false,
        /*.check_tensors               =*/ false,
        /*.use_extra_bufts             =*/ true,
        /*.no_host                     =*/ false,
        /*.no_alloc                    =*/ false,
    };

    return result;
}
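// example usage (illustrative sketch): override a couple of the defaults above before
// loading a model; the path "model.gguf" is a placeholder chosen for the example
//
//     llama_model_params mparams = llama_model_default_params();
//     mparams.n_gpu_layers = 0;      // keep all layers on the CPU
//     mparams.use_mmap     = false;  // read tensors into memory instead of mmap-ing the file
//
//     llama_model * model = llama_model_load_from_file("model.gguf", mparams);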
const llama_vocab * llama_model_get_vocab(const llama_model * model) {
    return &model->vocab;
}

void llama_free_model(llama_model * model) {
    llama_model_free(model);
}

void llama_model_free(llama_model * model) {
    delete model;
}

int32_t llama_model_n_ctx_train(const llama_model * model) {
    return model->hparams.n_ctx_train;
}

int32_t llama_model_n_embd(const llama_model * model) {
    return model->hparams.n_embd;
}

int32_t llama_model_n_embd_inp(const llama_model * model) {
    return model->hparams.n_embd_inp();
}

int32_t llama_model_n_layer(const llama_model * model) {
    return model->hparams.n_layer;
}

int32_t llama_model_n_head(const llama_model * model) {
    return model->hparams.n_head();
}

int32_t llama_model_n_head_kv(const llama_model * model) {
    return model->hparams.n_head_kv();
}

int32_t llama_model_n_swa(const llama_model * model) {
    return model->hparams.n_swa;
}

uint32_t llama_model_n_cls_out(const struct llama_model * model) {
    return model->hparams.n_cls_out;
}

const char * llama_model_cls_label(const struct llama_model * model, uint32_t i) {
    if (i < model->classifier_labels.size()) {
        return model->classifier_labels[i].c_str();
    }

    return nullptr;
}
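// example usage (illustrative): enumerate the labels of a classification head;
// `model` stands in for any loaded classifier model
//
//     const uint32_t n_cls = llama_model_n_cls_out(model);
//     for (uint32_t i = 0; i < n_cls; ++i) {
//         const char * label = llama_model_cls_label(model, i);
//         printf("class %u: %s\n", i, label ? label : "(unnamed)");
//     }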
// deprecated
int32_t llama_n_ctx_train(const llama_model * model) {
    return llama_model_n_ctx_train(model);
}

// deprecated
int32_t llama_n_embd(const llama_model * model) {
    return llama_model_n_embd(model);
}

// deprecated
int32_t llama_n_layer(const llama_model * model) {
    return llama_model_n_layer(model);
}

// deprecated
int32_t llama_n_head(const llama_model * model) {
    return llama_model_n_head(model);
}
llama_rope_type llama_model_rope_type(const llama_model * model) {
    switch (model->arch) {
        // these models do not use RoPE
        case LLM_ARCH_CLIP:
        case LLM_ARCH_GPT2:
        case LLM_ARCH_GPTJ:
        case LLM_ARCH_MPT:
        case LLM_ARCH_REFACT:
        case LLM_ARCH_BLOOM:
        case LLM_ARCH_MAMBA:
        case LLM_ARCH_MAMBA2:
        case LLM_ARCH_JAMBA:
        case LLM_ARCH_JINA_BERT_V2:
        case LLM_ARCH_T5:
        case LLM_ARCH_T5ENCODER:
        case LLM_ARCH_JAIS:
        case LLM_ARCH_RWKV6:
        case LLM_ARCH_RWKV6QWEN2:
        case LLM_ARCH_RWKV7:
        case LLM_ARCH_ARWKV7:
        case LLM_ARCH_WAVTOKENIZER_DEC:
        case LLM_ARCH_NEMOTRON_H:
            return LLAMA_ROPE_TYPE_NONE;

        // use what we call a normal RoPE, operating on pairs of consecutive head values
        case LLM_ARCH_LLAMA:
        case LLM_ARCH_LLADA:
        case LLM_ARCH_LLAMA4:
        case LLM_ARCH_DECI:
        case LLM_ARCH_BAICHUAN:
        case LLM_ARCH_STARCODER:
        case LLM_ARCH_INTERNLM2:
        case LLM_ARCH_MINICPM:
        case LLM_ARCH_XVERSE:
        case LLM_ARCH_COMMAND_R:
        case LLM_ARCH_COHERE2:
        case LLM_ARCH_OLMO:
        case LLM_ARCH_ARCTIC:
        case LLM_ARCH_DEEPSEEK:
        case LLM_ARCH_DEEPSEEK2:
        case LLM_ARCH_PLM:
        case LLM_ARCH_CHATGLM:
        case LLM_ARCH_GLM4:
        case LLM_ARCH_GRANITE:
        case LLM_ARCH_GRANITE_MOE:
        case LLM_ARCH_GRANITE_HYBRID:
        case LLM_ARCH_CHAMELEON:
        case LLM_ARCH_BAILINGMOE:
        case LLM_ARCH_NEO_BERT:
        case LLM_ARCH_SMOLLM3:
        case LLM_ARCH_ARCEE:
        case LLM_ARCH_ERNIE4_5:
        case LLM_ARCH_ERNIE4_5_MOE:
        case LLM_ARCH_MISTRAL3:
            return LLAMA_ROPE_TYPE_NORM;

        // the pairs of head values are offset by n_rot/2
        case LLM_ARCH_FALCON:
        case LLM_ARCH_FALCON_H1:
        case LLM_ARCH_GROK:
        case LLM_ARCH_DBRX:
        case LLM_ARCH_BERT:
        case LLM_ARCH_JINA_BERT_V3:
        case LLM_ARCH_NOMIC_BERT:
        case LLM_ARCH_NOMIC_BERT_MOE:
        case LLM_ARCH_STABLELM:
        case LLM_ARCH_BITNET:
        case LLM_ARCH_QWEN:
        case LLM_ARCH_QWEN2:
        case LLM_ARCH_DREAM:
        case LLM_ARCH_QWEN2MOE:
        case LLM_ARCH_QWEN3:
        case LLM_ARCH_QWEN3MOE:
        case LLM_ARCH_LLADA_MOE:
        case LLM_ARCH_RND1:
        case LLM_ARCH_OLMO2:
        case LLM_ARCH_OLMOE:
        case LLM_ARCH_PHI2:
        case LLM_ARCH_PHI3:
        case LLM_ARCH_PHIMOE:
        case LLM_ARCH_PLAMO:
        case LLM_ARCH_PLAMO2:
        case LLM_ARCH_GEMMA:
        case LLM_ARCH_GEMMA2:
        case LLM_ARCH_GEMMA3:
        case LLM_ARCH_GEMMA3N:
        case LLM_ARCH_GEMMA_EMBEDDING:
        case LLM_ARCH_STARCODER2:
        case LLM_ARCH_OPENELM:
        case LLM_ARCH_GPTNEOX:
        case LLM_ARCH_CODESHELL:
        case LLM_ARCH_ORION:
        case LLM_ARCH_NEMOTRON:
        case LLM_ARCH_EXAONE:
        case LLM_ARCH_EXAONE4:
        case LLM_ARCH_MINICPM3:
        case LLM_ARCH_BAILINGMOE2:
        case LLM_ARCH_DOTS1:
        case LLM_ARCH_HUNYUAN_MOE:
        case LLM_ARCH_OPENAI_MOE:
        case LLM_ARCH_HUNYUAN_DENSE:
        case LLM_ARCH_LFM2:
        case LLM_ARCH_LFM2MOE:
        case LLM_ARCH_SMALLTHINKER:
        case LLM_ARCH_GLM4_MOE:
        case LLM_ARCH_SEED_OSS:
        case LLM_ARCH_GROVEMOE:
        case LLM_ARCH_APERTUS:
        case LLM_ARCH_MINIMAX_M2:
        case LLM_ARCH_COGVLM:
        case LLM_ARCH_PANGU_EMBED:
        case LLM_ARCH_AFMOE:
        case LLM_ARCH_QWEN3NEXT:
            return LLAMA_ROPE_TYPE_NEOX;

        case LLM_ARCH_QWEN2VL:
            return LLAMA_ROPE_TYPE_MROPE;

        case LLM_ARCH_QWEN3VL:
        case LLM_ARCH_QWEN3VLMOE:
            return LLAMA_ROPE_TYPE_IMROPE;

        // all model arches should be listed explicitly here
        case LLM_ARCH_UNKNOWN:
            GGML_ABORT("unknown architecture");
    }

    return LLAMA_ROPE_TYPE_NONE;
}
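// example usage (illustrative): downstream code typically only needs to know which
// positional layout the model expects before applying rotary embeddings
//
//     switch (llama_model_rope_type(model)) {
//         case LLAMA_ROPE_TYPE_NONE: /* no rotary embedding                     */ break;
//         case LLAMA_ROPE_TYPE_NORM: /* rotate consecutive pairs of head values */ break;
//         case LLAMA_ROPE_TYPE_NEOX: /* rotate pairs offset by n_rot/2          */ break;
//         default:                   /* multi-section (M-RoPE) variants         */ break;
//     }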
float llama_model_rope_freq_scale_train(const llama_model * model) {
    return model->hparams.rope_freq_scale_train;
}

int32_t llama_model_meta_val_str(const llama_model * model, const char * key, char * buf, size_t buf_size) {
    const auto & it = model->gguf_kv.find(key);
    if (it == model->gguf_kv.end()) {
        if (buf_size > 0) {
            buf[0] = '\0';
        }
        return -1;
    }
    return snprintf(buf, buf_size, "%s", it->second.c_str());
}
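// example usage (illustrative): the return value follows the snprintf convention
// (-1 when the key is missing, otherwise the untruncated length of the value);
// "general.architecture" is a common GGUF key, but its presence is not guaranteed
//
//     char buf[128];
//     if (llama_model_meta_val_str(model, "general.architecture", buf, sizeof(buf)) >= 0) {
//         printf("architecture: %s\n", buf);
//     }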
int32_t llama_model_meta_count(const llama_model * model) {
    return (int)model->gguf_kv.size();
}

const char * llama_model_meta_key_str(llama_model_meta_key key) {
    switch (key) {
        case LLAMA_MODEL_META_KEY_SAMPLING_SEQUENCE:        return "general.sampling.sequence";
        case LLAMA_MODEL_META_KEY_SAMPLING_TOP_K:           return "general.sampling.top_k";
        case LLAMA_MODEL_META_KEY_SAMPLING_TOP_P:           return "general.sampling.top_p";
        case LLAMA_MODEL_META_KEY_SAMPLING_MIN_P:           return "general.sampling.min_p";
        case LLAMA_MODEL_META_KEY_SAMPLING_XTC_PROBABILITY: return "general.sampling.xtc_probability";
        case LLAMA_MODEL_META_KEY_SAMPLING_XTC_THRESHOLD:   return "general.sampling.xtc_threshold";
        case LLAMA_MODEL_META_KEY_SAMPLING_TEMP:            return "general.sampling.temp";
        case LLAMA_MODEL_META_KEY_SAMPLING_PENALTY_LAST_N:  return "general.sampling.penalty_last_n";
        case LLAMA_MODEL_META_KEY_SAMPLING_PENALTY_REPEAT:  return "general.sampling.penalty_repeat";
        case LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT:        return "general.sampling.mirostat";
        case LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT_TAU:    return "general.sampling.mirostat_tau";
        case LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT_ETA:    return "general.sampling.mirostat_eta";
        default:                                            return nullptr;
    }
}
int32_t llama_model_meta_key_by_index(const llama_model * model, int i, char * buf, size_t buf_size) {
    if (i < 0 || i >= (int)model->gguf_kv.size()) {
        if (buf_size > 0) {
            buf[0] = '\0';
        }
        return -1;
    }
    auto it = model->gguf_kv.begin();
    std::advance(it, i);
    return snprintf(buf, buf_size, "%s", it->first.c_str());
}

int32_t llama_model_meta_val_str_by_index(const llama_model * model, int32_t i, char * buf, size_t buf_size) {
    if (i < 0 || i >= (int)model->gguf_kv.size()) {
        if (buf_size > 0) {
            buf[0] = '\0';
        }
        return -1;
    }
    auto it = model->gguf_kv.begin();
    std::advance(it, i);
    return snprintf(buf, buf_size, "%s", it->second.c_str());
}
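// example usage (illustrative): dump all GGUF metadata key/value pairs with the
// by-index accessors above; the fixed 256-byte buffers may truncate long values
//
//     const int32_t n_meta = llama_model_meta_count(model);
//     for (int32_t i = 0; i < n_meta; ++i) {
//         char key[256];
//         char val[256];
//         llama_model_meta_key_by_index    (model, i, key, sizeof(key));
//         llama_model_meta_val_str_by_index(model, i, val, sizeof(val));
//         printf("%s = %s\n", key, val);
//     }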
int32_t llama_model_desc(const llama_model * model, char * buf, size_t buf_size) {
    return snprintf(buf, buf_size, "%s", model->desc().c_str());
}

uint64_t llama_model_size(const llama_model * model) {
    return model->size();
}

const char * llama_model_chat_template(const llama_model * model, const char * name) {
    const auto key = name ? LLM_KV(model->arch, name)(LLM_KV_TOKENIZER_CHAT_TEMPLATE)
        : LLM_KV(model->arch)(LLM_KV_TOKENIZER_CHAT_TEMPLATE);
    const auto & it = model->gguf_kv.find(key);
    if (it == model->gguf_kv.end()) {
        // one-off fix for very popular models (so we are not flooded with issues)
        // do not extend this list unless absolutely necessary
        // Mistral-Small-2503 does not have built-in chat template
        llama_vocab_pre_type pre_type = model->vocab.get_pre_type();
        if (!name && pre_type == LLAMA_VOCAB_PRE_TYPE_TEKKEN && model->layers.size() == 40) {
            return "mistral-v7-tekken";
        }

        return nullptr;
    }

    return it->second.c_str();
}
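// example usage (illustrative): fetch the model's default chat template; a nullptr
// result means the GGUF ships without one and the caller must supply its own
//
//     const char * tmpl = llama_model_chat_template(model, /*name =*/ nullptr);
//     if (tmpl == nullptr) {
//         // no built-in template - fall back to an application-provided one
//     }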
uint64_t llama_model_n_params(const llama_model * model) {
    return model->n_elements();
}

bool llama_model_has_encoder(const llama_model * model) {
    switch (model->arch) {
        case LLM_ARCH_T5:        return true;
        case LLM_ARCH_T5ENCODER: return true;
        default:                 return false;
    }
}

bool llama_model_has_decoder(const llama_model * model) {
    switch (model->arch) {
        case LLM_ARCH_T5ENCODER: return false;
        default:                 return true;
    }
}

llama_token llama_model_decoder_start_token(const llama_model * model) {
    return model->hparams.dec_start_token_id;
}
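// example usage (illustrative sketch): encoder-decoder models such as T5 are run by
// calling llama_encode() on the prompt first, then decoding starting from the decoder
// start token; `ctx` and `batch` are placeholders for a context and batch prepared by
// the caller
//
//     if (llama_model_has_encoder(model)) {
//         llama_encode(ctx, batch);
//
//         llama_token dec_start = llama_model_decoder_start_token(model);
//         // feed dec_start as the first decoder token via llama_decode(...)
//     }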
bool llama_model_is_recurrent(const llama_model * model) {
    return llm_arch_is_recurrent(model->arch);
}

bool llama_model_is_hybrid(const llama_model * model) {
    return llm_arch_is_hybrid(model->arch);
}

bool llama_model_is_diffusion(const llama_model * model) {
    return llm_arch_is_diffusion(model->arch);
}

const std::vector<std::pair<std::string, ggml_tensor *>> & llama_internal_get_tensor_map(const llama_model * model) {
    return model->tensors_by_name;
}