llama-model.cpp 617 KB

(file contents not shown; the listing spans 14,308 lines)
43091431014311143121431314314143151431614317143181431914320143211432214323143241432514326143271432814329143301433114332143331433414335143361433714338143391434014341143421434314344143451434614347143481434914350143511435214353143541435514356143571435814359143601436114362143631436414365143661436714368143691437014371143721437314374143751437614377143781437914380143811438214383143841438514386143871438814389143901439114392143931439414395143961439714398143991440014401144021440314404144051440614407144081440914410144111441214413144141441514416144171441814419
  1. #include "llama-model.h"
  2. #include "llama-impl.h"
  3. #include "llama-mmap.h"
  4. #include "llama-batch.h"
  5. #include "llama-cparams.h"
  6. #include "llama-model-loader.h"
  7. #include "llama-kv-cache-unified.h"
  8. #include "llama-kv-cache-unified-iswa.h"
  9. #include "llama-memory-hybrid.h"
  10. #include "llama-memory-recurrent.h"
  11. #include "ggml-cpp.h"
  12. #include <algorithm>
  13. #include <cassert>
  14. #include <cmath>
  15. #include <cfloat>
  16. #include <cstring>
  18. #include <functional>
  19. #include <map>
  20. #include <regex>
  21. #include <sstream>
  22. #include <stdexcept>
  23. const char * llm_type_name(llm_type type) {
  24. switch (type) {
  25. case LLM_TYPE_14M: return "14M";
  26. case LLM_TYPE_17M: return "17M";
  27. case LLM_TYPE_22M: return "22M";
  28. case LLM_TYPE_33M: return "33M";
  29. case LLM_TYPE_60M: return "60M";
  30. case LLM_TYPE_70M: return "70M";
  31. case LLM_TYPE_80M: return "80M";
  32. case LLM_TYPE_109M: return "109M";
  33. case LLM_TYPE_137M: return "137M";
  34. case LLM_TYPE_160M: return "160M";
  35. case LLM_TYPE_190M: return "190M";
  36. case LLM_TYPE_220M: return "220M";
  37. case LLM_TYPE_250M: return "250M";
  38. case LLM_TYPE_270M: return "270M";
  39. case LLM_TYPE_335M: return "335M";
  40. case LLM_TYPE_410M: return "410M";
  41. case LLM_TYPE_450M: return "450M";
  42. case LLM_TYPE_475M: return "475M";
  43. case LLM_TYPE_770M: return "770M";
  44. case LLM_TYPE_780M: return "780M";
  45. case LLM_TYPE_0_5B: return "0.5B";
  46. case LLM_TYPE_0_6B: return "0.6B";
  47. case LLM_TYPE_1B: return "1B";
  48. case LLM_TYPE_1_3B: return "1.3B";
  49. case LLM_TYPE_1_4B: return "1.4B";
  50. case LLM_TYPE_1_5B: return "1.5B";
  51. case LLM_TYPE_1_6B: return "1.6B";
  52. case LLM_TYPE_1_7B: return "1.7B";
  53. case LLM_TYPE_1_8B: return "1.8B";
  54. case LLM_TYPE_2B: return "2B";
  55. case LLM_TYPE_2_8B: return "2.8B";
  56. case LLM_TYPE_2_9B: return "2.9B";
  57. case LLM_TYPE_3B: return "3B";
  58. case LLM_TYPE_4B: return "4B";
  59. case LLM_TYPE_6B: return "6B";
  60. case LLM_TYPE_6_9B: return "6.9B";
  61. case LLM_TYPE_7B: return "7B";
  62. case LLM_TYPE_8B: return "8B";
  63. case LLM_TYPE_9B: return "9B";
  64. case LLM_TYPE_11B: return "11B";
  65. case LLM_TYPE_12B: return "12B";
  66. case LLM_TYPE_13B: return "13B";
  67. case LLM_TYPE_14B: return "14B";
  68. case LLM_TYPE_15B: return "15B";
  69. case LLM_TYPE_16B: return "16B";
  70. case LLM_TYPE_20B: return "20B";
  71. case LLM_TYPE_27B: return "27B";
  72. case LLM_TYPE_30B: return "30B";
  73. case LLM_TYPE_32B: return "32B";
  74. case LLM_TYPE_34B: return "34B";
  75. case LLM_TYPE_35B: return "35B";
  76. case LLM_TYPE_40B: return "40B";
  77. case LLM_TYPE_65B: return "65B";
  78. case LLM_TYPE_70B: return "70B";
  79. case LLM_TYPE_142B: return "142B";
  80. case LLM_TYPE_236B: return "236B";
  81. case LLM_TYPE_290B: return "290B";
  82. case LLM_TYPE_314B: return "314B";
  83. case LLM_TYPE_405B: return "405B";
  84. case LLM_TYPE_671B: return "671B";
  85. case LLM_TYPE_SMALL: return "0.1B";
  86. case LLM_TYPE_MEDIUM: return "0.4B";
  87. case LLM_TYPE_LARGE: return "0.8B";
  88. case LLM_TYPE_XL: return "1.5B";
  89. case LLM_TYPE_A1_7B: return "A1.7B";
  90. case LLM_TYPE_A2_7B: return "A2.7B";
  91. case LLM_TYPE_8x7B: return "8x7B";
  92. case LLM_TYPE_8x22B: return "8x22B";
  93. case LLM_TYPE_16x12B: return "16x12B";
  94. case LLM_TYPE_16x3_8B: return "16x3.8B";
  95. case LLM_TYPE_10B_128x3_66B: return "10B+128x3.66B";
  96. case LLM_TYPE_57B_A14B: return "57B.A14B";
  97. case LLM_TYPE_17B_16E: return "17Bx16E (Scout)";
  98. case LLM_TYPE_17B_128E: return "17Bx128E (Maverick)";
  99. case LLM_TYPE_30B_A3B: return "30B.A3B";
  100. case LLM_TYPE_235B_A22B: return "235B.A22B";
  101. default: return "?B";
  102. }
  103. }
  104. static const char * llama_expert_gating_func_name(llama_expert_gating_func_type type) {
  105. switch (type) {
  106. case LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX: return "softmax";
  107. case LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID: return "sigmoid";
  108. default: return "unknown";
  109. }
  110. }
  111. static const std::map<llama_rope_scaling_type, const char *> LLAMA_ROPE_SCALING_TYPES = {
  112. { LLAMA_ROPE_SCALING_TYPE_NONE, "none" },
  113. { LLAMA_ROPE_SCALING_TYPE_LINEAR, "linear" },
  114. { LLAMA_ROPE_SCALING_TYPE_YARN, "yarn" },
  115. { LLAMA_ROPE_SCALING_TYPE_LONGROPE, "longrope" },
  116. };
  117. std::string llama_rope_scaling_type_name(llama_rope_scaling_type rope_scaling_type) {
  118. return LLAMA_ROPE_SCALING_TYPES.at(rope_scaling_type);
  119. }
  120. static llama_rope_scaling_type llama_rope_scaling_type_from_string(const std::string & name) {
  121. for (const auto & kv : LLAMA_ROPE_SCALING_TYPES) {
  122. if (kv.second == name) {
  123. return (llama_rope_scaling_type) kv.first;
  124. }
  125. }
  126. return LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
  127. }
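// Illustrative usage (editor's sketch, not part of the original source): the two helpers
// above round-trip between the enum and its GGUF string form, falling back to
// LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED for names that are not in the table:
//
//   llama_rope_scaling_type t = llama_rope_scaling_type_from_string("yarn");
//   // t == LLAMA_ROPE_SCALING_TYPE_YARN
//   std::string s = llama_rope_scaling_type_name(t);                 // "yarn"
//   llama_rope_scaling_type u = llama_rope_scaling_type_from_string("bogus");
//   // u == LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED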
  128. // checks if the weight tensor can be used with the specified buffer type and device
  129. static bool weight_buft_supported(const llama_hparams & hparams, ggml_tensor * w, ggml_op op, ggml_backend_buffer_type_t buft, ggml_backend_dev_t dev) {
  130. GGML_ASSERT(w != nullptr);
  131. if (op == GGML_OP_NONE) {
  132. return true;
  133. }
  134. ggml_init_params params = {
  135. /*.mem_size =*/ ggml_tensor_overhead()*8,
  136. /*.mem_buffer =*/ NULL,
  137. /*.no_alloc =*/ true,
  138. };
  139. ggml_context_ptr ctx_ptr { ggml_init(params) };
  140. if (!ctx_ptr) {
  141. throw std::runtime_error(format("failed to create ggml context"));
  142. }
  143. ggml_context * ctx = ctx_ptr.get();
  144. ggml_tensor * op_tensor = nullptr;
  145. switch (op) {
  146. case GGML_OP_GET_ROWS:
  147. {
  148. ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 512);
  149. op_tensor = ggml_get_rows(ctx, w, b);
  150. } break;
  151. case GGML_OP_MUL_MAT:
  152. {
  153. ggml_tensor * b = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], 512, w->ne[2], w->ne[3]);
  154. op_tensor = ggml_mul_mat(ctx, w, b);
  155. } break;
  156. case GGML_OP_MUL_MAT_ID:
  157. {
  158. int n_expert_used = hparams.n_expert_used;
  159. ggml_tensor * b = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, w->ne[0], n_expert_used, 512);
  160. ggml_tensor * ids = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, n_expert_used, 512);
  161. op_tensor = ggml_mul_mat_id(ctx, w, b, ids);
  162. } break;
  163. case GGML_OP_ADD:
  164. {
  165. ggml_tensor * a = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], w->ne[1], w->ne[2], w->ne[3]);
  166. op_tensor = ggml_add(ctx, a, w);
  167. } break;
  168. case GGML_OP_MUL:
  169. {
  170. ggml_tensor * a = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], w->ne[1], w->ne[2], w->ne[3]);
  171. op_tensor = ggml_mul(ctx, a, w);
  172. } break;
  173. case GGML_OP_DIV:
  174. {
  175. ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, w->ne[0]);
  176. op_tensor = ggml_div(ctx, a, w);
  177. } break;
  178. case GGML_OP_ROPE:
  179. {
  180. int n_embd_head = hparams.n_embd_head_v;
  181. int n_head = hparams.n_head();
  182. ggml_tensor * a = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_embd_head, n_head, 512);
  183. ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 512);
  184. op_tensor = ggml_rope_ext(
  185. ctx, a, b, w,
  186. 0, 0, 0, 0, 0,
  187. 0, 0, 0, 0
  188. );
  189. } break;
  190. case GGML_OP_SSM_CONV:
  191. {
  192. // FIXME
  193. ggml_tensor * conv_x = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 12345, w->ne[1], 6789);
  194. op_tensor = ggml_ssm_conv(ctx, conv_x, w);
  195. } break;
  196. case GGML_OP_SSM_SCAN:
  197. {
  198. // FIXME
  199. const int64_t d_state = w->ne[0];
  200. const int64_t d_inner = w->ne[1];
  201. const int64_t n_seq_tokens = 512;
  202. const int64_t n_seqs = 1;
  203. ggml_tensor * s = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, d_state, d_inner, n_seqs);
  204. ggml_tensor * x = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, d_inner, n_seq_tokens, n_seqs);
  205. ggml_tensor * dt = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, d_inner, n_seq_tokens, n_seqs);
  206. ggml_tensor * B = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, d_state, n_seq_tokens, n_seqs);
  207. ggml_tensor * C = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, d_state, n_seq_tokens, n_seqs);
  208. op_tensor = ggml_ssm_scan(ctx, s, x, dt, w, B, C);
  209. } break;
  210. case GGML_OP_RWKV_WKV6:
  211. {
  212. // FIXME
  213. const int64_t S = 123;
  214. const int64_t H = 123;
  215. const int64_t n_tokens = 123;
  216. const int64_t n_seqs = 123;
  217. ggml_tensor * k = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens);
  218. ggml_tensor * v = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens);
  219. ggml_tensor * r = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens);
  220. ggml_tensor * tf = w;
  221. ggml_tensor * td = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens);
  222. ggml_tensor * state = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, S, n_seqs, S, H);
  223. op_tensor = ggml_rwkv_wkv6(ctx, k, v, r, tf, td, state);
  224. } break;
  225. case GGML_OP_IM2COL:
  226. {
  227. const int n_embd = hparams.n_embd;
  228. ggml_tensor * b = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, n_embd, w->ne[1], 1, 1);
  229. op_tensor = ggml_im2col(ctx, w, b, 1, 0, 0, 0, 1, 0, false, GGML_TYPE_F16);
  230. } break;
  231. default:
  232. GGML_ABORT("%s: missing test for op %s for tensor %s", __func__, ggml_op_name(op), w->name);
  233. }
  234. // create a temporary dummy buffer for the weight so that supports_op can check the buffer type
  235. GGML_ASSERT(w->buffer == nullptr);
  236. w->buffer = ggml_backend_buft_alloc_buffer(buft, 0);
  237. bool op_supported = ggml_backend_dev_supports_op(dev, op_tensor);
  238. ggml_backend_buffer_free(w->buffer);
  239. w->buffer = nullptr;
  240. return op_supported;
  241. }
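// Note (editor's sketch, not part of the original source): the check above builds a
// throw-away graph node for the given op, temporarily attaches a zero-sized buffer of the
// candidate buffer type to the weight, and asks the device via ggml_backend_dev_supports_op()
// whether it could execute that node. A hypothetical call (tensor/buft/dev names assumed):
//
//   bool ok = weight_buft_supported(hparams, some_weight, GGML_OP_MUL_MAT, some_buft, some_dev);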
  242. // lists of buffer types used for each layer
  243. using buft_list_t = std::vector<std::pair<ggml_backend_dev_t, ggml_backend_buffer_type_t>>;
  244. // find the first buffer type in the list that can use the tensor
  245. static ggml_backend_buffer_type_t select_weight_buft(const llama_hparams & hparams, ggml_tensor * tensor, ggml_op op, const buft_list_t & buft_list) {
  246. GGML_ASSERT(!buft_list.empty());
  247. for (const auto & cur : buft_list) {
  248. ggml_backend_dev_t cur_dev = cur.first;
  249. ggml_backend_buffer_type_t cur_buft = cur.second;
  250. if (weight_buft_supported(hparams, tensor, op, cur_buft, cur_dev)) {
  251. return cur_buft;
  252. }
  253. }
  254. return nullptr;
  255. }
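// Illustrative usage (editor's sketch; the tensor name is hypothetical): pick the first
// buffer type in priority order that can run the op this weight participates in:
//
//   ggml_backend_buffer_type_t buft =
//       select_weight_buft(hparams, tok_embd_tensor, GGML_OP_GET_ROWS, pimpl->cpu_buft_list);
//   if (buft == nullptr) {
//       // no device/buffer type in the list supports this op for this tensor
//   }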
  256. // CPU: ACCEL -> GPU host -> CPU extra -> CPU
  257. static buft_list_t make_cpu_buft_list(const std::vector<ggml_backend_dev_t> & devices) {
  258. buft_list_t buft_list;
  259. // add ACCEL buffer types
  260. for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
  261. ggml_backend_dev_t dev = ggml_backend_dev_get(i);
  262. if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_ACCEL) {
  263. auto * buft = ggml_backend_dev_buffer_type(dev);
  265. // skip the plain CPU buffer type here; it is added explicitly at the end of the list
  265. if (buft != ggml_backend_cpu_buffer_type()) {
  266. buft_list.emplace_back(dev, buft);
  267. }
  268. }
  269. }
  270. // add a host buffer type
  271. // storing the tensors in a host buffer is useful when the processing of large batches
  272. // is offloaded to a GPU device, since it reduces the time spent on data transfers
  273. // generally, this will be done using the first device in the list
  274. // a better approach would be to handle this on a weight-by-weight basis using the offload_op
  275. // function of the device to determine if it would benefit from being stored in a host buffer
  276. for (auto * dev : devices) {
  277. ggml_backend_buffer_type_t buft = ggml_backend_dev_host_buffer_type(dev);
  278. if (buft) {
  279. buft_list.emplace_back(dev, buft);
  280. break;
  281. }
  282. }
  283. // add extra buffer types, only if no GPU device is present
  284. // ref: https://github.com/ggml-org/llama.cpp/issues/12481#issuecomment-2743136094
  285. auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
  286. if (cpu_dev == nullptr) {
  287. throw std::runtime_error(format("%s: no CPU backend found", __func__));
  288. }
  289. auto * cpu_reg = ggml_backend_dev_backend_reg(cpu_dev);
  290. auto ggml_backend_dev_get_extra_bufts_fn = (ggml_backend_dev_get_extra_bufts_t)
  291. ggml_backend_reg_get_proc_address(cpu_reg, "ggml_backend_dev_get_extra_bufts");
  292. if (ggml_backend_dev_get_extra_bufts_fn) {
  293. ggml_backend_buffer_type_t * extra_bufts = ggml_backend_dev_get_extra_bufts_fn(cpu_dev);
  294. while (extra_bufts && *extra_bufts) {
  295. buft_list.emplace_back(cpu_dev, *extra_bufts);
  296. ++extra_bufts;
  297. }
  298. }
  299. // add the CPU buffer type
  300. for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
  301. ggml_backend_dev_t dev = ggml_backend_dev_get(i);
  302. if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_CPU) {
  303. buft_list.emplace_back(dev, ggml_backend_dev_buffer_type(dev));
  304. }
  305. }
  306. return buft_list;
  307. }
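// Example of the resulting priority order (editor's sketch, assuming one GPU device that
// exposes a host buffer type and a CPU backend that exposes extra buffer types):
//
//   [ (accel_dev, accel_buft),      // ACCEL devices first
//     (gpu_dev,   gpu_host_buft),   // pinned host memory of the first GPU in `devices`
//     (cpu_dev,   cpu_extra_buft),  // e.g. repacked/AMX-style CPU buffer types
//     (cpu_dev,   cpu_buft) ]       // plain CPU buffer type as the final fallback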
  308. // GPU: split if LLAMA_SPLIT_MODE_ROW -> GPU
  309. static buft_list_t make_gpu_buft_list(ggml_backend_dev_t dev, llama_split_mode split_mode, const float * tensor_split) {
  310. buft_list_t buft_list;
  311. // add the device split buffer type if requested and available
  312. if (split_mode == LLAMA_SPLIT_MODE_ROW) {
  313. ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev);
  314. auto ggml_backend_split_buffer_type_fn = (ggml_backend_split_buffer_type_t)
  315. ggml_backend_reg_get_proc_address(reg, "ggml_backend_split_buffer_type");
  316. if (ggml_backend_split_buffer_type_fn) {
  317. size_t dev_index = [&]() {
  318. auto * reg = ggml_backend_dev_backend_reg(dev);
  319. for (size_t i = 0; i < ggml_backend_reg_dev_count(reg); ++i) {
  320. if (ggml_backend_reg_dev_get(reg, i) == dev) {
  321. return i;
  322. }
  323. }
  324. throw std::runtime_error(format("device %s not found in its backend reg", ggml_backend_dev_name(dev)));
  325. }();
  326. auto * buft = ggml_backend_split_buffer_type_fn(dev_index, tensor_split);
  327. if (buft != nullptr) {
  328. buft_list.emplace_back(dev, buft);
  329. }
  330. }
  331. }
  332. // add the device default buffer type
  333. buft_list.emplace_back(dev, ggml_backend_dev_buffer_type(dev));
  334. return buft_list;
  335. }
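// Example (editor's sketch, not part of the original source): with LLAMA_SPLIT_MODE_ROW the
// backend's split buffer type (if it provides one) is tried first, so row-split weights land
// there, while the device's default buffer type remains as the fallback:
//
//   buft_list_t bufts = make_gpu_buft_list(gpu_dev, LLAMA_SPLIT_MODE_ROW, tensor_split);
//   // bufts == { (gpu_dev, split_buft), (gpu_dev, default_buft) }   // split_buft only if available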
  336. struct llama_model::impl {
  337. impl() {}
  338. ~impl() {}
  339. uint64_t n_elements = 0;
  340. size_t n_bytes = 0;
  341. std::string desc_str;
  342. // model memory mapped files
  343. llama_mmaps mappings;
  344. // objects representing data potentially being locked in memory
  345. llama_mlocks mlock_bufs;
  346. llama_mlocks mlock_mmaps;
  347. // contexts where the model tensors metadata is stored
  348. std::vector<ggml_context_ptr> ctxs;
  349. // the model memory buffers for the tensor data
  350. std::vector<ggml_backend_buffer_ptr> bufs;
  351. buft_list_t cpu_buft_list;
  352. std::map<ggml_backend_dev_t, buft_list_t> gpu_buft_list;
  353. struct layer_dev {
  354. ggml_backend_dev_t dev;
  355. buft_list_t * buft_list;
  356. };
  357. layer_dev dev_input = {};
  358. layer_dev dev_output = {};
  359. std::vector<layer_dev> dev_layer;
  360. bool has_tensor_overrides;
  361. };
  362. llama_model::llama_model(const llama_model_params & params) : params(params), pimpl(std::make_unique<impl>()) {
  363. pimpl->has_tensor_overrides = params.tensor_buft_overrides && params.tensor_buft_overrides[0].pattern;
  364. }
  365. llama_model::~llama_model() {}
  366. void llama_model::load_stats(llama_model_loader & ml) {
  367. pimpl->n_elements = ml.n_elements;
  368. pimpl->n_bytes = ml.n_bytes;
  369. }
  370. void llama_model::load_arch(llama_model_loader & ml) {
  371. arch = ml.get_arch();
  372. if (arch == LLM_ARCH_UNKNOWN) {
  373. throw std::runtime_error("unknown model architecture: '" + ml.get_arch_name() + "'");
  374. }
  375. }
  376. void llama_model::load_hparams(llama_model_loader & ml) {
  377. const gguf_context * ctx = ml.meta.get();
  378. // get metadata as string
  379. for (int i = 0; i < gguf_get_n_kv(ctx); i++) {
  380. gguf_type type = gguf_get_kv_type(ctx, i);
  381. if (type == GGUF_TYPE_ARRAY) {
  382. continue;
  383. }
  384. const char * name = gguf_get_key(ctx, i);
  385. const std::string value = gguf_kv_to_str(ctx, i);
  386. gguf_kv.emplace(name, value);
  387. }
  388. // get general kv
  389. ml.get_key(LLM_KV_GENERAL_NAME, name, false);
  390. // everything past this point is not vocab-related
  391. if (hparams.vocab_only) {
  392. return;
  393. }
  394. ml.get_key(LLM_KV_CONTEXT_LENGTH, hparams.n_ctx_train);
  395. ml.get_key(LLM_KV_EMBEDDING_LENGTH, hparams.n_embd);
  396. ml.get_key(LLM_KV_BLOCK_COUNT, hparams.n_layer);
  397. ml.get_key(LLM_KV_EXPERT_COUNT, hparams.n_expert, false);
  398. ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used, false);
  399. if (arch == LLM_ARCH_WAVTOKENIZER_DEC) {
  400. ml.get_key(LLM_KV_FEATURES_LENGTH, hparams.n_embd_features);
  401. ml.get_key(LLM_KV_POSNET_EMBEDDING_LENGTH, hparams.posnet.n_embd);
  402. ml.get_key(LLM_KV_POSNET_BLOCK_COUNT, hparams.posnet.n_layer);
  403. ml.get_key(LLM_KV_CONVNEXT_EMBEDDING_LENGTH, hparams.convnext.n_embd);
  404. ml.get_key(LLM_KV_CONVNEXT_BLOCK_COUNT, hparams.convnext.n_layer);
  405. }
  406. GGML_ASSERT(hparams.n_expert <= LLAMA_MAX_EXPERTS);
  407. GGML_ASSERT(hparams.n_expert_used <= hparams.n_expert);
  408. if (hparams.n_expert > 0) {
  409. GGML_ASSERT(hparams.n_expert_used > 0);
  410. } else {
  411. GGML_ASSERT(hparams.n_expert_used == 0);
  412. }
  413. std::fill(hparams.n_head_arr.begin(), hparams.n_head_arr.end(), 0);
  414. std::fill(hparams.n_head_kv_arr.begin(), hparams.n_head_kv_arr.end(), 0);
  415. std::fill(hparams.n_ff_arr.begin(), hparams.n_ff_arr.end(), 0);
  416. std::fill(
  417. hparams.recurrent_layer_arr.begin(),
  418. hparams.recurrent_layer_arr.end(),
  419. llm_arch_is_recurrent(ml.get_arch()));
  420. std::fill(hparams.rope_sections.begin(), hparams.rope_sections.end(), 0);
  421. std::fill(hparams.swa_layers.begin(), hparams.swa_layers.end(), 0);
  422. ml.get_key_or_arr(LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff_arr, hparams.n_layer, false);
  423. ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head_arr, hparams.n_layer, false);
  424. // n_head_kv is optional, default to n_head
  425. hparams.n_head_kv_arr = hparams.n_head_arr;
  426. ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT_KV, hparams.n_head_kv_arr, hparams.n_layer, false);
  427. bool rope_finetuned = false;
  428. ml.get_key(LLM_KV_ROPE_SCALING_FINETUNED, rope_finetuned, false);
  429. hparams.rope_finetuned = rope_finetuned;
  430. hparams.n_ctx_orig_yarn = hparams.n_ctx_train;
  431. ml.get_key(LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, hparams.n_ctx_orig_yarn, false);
  432. // rope_freq_base (optional)
  433. hparams.rope_freq_base_train = 10000.0f;
  434. ml.get_key(LLM_KV_ROPE_FREQ_BASE, hparams.rope_freq_base_train, false);
  435. std::string rope_scaling("linear");
  436. ml.get_key(LLM_KV_ROPE_SCALING_TYPE, rope_scaling, false);
  437. hparams.rope_scaling_type_train = llama_rope_scaling_type_from_string(rope_scaling);
  438. GGML_ASSERT(hparams.rope_scaling_type_train != LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED);
  439. // rope_freq_scale (inverse of the kv) is optional
  440. float ropescale = 0.0f;
  441. if (!ml.get_key(LLM_KV_ROPE_SCALING_FACTOR, ropescale, false)) {
  442. // try the old key name
  443. ml.get_key(LLM_KV_ROPE_SCALE_LINEAR, ropescale, false);
  444. }
  445. hparams.rope_freq_scale_train = ropescale == 0.0f ? 1.0f : 1.0f/ropescale;
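// Worked example (editor's note): a GGUF that stores rope.scaling.factor = 4.0f yields
// rope_freq_scale_train = 1.0f / 4.0f = 0.25f (positions compressed 4x), while a missing or
// zero factor leaves the scale at 1.0f (no scaling).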
  446. // by default assume that the sliding-window layers use the same scaling type as the non-sliding-window layers
  447. hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
  448. hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
  449. ml.get_key(LLM_KV_ROPE_SCALING_ATTN_FACTOR, hparams.rope_attn_factor, false);
  450. // non-transformer models do not have attention heads
  451. if (hparams.n_head() > 0) {
  452. // gpt-neox n_rot = rotary_pct * (n_embd / n_head)
  453. // gpt-j n_rot = rotary_dim
  454. hparams.n_embd_head_k = hparams.n_embd / hparams.n_head();
  455. ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH, hparams.n_embd_head_k, false);
  456. hparams.n_embd_head_v = hparams.n_embd / hparams.n_head();
  457. ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH, hparams.n_embd_head_v, false);
  458. // sanity check for n_rot (optional)
  459. hparams.n_rot = hparams.n_embd_head_k;
  460. ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot, false);
  461. if (arch == LLM_ARCH_LLAMA || arch == LLM_ARCH_DECI || arch == LLM_ARCH_FALCON) {
  462. if (hparams.n_rot != hparams.n_embd_head_k) {
  463. throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd_head_k));
  464. }
  465. }
  466. } else {
  467. hparams.n_rot = 0;
  468. hparams.n_embd_head_k = 0;
  469. hparams.n_embd_head_v = 0;
  470. }
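// Worked example (editor's note, hypothetical numbers): with n_embd = 4096 and n_head = 32,
// the defaults above give n_embd_head_k = n_embd_head_v = 4096 / 32 = 128, and n_rot also
// defaults to 128 unless rope.dimension_count overrides it.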
  471. // for differentiating model types
  472. uint32_t n_vocab = 0;
  473. ml.get_key(LLM_KV_VOCAB_SIZE, n_vocab, false) || ml.get_arr_n(LLM_KV_TOKENIZER_LIST, n_vocab, false);
  474. // for classifier models
  475. ml.get_arr(LLM_KV_CLASSIFIER_OUTPUT_LABELS, classifier_labels, false);
  476. if (!classifier_labels.empty()) {
  477. hparams.n_cls_out = classifier_labels.size();
  478. }
  479. // arch-specific KVs
  480. switch (arch) {
  481. case LLM_ARCH_LLAMA:
  482. {
  483. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  484. if (hparams.n_expert == 8) {
  485. switch (hparams.n_layer) {
  486. case 32: type = LLM_TYPE_8x7B; break;
  487. case 56: type = LLM_TYPE_8x22B; break;
  488. default: type = LLM_TYPE_UNKNOWN;
  489. }
  490. } else {
  491. switch (hparams.n_layer) {
  492. case 16: type = LLM_TYPE_1B; break; // Llama 3.2 1B
  493. case 22: type = LLM_TYPE_1B; break;
  494. case 26: type = LLM_TYPE_3B; break;
  495. case 28: type = LLM_TYPE_3B; break; // Llama 3.2 3B
  496. // granite uses a vocab with len 49152
  497. case 32: type = n_vocab == 49152 ? LLM_TYPE_3B : (n_vocab < 40000 ? LLM_TYPE_7B : LLM_TYPE_8B); break;
  498. case 36: type = LLM_TYPE_8B; break; // granite
  499. case 40: type = LLM_TYPE_13B; break;
  500. case 48: type = LLM_TYPE_34B; break;
  501. case 60: type = LLM_TYPE_30B; break;
  502. case 80: type = hparams.n_head() == hparams.n_head_kv() ? LLM_TYPE_65B : LLM_TYPE_70B; break;
  503. default: type = LLM_TYPE_UNKNOWN;
  504. }
  505. }
  506. } break;
  507. case LLM_ARCH_LLAMA4:
  508. {
  509. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  510. ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
  511. ml.get_key(LLM_KV_INTERLEAVE_MOE_LAYER_STEP, hparams.n_moe_layer_step);
  512. hparams.swa_type = LLAMA_SWA_TYPE_CHUNKED;
  513. hparams.n_swa = 8192; // should this be a gguf kv? currently it's the same for Scout and Maverick
  514. hparams.set_swa_pattern(4); // pattern: 3 chunked - 1 full
  515. switch (hparams.n_expert) {
  516. case 16: type = LLM_TYPE_17B_16E; break;
  517. case 128: type = LLM_TYPE_17B_128E; break;
  518. default: type = LLM_TYPE_UNKNOWN;
  519. }
  520. if (type == LLM_TYPE_17B_128E) {
  521. hparams.use_kq_norm = false;
  522. }
  523. } break;
  524. case LLM_ARCH_ARCEE:
  525. {
  526. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  527. // Arcee uses the same structure as Llama
  528. switch (hparams.n_layer) {
  529. case 36: type = LLM_TYPE_4B; break;
  530. default: type = LLM_TYPE_UNKNOWN;
  531. }
  532. } break;
  533. case LLM_ARCH_DECI:
  534. {
  535. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  536. switch (hparams.n_layer) {
  537. case 32: type = LLM_TYPE_7B; break;
  538. case 80: type = LLM_TYPE_70B; break;
  539. case 162: type = LLM_TYPE_405B; break;
  540. default: type = LLM_TYPE_UNKNOWN;
  541. }
  542. } break;
  543. case LLM_ARCH_MINICPM:
  544. {
  545. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  546. ml.get_key(LLM_KV_EMBEDDING_SCALE, hparams.f_embedding_scale);
  547. ml.get_key(LLM_KV_RESIDUAL_SCALE, hparams.f_residual_scale);
  548. ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
  549. switch (hparams.n_layer) {
  550. case 52: type = LLM_TYPE_1B; break;
  551. case 40: type = LLM_TYPE_2B; break;
  552. default: type = LLM_TYPE_UNKNOWN;
  553. }
  554. } break;
  555. case LLM_ARCH_MINICPM3:
  556. {
  557. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  558. ml.get_key(LLM_KV_ATTENTION_Q_LORA_RANK, hparams.n_lora_q);
  559. ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK, hparams.n_lora_kv);
  560. switch (hparams.n_layer) {
  561. case 62: type = LLM_TYPE_4B; break;
  562. default: type = LLM_TYPE_UNKNOWN;
  563. }
  564. } break;
  565. case LLM_ARCH_GROK:
  566. {
  567. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  568. switch (hparams.n_layer) {
  569. case 64: type = LLM_TYPE_314B; break;
  570. default: type = LLM_TYPE_UNKNOWN;
  571. }
  572. } break;
  573. case LLM_ARCH_FALCON:
  574. {
  575. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  576. switch (hparams.n_layer) {
  577. case 32: type = LLM_TYPE_7B; break;
  578. case 60: type = LLM_TYPE_40B; break;
  579. default: type = LLM_TYPE_UNKNOWN;
  580. }
  581. } break;
  582. case LLM_ARCH_BAICHUAN:
  583. {
  584. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  585. switch (hparams.n_layer) {
  586. case 32: type = LLM_TYPE_7B; break;
  587. case 40: type = LLM_TYPE_13B; break;
  588. default: type = LLM_TYPE_UNKNOWN;
  589. }
  590. if (type == LLM_TYPE_13B) {
  591. // TODO: become GGUF KV parameter
  592. hparams.f_max_alibi_bias = 8.0f;
  593. }
  594. } break;
  595. case LLM_ARCH_STARCODER:
  596. {
  597. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  598. switch (hparams.n_layer) {
  599. case 24: type = LLM_TYPE_1B; break;
  600. case 36: type = LLM_TYPE_3B; break;
  601. case 42: type = LLM_TYPE_7B; break;
  602. case 40: type = LLM_TYPE_15B; break;
  603. default: type = LLM_TYPE_UNKNOWN;
  604. }
  605. } break;
  606. case LLM_ARCH_REFACT:
  607. {
  608. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  609. switch (hparams.n_layer) {
  610. case 32: type = LLM_TYPE_1B; break;
  611. default: type = LLM_TYPE_UNKNOWN;
  612. }
  613. // TODO: become GGUF KV parameter
  614. hparams.f_max_alibi_bias = 8.0f;
  615. } break;
  616. case LLM_ARCH_BERT:
  617. {
  618. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  619. ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
  620. ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
  621. switch (hparams.n_layer) {
  622. case 3:
  623. type = LLM_TYPE_17M; break; // bge-micro
  624. case 6:
  625. type = LLM_TYPE_22M; break; // MiniLM-L6
  626. case 12:
  627. switch (hparams.n_embd) {
  628. case 384: type = LLM_TYPE_33M; break; // MiniLM-L12, bge-small
  629. case 768: type = LLM_TYPE_109M; break; // bge-base
  630. default: type = LLM_TYPE_UNKNOWN;
  631. } break;
  632. case 24:
  633. type = LLM_TYPE_335M; break; // bge-large
  634. default: type = LLM_TYPE_UNKNOWN;
  635. }
  636. } break;
  637. case LLM_ARCH_JINA_BERT_V2:
  638. {
  639. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  640. ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
  641. ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
  642. hparams.f_max_alibi_bias = 8.0f;
  643. switch (hparams.n_layer) {
  644. case 4: type = LLM_TYPE_33M; break; // jina-embeddings-small
  645. case 12: type = LLM_TYPE_137M; break; // jina-embeddings-base
  646. default: type = LLM_TYPE_UNKNOWN;
  647. }
  648. } break;
  649. case LLM_ARCH_NOMIC_BERT:
  650. case LLM_ARCH_NOMIC_BERT_MOE:
  651. {
  652. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  653. ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
  654. ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
  655. ml.get_key(LLM_KV_MOE_EVERY_N_LAYERS, hparams.moe_every_n_layers, 0);
  656. if (hparams.n_layer == 12 && hparams.n_embd == 768) {
  657. if (arch == LLM_ARCH_NOMIC_BERT) {
  658. type = LLM_TYPE_137M;
  659. } else if (arch == LLM_ARCH_NOMIC_BERT_MOE && hparams.moe_every_n_layers == 2) {
  660. type = LLM_TYPE_475M;
  661. }
  662. }
  663. } break;
  664. case LLM_ARCH_NEO_BERT:
  665. {
  666. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  667. ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
  668. ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
  669. if (hparams.n_layer == 28) {
  670. type = LLM_TYPE_250M;
  671. }
  672. } break;
  673. case LLM_ARCH_BLOOM:
  674. {
  675. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  676. switch (hparams.n_layer) {
  677. case 24: type = LLM_TYPE_1B; break;
  678. case 30:
  679. switch (hparams.n_embd) {
  680. case 2560: type = LLM_TYPE_3B; break;
  681. case 4096: type = LLM_TYPE_7B; break;
  682. default: type = LLM_TYPE_UNKNOWN;
  683. } break;
  684. default: type = LLM_TYPE_UNKNOWN;
  685. }
  686. // TODO: become GGUF KV parameter
  687. hparams.f_max_alibi_bias = 8.0f;
  688. } break;
  689. case LLM_ARCH_MPT:
  690. {
  691. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  692. ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV, hparams.f_clamp_kqv, false);
  693. ml.get_key(LLM_KV_ATTENTION_MAX_ALIBI_BIAS, hparams.f_max_alibi_bias);
  694. switch (hparams.n_layer) {
  695. case 32: type = LLM_TYPE_7B; break;
  696. case 48: type = LLM_TYPE_30B; break;
  697. default: type = LLM_TYPE_UNKNOWN;
  698. }
  699. } break;
  700. case LLM_ARCH_STABLELM:
  701. {
  702. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  703. switch (hparams.n_layer) {
  704. case 24: type = LLM_TYPE_1B; break;
  705. case 32: type = LLM_TYPE_3B; break;
  706. case 40: type = LLM_TYPE_12B; break;
  707. default: type = LLM_TYPE_UNKNOWN;
  708. }
  709. } break;
  710. case LLM_ARCH_QWEN:
  711. {
  712. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  713. switch (hparams.n_layer) {
  714. case 32: type = LLM_TYPE_7B; break;
  715. case 40: type = LLM_TYPE_13B; break;
  716. default: type = LLM_TYPE_UNKNOWN;
  717. }
  718. } break;
  719. case LLM_ARCH_QWEN2VL:
  720. {
  721. ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, true);
  722. }
  723. // fall through
  724. case LLM_ARCH_QWEN2:
  725. {
  726. ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
  727. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  728. switch (hparams.n_layer) {
  729. case 24: type = hparams.n_embd == 1024 ? LLM_TYPE_0_5B : LLM_TYPE_1B; break;
  730. case 28: type = hparams.n_embd == 1536 ? LLM_TYPE_1_5B : LLM_TYPE_7B; break;
  731. case 32: type = LLM_TYPE_7B; break;
  732. case 36: type = LLM_TYPE_3B; break;
  733. case 40: type = hparams.n_head() == 20 ? LLM_TYPE_4B : LLM_TYPE_13B; break;
  734. case 48: type = LLM_TYPE_14B; break;
  735. case 64: type = LLM_TYPE_32B; break;
  736. case 80: type = LLM_TYPE_70B; break;
  737. default: type = LLM_TYPE_UNKNOWN;
  738. }
  739. } break;
  740. case LLM_ARCH_QWEN2MOE:
  741. {
  742. ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
  743. ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false);
  744. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  745. switch (hparams.n_layer) {
  746. case 24: type = LLM_TYPE_A2_7B; break;
  747. case 28: type = LLM_TYPE_57B_A14B; break;
  748. default: type = LLM_TYPE_UNKNOWN;
  749. }
  750. } break;
  751. case LLM_ARCH_QWEN3:
  752. {
  753. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  754. switch (hparams.n_layer) {
  755. case 28: type = hparams.n_embd == 1024 ? LLM_TYPE_0_6B : LLM_TYPE_1_7B; break;
  756. case 36: type = hparams.n_embd == 2560 ? LLM_TYPE_4B : LLM_TYPE_8B; break;
  757. case 40: type = LLM_TYPE_14B; break;
  758. case 64: type = LLM_TYPE_32B; break;
  759. default: type = LLM_TYPE_UNKNOWN;
  760. }
  761. } break;
  762. case LLM_ARCH_QWEN3MOE:
  763. {
  764. ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
  765. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  766. switch (hparams.n_layer) {
  767. case 48: type = LLM_TYPE_30B_A3B; break;
  768. case 94: type = LLM_TYPE_235B_A22B; break;
  769. default: type = LLM_TYPE_UNKNOWN;
  770. }
  771. } break;
  772. case LLM_ARCH_PHI2:
  773. {
  774. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  775. switch (hparams.n_layer) {
  776. case 24: type = LLM_TYPE_1B; break;
  777. case 32: type = LLM_TYPE_3B; break;
  778. default: type = LLM_TYPE_UNKNOWN;
  779. }
  780. } break;
  781. case LLM_ARCH_PHI3:
  782. {
  783. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  784. switch (hparams.n_layer) {
  785. case 24: type = LLM_TYPE_1B; break;
  786. case 32: type = LLM_TYPE_3B; break;
  787. case 40: type = LLM_TYPE_14B; break;
  788. default: type = LLM_TYPE_UNKNOWN;
  789. }
  790. const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
  791. if (found_swa && hparams.n_swa > 0) {
  792. LLAMA_LOG_WARN("%s: Phi SWA is currently disabled - results might be suboptimal for some models (see %s)\n",
  793. __func__, "https://github.com/ggml-org/llama.cpp/pull/13676");
  794. // TODO: fix conversion scripts to correctly populate `n_swa` and `n_swa_pattern`
  795. hparams.swa_type = LLAMA_SWA_TYPE_NONE;
  796. hparams.n_swa = 0;
  797. hparams.set_swa_pattern(1);
  798. }
  799. } break;
  800. case LLM_ARCH_PHIMOE:
  801. {
  802. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  803. switch (hparams.n_layer) {
  804. case 32: type = LLM_TYPE_16x3_8B; break;
  805. default: type = LLM_TYPE_UNKNOWN;
  806. }
  807. } break;
  808. case LLM_ARCH_PLAMO:
  809. {
  810. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  811. switch (hparams.n_layer) {
  812. case 40: type = LLM_TYPE_13B; break;
  813. default: type = LLM_TYPE_UNKNOWN;
  814. }
  815. } break;
  816. case LLM_ARCH_GPT2:
  817. {
  818. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  819. switch (hparams.n_layer) {
  820. case 12: type = LLM_TYPE_SMALL; break;
  821. case 24: type = LLM_TYPE_MEDIUM; break;
  822. case 36: type = LLM_TYPE_LARGE; break;
  823. case 48: type = LLM_TYPE_XL; break;
  824. default: type = LLM_TYPE_UNKNOWN;
  825. }
  826. } break;
  827. case LLM_ARCH_CODESHELL:
  828. {
  829. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  830. switch (hparams.n_layer) {
  831. case 42: type = LLM_TYPE_7B; break;
  832. default: type = LLM_TYPE_UNKNOWN;
  833. }
  834. } break;
  835. case LLM_ARCH_ORION:
  836. {
  837. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  838. switch (hparams.n_layer) {
  839. case 40: type = LLM_TYPE_14B; break;
  840. default: type = LLM_TYPE_UNKNOWN;
  841. }
  842. } break;
  843. case LLM_ARCH_INTERNLM2:
  844. {
  845. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  846. switch (hparams.n_layer) {
  847. case 32: type = LLM_TYPE_7B; break;
  848. case 48: type = LLM_TYPE_20B; break;
  849. default: type = LLM_TYPE_UNKNOWN;
  850. }
  851. } break;
  852. case LLM_ARCH_GEMMA:
  853. {
  854. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  855. switch (hparams.n_layer) {
  856. case 18: type = LLM_TYPE_2B; break;
  857. case 28: type = LLM_TYPE_7B; break;
  858. default: type = LLM_TYPE_UNKNOWN;
  859. }
  860. } break;
  861. case LLM_ARCH_GEMMA2:
  862. {
  863. hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
  864. hparams.n_swa = 4096; // default value of gemma 2
  865. hparams.set_swa_pattern(2);
  866. hparams.attn_soft_cap = true;
  867. ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
  868. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  869. ml.get_key(LLM_KV_ATTN_LOGIT_SOFTCAPPING, hparams.f_attn_logit_softcapping, false);
  870. ml.get_key(LLM_KV_FINAL_LOGIT_SOFTCAPPING, hparams.f_final_logit_softcapping, false);
  871. switch (hparams.n_layer) {
  872. case 26: type = LLM_TYPE_2B; break;
  873. case 42: type = LLM_TYPE_9B; break;
  874. case 46: type = LLM_TYPE_27B; break;
  875. default: type = LLM_TYPE_UNKNOWN;
  876. }
  877. // ref: https://github.com/google/gemma_pytorch/blob/014acb7ac4563a5f77c76d7ff98f31b568c16508/gemma/config.py#L173
  878. hparams.f_attention_scale = type == LLM_TYPE_27B
  879. ? 1.0f / std::sqrt(float(hparams.n_embd / hparams.n_head(0)))
  880. : 1.0f / std::sqrt(float(hparams.n_embd_head_k));
  881. } break;
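// Worked example (editor's note, hypothetical numbers): with n_embd = 4608 and n_head(0) = 32,
// the 27B branch above gives f_attention_scale = 1/sqrt(4608/32) = 1/sqrt(144) = 1/12 ≈ 0.083,
// whereas the non-27B branch uses the usual 1/sqrt(n_embd_head_k).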
  882. case LLM_ARCH_GEMMA3:
  883. {
  884. hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
  885. hparams.set_swa_pattern(6);
  886. hparams.rope_freq_base_train_swa = 10000.0f;
  887. hparams.rope_freq_scale_train_swa = 1.0f;
  888. ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
  889. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  890. switch (hparams.n_layer) {
  891. case 26: type = LLM_TYPE_1B; break;
  892. case 34: type = LLM_TYPE_4B; break;
  893. case 48: type = LLM_TYPE_12B; break;
  894. case 62: type = LLM_TYPE_27B; break;
  895. default: type = LLM_TYPE_UNKNOWN;
  896. }
  897. // ref: https://github.com/google/gemma_pytorch/blob/014acb7ac4563a5f77c76d7ff98f31b568c16508/gemma/config.py#L289
  898. hparams.f_attention_scale = type == LLM_TYPE_27B
  899. ? 1.0f / std::sqrt(float(hparams.n_embd / hparams.n_head(0)))
  900. : 1.0f / std::sqrt(float(hparams.n_embd_head_k));
  901. } break;
  902. case LLM_ARCH_STARCODER2:
  903. {
  904. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  905. switch (hparams.n_layer) {
  906. case 30: type = LLM_TYPE_3B; break;
  907. case 32: type = LLM_TYPE_7B; break;
  908. case 40: type = LLM_TYPE_15B; break;
  909. case 52: type = LLM_TYPE_20B; break; // granite
  910. case 88: type = LLM_TYPE_34B; break; // granite
  911. default: type = LLM_TYPE_UNKNOWN;
  912. }
  913. } break;
  914. case LLM_ARCH_MAMBA:
  915. {
  916. ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv);
  917. ml.get_key(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner);
  918. ml.get_key(LLM_KV_SSM_STATE_SIZE, hparams.ssm_d_state);
  919. ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
  920. ml.get_key(LLM_KV_SSM_DT_B_C_RMS, hparams.ssm_dt_b_c_rms, false);
  921. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  922. switch (hparams.n_layer) {
  923. case 24:
  924. switch (hparams.n_embd) {
  925. case 768: type = LLM_TYPE_SMALL; break;
  926. default: type = LLM_TYPE_UNKNOWN;
  927. } break;
  928. case 48:
  929. switch (hparams.n_embd) {
  930. case 1024: type = LLM_TYPE_MEDIUM; break;
  931. case 1536: type = LLM_TYPE_LARGE; break;
  932. case 2048: type = LLM_TYPE_XL; break;
  933. default: type = LLM_TYPE_UNKNOWN;
  934. } break;
  935. case 64:
  936. switch (hparams.n_embd) {
  937. case 2560: type = LLM_TYPE_3B; break;
  938. default: type = LLM_TYPE_UNKNOWN;
  939. } break;
  940. default: type = LLM_TYPE_UNKNOWN;
  941. }
  942. } break;
  943. case LLM_ARCH_XVERSE:
  944. {
  945. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  946. switch (hparams.n_layer) {
  947. case 32: type = LLM_TYPE_7B; break;
  948. case 40: type = LLM_TYPE_13B; break;
  949. case 80: type = LLM_TYPE_65B; break;
  950. default: type = LLM_TYPE_UNKNOWN;
  951. }
  952. } break;
  953. case LLM_ARCH_COMMAND_R:
  954. {
  955. ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
  956. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  957. switch (hparams.n_layer) {
  958. case 40: type = LLM_TYPE_35B; break;
  959. default: type = LLM_TYPE_UNKNOWN;
  960. }
  961. } break;
  962. case LLM_ARCH_COHERE2:
  963. {
  964. hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
  965. hparams.set_swa_pattern(4);
  966. ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
  967. ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
  968. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  969. switch (hparams.n_layer) {
  970. case 32: type = LLM_TYPE_8B; break;
  971. default: type = LLM_TYPE_UNKNOWN;
  972. }
  973. } break;
  974. case LLM_ARCH_DBRX:
  975. {
  976. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  977. ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV, hparams.f_clamp_kqv);
  978. switch (hparams.n_layer) {
  979. case 40: type = LLM_TYPE_16x12B; break;
  980. default: type = LLM_TYPE_UNKNOWN;
  981. }
  982. } break;
  983. case LLM_ARCH_OLMO:
  984. {
  985. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  986. ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV, hparams.f_clamp_kqv, false);
  987. switch (hparams.n_layer) {
  988. case 22: type = LLM_TYPE_1B; break;
  989. case 32: type = LLM_TYPE_7B; break;
  990. case 80: type = LLM_TYPE_70B; break;
  991. default: type = LLM_TYPE_UNKNOWN;
  992. }
  993. } break;
  994. case LLM_ARCH_OLMO2:
  995. {
  996. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  997. switch (hparams.n_layer) {
  998. case 16: type = LLM_TYPE_1B; break;
  999. case 32: type = LLM_TYPE_7B; break;
  1000. case 40: type = LLM_TYPE_13B; break;
  1001. case 64: type = LLM_TYPE_32B; break;
  1002. default: type = LLM_TYPE_UNKNOWN;
  1003. }
  1004. } break;
  1005. case LLM_ARCH_OLMOE:
  1006. {
  1007. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1008. switch (hparams.n_layer) {
  1009. case 16: type = LLM_TYPE_A1_7B; break;
  1010. default: type = LLM_TYPE_UNKNOWN;
  1011. }
  1012. } break;
  1013. case LLM_ARCH_OPENELM:
  1014. {
  1015. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1016. switch (hparams.n_layer) {
  1017. case 16: type = LLM_TYPE_270M; break;
  1018. case 20: type = LLM_TYPE_450M; break;
  1019. case 28: type = LLM_TYPE_1B; break;
  1020. case 36: type = LLM_TYPE_3B; break;
  1021. default: type = LLM_TYPE_UNKNOWN;
  1022. }
  1023. } break;
  1024. case LLM_ARCH_GPTNEOX:
  1025. {
  1026. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  1027. ml.get_key(LLM_KV_USE_PARALLEL_RESIDUAL, hparams.use_par_res);
  1028. switch (hparams.n_layer) {
  1029. case 6:
  1030. switch (hparams.n_ff()) {
  1031. case 512: type = LLM_TYPE_14M; break;
  1032. case 2048: type = LLM_TYPE_70M; break;
  1033. default: type = LLM_TYPE_UNKNOWN;
  1034. } break;
  1035. case 12:
  1036. switch (hparams.n_ff()) {
  1037. case 3072: type = LLM_TYPE_160M; break;
  1038. default: type = LLM_TYPE_UNKNOWN;
  1039. } break;
  1040. case 16:
  1041. switch (hparams.n_ff()) {
  1042. case 8192: type = LLM_TYPE_1B; break;
  1043. default: type = LLM_TYPE_UNKNOWN;
  1044. } break;
  1045. case 24:
  1046. switch (hparams.n_ff()) {
  1047. case 4096: type = LLM_TYPE_410M; break;
  1048. case 8192: type = LLM_TYPE_1_4B; break;
  1049. default: type = LLM_TYPE_UNKNOWN;
  1050. } break;
  1051. case 32:
  1052. switch (hparams.n_ff()) {
  1053. case 10240: type = LLM_TYPE_2_8B; break;
  1054. case 16384: type = LLM_TYPE_6_9B; break;
  1055. default: type = LLM_TYPE_UNKNOWN;
  1056. } break;
  1057. case 36:
  1058. switch (hparams.n_ff()) {
  1059. case 20480: type = LLM_TYPE_12B; break;
  1060. default: type = LLM_TYPE_UNKNOWN;
  1061. } break;
  1062. case 44:
  1063. switch (hparams.n_ff()) {
  1064. case 24576: type = LLM_TYPE_20B; break;
  1065. default: type = LLM_TYPE_UNKNOWN;
  1066. } break;
  1067. default: type = LLM_TYPE_UNKNOWN;
  1068. }
  1069. } break;
  1070. case LLM_ARCH_ARCTIC:
  1071. {
  1072. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1073. if (hparams.n_expert == 128) {
  1074. switch (hparams.n_layer) {
  1075. case 35: type = LLM_TYPE_10B_128x3_66B; break;
  1076. default: type = LLM_TYPE_UNKNOWN;
  1077. }
  1078. } else {
  1079. type = LLM_TYPE_UNKNOWN;
  1080. }
  1081. } break;
  1082. case LLM_ARCH_DEEPSEEK:
  1083. {
  1084. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1085. ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
  1086. ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
  1087. ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
  1088. ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
  1089. switch (hparams.n_layer) {
  1090. case 28: type = LLM_TYPE_20B; break;
  1091. default: type = LLM_TYPE_UNKNOWN;
  1092. }
  1093. } break;
  1094. case LLM_ARCH_DEEPSEEK2:
  1095. {
  1096. bool is_lite = (hparams.n_layer == 27);
  1097. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1098. ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
  1099. if (!is_lite) {
  1100. ml.get_key(LLM_KV_ATTENTION_Q_LORA_RANK, hparams.n_lora_q);
  1101. }
  1102. ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK, hparams.n_lora_kv);
  1103. ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH_MLA, hparams.n_embd_head_k_mla, false);
  1104. ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH_MLA, hparams.n_embd_head_v_mla, false);
  1105. ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
  1106. ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
  1107. ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
  1108. ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
  1109. ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false);
  1110. if (hparams.expert_gating_func == LLAMA_EXPERT_GATING_FUNC_TYPE_NONE) {
  1111. // for compatibility with existing DeepSeek V2 and V2.5 GGUFs
  1112. // that have no expert_gating_func model parameter set
  1113. hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX;
  1114. }
  1115. ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul);
  1116. switch (hparams.n_layer) {
  1117. case 27: type = LLM_TYPE_16B; break;
  1118. case 60: type = LLM_TYPE_236B; break;
  1119. case 61: type = LLM_TYPE_671B; break;
  1120. default: type = LLM_TYPE_UNKNOWN;
  1121. }
  1122. } break;
  1123. case LLM_ARCH_PLM:
  1124. {
  1125. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1126. ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK, hparams.n_lora_kv);
  1127. switch (hparams.n_layer) {
  1128. case 32: type = LLM_TYPE_1_8B; break;
  1129. default: type = LLM_TYPE_UNKNOWN;
  1130. }
  1131. } break;
  1132. case LLM_ARCH_CHATGLM:
  1133. {
  1134. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1135. switch (hparams.n_layer) {
  1136. case 28: {
  1137. if (hparams.n_head(0) == 16) {
  1138. type = LLM_TYPE_1_5B;
  1139. } else {
  1140. type = LLM_TYPE_6B;
  1141. }
  1142. } break;
  1143. case 40: {
  1144. if (hparams.n_head(0) == 24) {
  1145. type = LLM_TYPE_4B;
  1146. } else {
  1147. type = LLM_TYPE_9B;
  1148. }
  1149. } break;
  1150. default: type = LLM_TYPE_UNKNOWN;
  1151. }
  1152. } break;
  1153. case LLM_ARCH_GLM4:
  1154. {
  1155. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1156. switch (hparams.n_layer) {
  1157. case 40: type = LLM_TYPE_9B; break;
  1158. case 61: type = LLM_TYPE_32B; break;
  1159. default: type = LLM_TYPE_UNKNOWN;
  1160. }
  1161. } break;
  1162. case LLM_ARCH_BITNET:
  1163. {
  1164. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1165. switch (hparams.n_layer) {
  1166. case 26: type = LLM_TYPE_3B; break;
  1167. default: type = LLM_TYPE_UNKNOWN;
  1168. }
  1169. } break;
  1170. case LLM_ARCH_T5:
  1171. {
  1172. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1173. ml.get_key(LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, hparams.n_rel_attn_bkts);
  1174. uint32_t dec_start_token_id;
  1175. if (ml.get_key(LLM_KV_DECODER_START_TOKEN_ID, dec_start_token_id, false)) {
  1176. hparams.dec_start_token_id = dec_start_token_id;
  1177. }
  1178. switch (hparams.n_layer) {
  1179. case 6: type = LLM_TYPE_60M; break; // t5-small
  1180. case 8: type = LLM_TYPE_80M; break; // flan-t5-small
  1181. case 12:
  1182. switch (hparams.n_ff()) {
  1183. case 3072: type = LLM_TYPE_220M; break; // t5-base
  1184. case 2048: type = LLM_TYPE_250M; break; // flan-t5-base
  1185. default: type = LLM_TYPE_UNKNOWN;
  1186. } break;
  1187. case 24:
  1188. switch (hparams.n_ff()) {
  1189. case 4096: type = LLM_TYPE_770M; break; // t5-large
  1190. case 2816: type = LLM_TYPE_780M; break; // flan-t5-large
  1191. case 16384: type = LLM_TYPE_3B; break; // t5-3b
  1192. case 5120: type = LLM_TYPE_3B; break; // flan-t5-xl
  1193. case 65536: type = LLM_TYPE_11B; break; // t5-11b
  1194. case 10240: type = LLM_TYPE_11B; break; // flan-t5-xxl
  1195. default: type = LLM_TYPE_UNKNOWN;
  1196. } break;
  1197. default: type = LLM_TYPE_UNKNOWN;
  1198. }
  1199. } break;
  1200. case LLM_ARCH_T5ENCODER:
  1201. {
  1202. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1203. ml.get_key(LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, hparams.n_rel_attn_bkts);
  1204. type = LLM_TYPE_UNKNOWN;
  1205. } break;
  1206. case LLM_ARCH_JAIS:
  1207. {
  1208. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  1209. ml.get_key(LLM_KV_ATTENTION_MAX_ALIBI_BIAS, hparams.f_max_alibi_bias);
  1210. switch (hparams.n_layer) {
  1211. case 24: type = LLM_TYPE_1_3B; break;
  1212. case 40: type = LLM_TYPE_13B; break;
  1213. /* TODO: add variants */
  1214. default: type = LLM_TYPE_UNKNOWN;
  1215. }
  1216. } break;
  1217. case LLM_ARCH_NEMOTRON:
  1218. {
  1219. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  1220. switch (hparams.n_layer) {
  1221. case 32: type = LLM_TYPE_4B; break;
  1222. default: type = LLM_TYPE_UNKNOWN;
  1223. }
  1224. } break;
  1225. case LLM_ARCH_EXAONE:
  1226. {
  1227. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1228. switch (hparams.n_layer) {
  1229. case 32: type = LLM_TYPE_8B; break;
  1230. default: type = LLM_TYPE_UNKNOWN;
  1231. }
  1232. } break;
  1233. case LLM_ARCH_RWKV6:
  1234. case LLM_ARCH_RWKV6QWEN2:
  1235. {
  1236. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps, false);
  1237. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps, false);
  1238. ml.get_key(LLM_KV_WKV_HEAD_SIZE, hparams.wkv_head_size);
  1239. ml.get_key(LLM_KV_TIME_MIX_EXTRA_DIM, hparams.time_mix_extra_dim);
  1240. ml.get_key(LLM_KV_TIME_DECAY_EXTRA_DIM, hparams.time_decay_extra_dim);
  1241. ml.get_key(LLM_KV_RESCALE_EVERY_N_LAYERS, hparams.rescale_every_n_layers, false);
  1242. ml.get_key(LLM_KV_TOKEN_SHIFT_COUNT, hparams.token_shift_count, false);
  1243. switch (hparams.n_layer) {
  1244. case 24: type = LLM_TYPE_1_6B; break;
  1245. case 32:
  1246. switch (hparams.n_embd) {
  1247. case 2560: type = LLM_TYPE_3B; break;
  1248. case 4096: type = LLM_TYPE_7B; break;
  1249. default: type = LLM_TYPE_UNKNOWN;
  1250. } break;
  1251. case 61: type = LLM_TYPE_14B; break;
  1252. case 64: type = LLM_TYPE_32B; break;
  1253. default: type = LLM_TYPE_UNKNOWN;
  1254. }
  1255. } break;
  1256. case LLM_ARCH_RWKV7:
  1257. case LLM_ARCH_ARWKV7:
  1258. {
  1259. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps, false);
  1260. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps, false);
  1261. ml.get_key(LLM_KV_WKV_HEAD_SIZE, hparams.wkv_head_size);
  1262. ml.get_key(LLM_KV_ATTENTION_DECAY_LORA_RANK, hparams.n_lora_decay);
  1263. ml.get_key(LLM_KV_ATTENTION_ICLR_LORA_RANK, hparams.n_lora_iclr);
  1264. ml.get_key(LLM_KV_ATTENTION_VALUE_RESIDUAL_MIX_LORA_RANK, hparams.n_lora_value_res_mix);
  1265. ml.get_key(LLM_KV_ATTENTION_GATE_LORA_RANK, hparams.n_lora_gate, false);
  1266. ml.get_key(LLM_KV_TOKEN_SHIFT_COUNT, hparams.token_shift_count, false);
  1267. switch (hparams.n_layer) {
  1268. case 12: type = LLM_TYPE_190M; break;
  1269. case 24:
  1270. switch (hparams.n_embd) {
  1271. case 1024: type = LLM_TYPE_450M; break;
  1272. case 2048: type = LLM_TYPE_1_5B; break;
  1273. default: type = LLM_TYPE_UNKNOWN;
  1274. } break;
  1275. case 28:
  1276. switch (hparams.n_embd) {
  1277. case 1536: type = LLM_TYPE_1_5B; break;
  1278. case 3584: type = LLM_TYPE_7B; break;
  1279. default: type = LLM_TYPE_UNKNOWN;
  1280. } break;
  1281. case 32: type = LLM_TYPE_2_9B; break; // RWKV-7-World
  1282. default: type = LLM_TYPE_UNKNOWN;
  1283. }
  1284. } break;
  1285. case LLM_ARCH_GRANITE:
  1286. case LLM_ARCH_GRANITE_MOE:
  1287. {
  1288. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1289. ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
  1290. ml.get_key(LLM_KV_RESIDUAL_SCALE, hparams.f_residual_scale);
  1291. ml.get_key(LLM_KV_EMBEDDING_SCALE, hparams.f_embedding_scale);
  1292. ml.get_key(LLM_KV_ATTENTION_SCALE, hparams.f_attention_scale);
  1293. switch (hparams.n_layer) {
  1294. case 32: type = LLM_TYPE_3B; break;
  1295. case 40: type = LLM_TYPE_3B; break;
  1296. // Add additional layer/vocab/etc checks here for other model sizes
  1297. default: type = LLM_TYPE_UNKNOWN;
  1298. }
  1299. // For Granite MoE Shared
  1300. ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, /* required */ false);
  1301. } break;
  1302. case LLM_ARCH_CHAMELEON:
  1303. {
  1304. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1305. hparams.f_norm_eps = 1e-5; // eps for qk-norm, torch default
  1306. ml.get_key(LLM_KV_SWIN_NORM, hparams.swin_norm);
  1307. switch (hparams.n_layer) {
  1308. case 32: type = LLM_TYPE_7B; break;
  1309. case 48: type = LLM_TYPE_34B; break;
  1310. default: type = LLM_TYPE_UNKNOWN;
  1311. }
  1312. } break;
  1313. case LLM_ARCH_WAVTOKENIZER_DEC:
  1314. {
  1315. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  1316. ml.get_key(LLM_KV_ATTENTION_GROUPNORM_EPS, hparams.f_norm_group_eps);
  1317. ml.get_key(LLM_KV_ATTENTION_GROUPNORM_GROUPS, hparams.n_norm_groups);
  1318. ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
  1319. } break;
  1320. case LLM_ARCH_BAILINGMOE:
  1321. {
  1322. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1323. ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
  1324. ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
  1325. ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
  1326. ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
  1327. ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
  1328. switch (hparams.n_layer) {
  1329. case 28: type = LLM_TYPE_16B; break;
  1330. case 88: type = LLM_TYPE_290B; break;
  1331. default: type = LLM_TYPE_UNKNOWN;
  1332. }
  1333. } break;
  1334. case LLM_ARCH_DOTS1:
  1335. {
  1336. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1337. ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
  1338. ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
  1339. ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
  1340. ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
  1341. ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
  1342. ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false);
  1343. switch (hparams.n_layer) {
  1344. case 62: type = LLM_TYPE_142B; break;
  1345. default: type = LLM_TYPE_UNKNOWN;
  1346. }
  1347. } break;
  1348. default: throw std::runtime_error("unsupported model architecture");
  1349. }
  1350. pimpl->n_bytes = ml.n_bytes;
  1351. pimpl->desc_str = arch_name() + " " + type_name() + " " + ml.ftype_name();
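// note: a positive max ALiBi bias indicates the model was trained with ALiBi position bias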
  1352. if (hparams.f_max_alibi_bias > 0.0f) {
  1353. hparams.use_alibi = true;
  1354. }
  1355. hparams.rope_type = llama_model_rope_type(this);
  1356. }
  1357. void llama_model::load_vocab(llama_model_loader & ml) {
  1358. const auto kv = LLM_KV(arch);
  1359. vocab.load(ml, kv);
  1360. }
  1361. bool llama_model::load_tensors(llama_model_loader & ml) {
  1362. const auto & split_mode = params.split_mode;
  1363. const auto & n_gpu_layers = params.n_gpu_layers;
  1364. const auto & use_mlock = params.use_mlock;
  1365. const auto & tensor_split = params.tensor_split;
  1366. const int n_layer = hparams.n_layer;
  1367. const bool use_mmap_buffer = true;
  1368. LLAMA_LOG_INFO("%s: loading model tensors, this can take a while... (mmap = %s)\n", __func__, ml.use_mmap ? "true" : "false");
  1369. // build a list of buffer types for the CPU and GPU devices
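// each list is ordered by preference: the front entry is the preferred buffer type for that device,
// later entries are fallbacks (ending with the CPU buffer types)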
  1370. pimpl->cpu_buft_list = make_cpu_buft_list(devices);
  1371. for (auto * dev : devices) {
  1372. buft_list_t buft_list = make_gpu_buft_list(dev, split_mode, tensor_split);
  1373. // add CPU buffer types as a fallback
  1374. buft_list.insert(buft_list.end(), pimpl->cpu_buft_list.begin(), pimpl->cpu_buft_list.end());
  1375. pimpl->gpu_buft_list.emplace(dev, std::move(buft_list));
  1376. }
  1377. // calculate the split points
  1378. bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + n_devices(), [](float x) { return x == 0.0f; });
  1379. std::vector<float> splits(n_devices());
  1380. if (all_zero) {
  1381. // default split, by free memory
  1382. for (size_t i = 0; i < n_devices(); ++i) {
  1383. ggml_backend_dev_t dev = devices[i];
  1384. size_t total;
  1385. size_t free;
  1386. ggml_backend_dev_memory(dev, &free, &total);
  1387. splits[i] = free;
  1388. }
  1389. } else {
  1390. std::copy(tensor_split, tensor_split + n_devices(), splits.begin());
  1391. }
  1392. // sum and normalize the splits to get the split points
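// e.g. free memory of 8, 8 and 16 GiB gives cumulative sums 8, 16, 32, which normalize to
// split points 0.25, 0.50 and 1.00; a layer's relative position is later compared against these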
  1393. float split_sum = 0.0f;
  1394. for (size_t i = 0; i < n_devices(); ++i) {
  1395. split_sum += splits[i];
  1396. splits[i] = split_sum;
  1397. }
  1398. for (size_t i = 0; i < n_devices(); ++i) {
  1399. splits[i] /= split_sum;
  1400. }
  1401. ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
  1402. if (cpu_dev == nullptr) {
  1403. throw std::runtime_error(format("%s: no CPU backend found", __func__));
  1404. }
  1405. const int i_gpu_start = std::max((int) hparams.n_layer - n_gpu_layers, (int) 0);
  1406. const int act_gpu_layers = devices.empty() ? 0 : std::min(n_gpu_layers, (int)n_layer + 1);
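// layers below i_gpu_start stay on the CPU; the remaining layers are spread across the GPU devices
// by locating their relative position (il - i_gpu_start)/act_gpu_layers among the split points above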
  1407. auto get_layer_buft_list = [&](int il) -> llama_model::impl::layer_dev {
  1408. const bool is_swa = il < (int) hparams.n_layer && hparams.is_swa(il);
  1409. if (il < i_gpu_start || (il - i_gpu_start) >= act_gpu_layers) {
  1410. LLAMA_LOG_DEBUG("load_tensors: layer %3d assigned to device %s, is_swa = %d\n", il, ggml_backend_dev_name(cpu_dev), is_swa);
  1411. return {cpu_dev, &pimpl->cpu_buft_list};
  1412. }
  1413. const int layer_gpu = std::upper_bound(splits.begin(), splits.begin() + n_devices(), float(il - i_gpu_start)/act_gpu_layers) - splits.begin();
  1414. auto * dev = devices.at(layer_gpu);
  1415. LLAMA_LOG_DEBUG("load_tensors: layer %3d assigned to device %s, is_swa = %d\n", il, ggml_backend_dev_name(dev), is_swa);
  1416. return {dev, &pimpl->gpu_buft_list.at(dev)};
  1417. };
  1418. // assign the input layer
  1419. // there is very little benefit to offloading the input layer, so always keep it on the CPU
  1420. pimpl->dev_input = { cpu_dev, &pimpl->cpu_buft_list };
  1421. // assign the repeating layers to the devices according to the splits
  1422. pimpl->dev_layer.resize(n_layer);
  1423. for (int il = 0; il < n_layer; ++il) {
  1424. pimpl->dev_layer[il] = get_layer_buft_list(il);
  1425. }
  1426. // assign the output layer
  1427. pimpl->dev_output = get_layer_buft_list(n_layer);
  1428. // one ggml context per buffer type
  1429. int max_n_tensors = ml.n_tensors;
  1430. max_n_tensors += 1; // duplicated output tensor
  1431. max_n_tensors += n_layer*2; // duplicated rope freq tensors
  1432. const size_t ctx_size = ggml_tensor_overhead()*max_n_tensors;
  1433. std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
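// contexts are created lazily, the first time a tensor is assigned to a given buffer type;
// with no_alloc set, a context only holds tensor metadata - the weight data itself is
// allocated later in backend buffers (or mapped directly from the file when using mmap)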
  1434. auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * {
  1435. auto it = ctx_map.find(buft);
  1436. if (it == ctx_map.end()) {
  1437. ggml_init_params params = {
  1438. /*.mem_size =*/ ctx_size,
  1439. /*.mem_buffer =*/ NULL,
  1440. /*.no_alloc =*/ true,
  1441. };
  1442. ggml_context * ctx = ggml_init(params);
  1443. if (!ctx) {
  1444. throw std::runtime_error(format("failed to create ggml context"));
  1445. }
  1446. ctx_map[buft] = ctx;
  1447. pimpl->ctxs.emplace_back(ctx);
  1448. return ctx;
  1449. }
  1450. return it->second;
  1451. };
  1452. const auto TENSOR_DUPLICATED = llama_model_loader::TENSOR_DUPLICATED;
  1453. const auto TENSOR_NOT_REQUIRED = llama_model_loader::TENSOR_NOT_REQUIRED;
  1454. // create tensors for the weights
  1455. {
  1456. // note: cast to int64_t since we will use these for the tensor dimensions
  1457. const int64_t n_head = hparams.n_head();
  1458. const int64_t n_head_kv = hparams.n_head_kv();
  1459. const int64_t n_embd = hparams.n_embd;
  1460. const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
  1461. const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();
  1462. const int64_t n_embd_head_k = hparams.n_embd_head_k;
  1463. const int64_t n_embd_head_v = hparams.n_embd_head_v;
  1464. const int64_t n_ff = hparams.n_ff();
  1465. const int64_t n_embd_gqa = n_embd_v_gqa;
  1466. const int64_t n_vocab = vocab.n_tokens();
  1467. const int64_t n_token_types = vocab.n_token_types();
  1468. const int64_t n_rot = hparams.n_rot;
  1469. const int64_t n_expert = hparams.n_expert;
  1470. const int64_t n_expert_used = hparams.n_expert_used;
  1471. const int64_t n_ctx_train = hparams.n_ctx_train;
  1472. if (n_expert > 0 && hparams.n_expert_used == 0) {
  1473. throw std::runtime_error("model has expert layers but no expert layers are used");
  1474. }
  1475. int n_moved_tensors = 0;
  1476. ggml_tensor * first_moved_tensor = nullptr;
  1477. ggml_backend_buffer_type_t first_moved_from_buft = nullptr;
  1478. ggml_backend_buffer_type_t first_moved_to_buft = nullptr;
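// create_tensor resolves a tensor name to a ggml tensor:
//  - a missing tensor is an error unless TENSOR_NOT_REQUIRED is set
//  - a duplicated token embedding is loaded as the output tensor instead
//  - the buffer type comes from the layer's preference list, unless a regex override from the loader matches the tensor name
//  - tensors that end up outside the preferred buffer type are counted so the move can be reported after loading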
  1479. auto create_tensor = [&](const LLM_TN_IMPL & tn, const std::initializer_list<int64_t> & ne, int flags) -> ggml_tensor * {
  1480. ggml_tensor * t_meta = ml.get_tensor_meta(tn.str().c_str());
  1481. if (!t_meta) {
  1482. if (flags & TENSOR_NOT_REQUIRED) {
  1483. return nullptr;
  1484. }
  1485. throw std::runtime_error(format("missing tensor '%s'", tn.str().c_str()));
  1486. }
1487. // some models use the token embedding tensor as the output tensor; since the two are used in different layers and with different ops,
1488. // the tensor is duplicated
1489. // to handle this, when a duplicated token embedding tensor is requested, we assume it is being loaded as the output tensor
  1490. llm_tensor tn_tensor = tn.tensor;
  1491. if (tn.tensor == LLM_TENSOR_TOKEN_EMBD && flags & TENSOR_DUPLICATED) {
  1492. tn_tensor = LLM_TENSOR_OUTPUT;
  1493. }
  1494. llm_tensor_info info;
  1495. try {
  1496. info = llm_tensor_info_for(tn_tensor);
  1497. } catch (const std::out_of_range & e) {
  1498. throw std::runtime_error(format("missing tensor info mapping for %s", tn.str().c_str()));
  1499. }
  1500. // skip unused tensors
  1501. if (info.op == GGML_OP_NONE) {
  1502. const size_t nbytes = ggml_nbytes(t_meta);
  1503. LLAMA_LOG_WARN("model has unused tensor %s (size = %zu bytes) -- ignoring\n", tn.str().c_str(), nbytes);
  1504. ml.size_data -= nbytes;
  1505. ml.n_created++;
  1506. return nullptr;
  1507. }
  1508. // tensors with "bias" suffix are always used with GGML_OP_ADD
  1509. ggml_op op;
  1510. bool bias = tn.suffix != nullptr && strcmp(tn.suffix, "bias") == 0;
  1511. if (bias) {
  1512. op = GGML_OP_ADD;
  1513. } else {
  1514. op = info.op;
  1515. }
  1516. // sanity checks
  1517. if (info.layer == LLM_TENSOR_LAYER_INPUT || info.layer == LLM_TENSOR_LAYER_OUTPUT) {
  1518. if (tn.bid != -1) {
  1519. GGML_ABORT("input/output layer tensor %s used with a layer number", tn.str().c_str());
  1520. }
  1521. } else {
  1522. if (tn.bid == -1) {
  1523. GGML_ABORT("repeating layer tensor %s used without a layer number", tn.str().c_str());
  1524. }
  1525. }
  1526. // select the buffer type for this tensor
  1527. buft_list_t * buft_list;
  1528. switch (info.layer) {
  1529. case LLM_TENSOR_LAYER_INPUT:
  1530. buft_list = pimpl->dev_input.buft_list;
  1531. break;
  1532. case LLM_TENSOR_LAYER_OUTPUT:
  1533. buft_list = pimpl->dev_output.buft_list;
  1534. break;
  1535. case LLM_TENSOR_LAYER_REPEATING:
  1536. buft_list = pimpl->dev_layer.at(tn.bid).buft_list;
  1537. break;
  1538. default:
  1539. GGML_ABORT("invalid layer %d for tensor %s", info.layer, tn.str().c_str());
  1540. }
  1541. ggml_backend_buffer_type_t buft = nullptr;
  1542. // check overrides
  1543. if (ml.tensor_buft_overrides) {
  1544. std::string tensor_name = tn.str();
  1545. for (const auto * overrides = ml.tensor_buft_overrides; overrides->pattern != nullptr; ++overrides) {
  1546. std::regex pattern(overrides->pattern);
  1547. if (std::regex_search(tensor_name, pattern)) {
  1548. buft = overrides->buft;
  1549. LLAMA_LOG_DEBUG("tensor %s (%zu MiB %s) buffer type overridden to %s\n",
  1550. tensor_name.c_str(),
  1551. ggml_nbytes(t_meta) / 1024 / 1024, ggml_type_name(t_meta->type),
  1552. ggml_backend_buft_name(buft));
  1553. break;
  1554. }
  1555. }
  1556. }
  1557. if (!buft) {
  1558. buft = select_weight_buft(hparams, t_meta, op, *buft_list);
  1559. if (!buft) {
  1560. throw std::runtime_error(format("failed to find a compatible buffer type for tensor %s", tn.str().c_str()));
  1561. }
  1562. }
  1563. // avoid using a host buffer when using mmap
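// (with mmap the weights are read from the mapped file, so a pinned host buffer would only add an extra copy of the data)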
  1564. auto * buft_dev = ggml_backend_buft_get_device(buft);
  1565. if (ml.use_mmap && buft_dev && buft == ggml_backend_dev_host_buffer_type(buft_dev)) {
  1566. auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
  1567. if (!cpu_dev) {
  1568. throw std::runtime_error("no CPU backend found");
  1569. }
  1570. buft = ggml_backend_dev_buffer_type(cpu_dev);
  1571. }
  1572. if (buft != buft_list->front().second) {
  1573. n_moved_tensors++;
  1574. if (!first_moved_tensor) {
  1575. first_moved_tensor = t_meta;
  1576. first_moved_from_buft = buft_list->front().second;
  1577. first_moved_to_buft = buft;
  1578. }
  1579. }
  1580. ggml_context * ctx = ctx_for_buft(buft);
  1581. // if duplicated, check if the original tensor was allocated in the same buffer type context and avoid creating a new one
  1582. if (flags & TENSOR_DUPLICATED) {
  1583. ggml_tensor * t = ggml_get_tensor(ctx, tn.str().c_str());
  1584. if (t) {
  1585. return t;
  1586. }
  1587. }
  1588. return ml.create_tensor(ctx, tn, ne, flags);
  1589. };
  1590. layers.resize(n_layer);
  1591. // TODO: move to a separate function
  1592. const auto tn = LLM_TN(arch);
  1593. switch (arch) {
  1594. case LLM_ARCH_LLAMA:
  1595. case LLM_ARCH_REFACT:
  1596. case LLM_ARCH_MINICPM:
  1597. case LLM_ARCH_GRANITE:
  1598. case LLM_ARCH_GRANITE_MOE:
  1599. {
  1600. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  1601. // output
  1602. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  1603. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  1604. // if output is NULL, init from the input tok embed
  1605. if (output == NULL) {
  1606. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  1607. }
  1608. for (int i = 0; i < n_layer; ++i) {
  1609. auto & layer = layers[i];
  1610. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  1611. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
  1612. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  1613. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
  1614. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
  1615. // optional bias tensors
  1616. layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  1617. layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
  1618. layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
  1619. layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  1620. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
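// note: the rope factor tensors are shared across layers, so for i > 0 they are requested with
// TENSOR_DUPLICATED and resolve to the tensor already created for layer 0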
  1621. if (hparams.rope_scaling_type_train == LLAMA_ROPE_SCALING_TYPE_LONGROPE) {
  1622. layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
  1623. layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
  1624. }
  1625. else {
  1626. layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
  1627. }
  1628. if (n_expert == 0) {
  1629. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  1630. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  1631. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  1632. // optional MLP bias
  1633. layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
  1634. layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  1635. layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
  1636. } else {
  1637. layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
  1638. layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, TENSOR_NOT_REQUIRED);
  1639. layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert}, 0);
  1640. layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
  1641. // For Granite MoE Shared
  1642. if (hparams.n_ff_shexp > 0) {
  1643. layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, 0);
  1644. layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, 0);
  1645. layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {hparams.n_ff_shexp, n_embd}, 0);
  1646. }
  1647. }
  1648. }
  1649. } break;
  1650. case LLM_ARCH_LLAMA4:
  1651. {
  1652. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  1653. // output
  1654. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  1655. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  1656. // if output is NULL, init from the input tok embed
  1657. if (output == NULL) {
  1658. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  1659. }
  1660. GGML_ASSERT(hparams.n_moe_layer_step > 0 && "Llama 4 requires n_moe_layer_step > 0");
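// every n_moe_layer_step-th layer is a MoE layer, the rest use a dense FFN
// (e.g. with a step of 2, layers 1, 3, 5, ... are the MoE layers)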
  1661. for (int i = 0; i < n_layer; ++i) {
  1662. bool is_moe_layer = (i + 1) % hparams.n_moe_layer_step == 0;
  1663. auto & layer = layers[i];
  1664. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  1665. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
  1666. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  1667. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
  1668. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
  1669. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  1670. layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
  1671. if (is_moe_layer) {
  1672. int n_ff_exp = hparams.n_ff_exp;
  1673. layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
  1674. layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, 0);
  1675. layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff_exp, n_embd, n_expert}, 0);
  1676. layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, 0);
  1677. // Shared expert
  1678. const int64_t n_ff_shexp = n_ff_exp;
  1679. layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), { n_embd, n_ff_shexp}, 0);
  1680. layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp, n_embd }, 0);
  1681. layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), { n_embd, n_ff_shexp}, 0);
  1682. } else {
  1683. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  1684. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  1685. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  1686. }
  1687. }
  1688. } break;
  1689. case LLM_ARCH_DECI:
  1690. {
  1691. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  1692. // output
  1693. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  1694. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  1695. // if output is NULL, init from the input tok embed
  1696. if (output == NULL) {
  1697. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  1698. }
  1699. for (int i = 0; i < n_layer; ++i) {
  1700. auto & layer = layers[i];
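// DeciLM models are NAS-pruned, so head counts and FFN width can differ per layer;
// a layer may have no attention (n_head_kv == 0) or no FFN (n_ff == 0) at all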
  1701. const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(i);
  1702. const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(i);
  1703. const int64_t n_embd_gqa = hparams.n_embd_v_gqa(i);
  1704. const int64_t n_ff = hparams.n_ff(i);
  1705. const int64_t n_head = hparams.n_head(i);
  1706. const int64_t n_head_kv = hparams.n_head_kv(i);
  1707. if (n_head_kv == 0 && n_head > 0) {
  1708. // linear attention for DeciLMCausalModel
  1709. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  1710. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  1711. }
  1712. else if (n_head_kv > 0) {
  1713. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  1714. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
  1715. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  1716. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
  1717. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
  1718. }
  1719. // optional bias tensors
  1720. layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  1721. layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
  1722. layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
  1723. layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  1724. if (n_ff > 0) {
  1725. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  1726. }
  1727. if (hparams.rope_scaling_type_train == LLAMA_ROPE_SCALING_TYPE_LONGROPE) {
  1728. layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
  1729. layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
  1730. }
  1731. else {
  1732. layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
  1733. }
  1734. if (n_ff > 0) {
  1735. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  1736. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  1737. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  1738. }
  1739. // optional MLP bias
  1740. layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
  1741. layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  1742. layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
  1743. }
  1744. } break;
  1745. case LLM_ARCH_MINICPM3:
  1746. {
  1747. const int64_t n_embd_head_qk_rope = hparams.n_rot;
  1748. const int64_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
  1749. const int64_t q_lora_rank = hparams.n_lora_q;
  1750. const int64_t kv_lora_rank = hparams.n_lora_kv;
  1751. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  1752. // output
  1753. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  1754. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  1755. // if output is NULL, init from the input tok embed
  1756. if (output == NULL) {
  1757. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  1758. }
  1759. for (int i = 0; i < n_layer; ++i) {
  1760. auto & layer = layers[i];
  1761. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  1762. layer.attn_q_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_A_NORM, "weight", i), {q_lora_rank}, 0);
  1763. layer.attn_kv_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_NORM, "weight", i), {kv_lora_rank}, 0);
  1764. layer.wq_a = create_tensor(tn(LLM_TENSOR_ATTN_Q_A, "weight", i), {n_embd, q_lora_rank}, 0);
  1765. layer.wq_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_B, "weight", i), {q_lora_rank, n_head * n_embd_head_k}, 0);
  1766. layer.wkv_a_mqa = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + (n_embd_head_qk_rope)}, 0);
  1767. layer.wkv_b = create_tensor(tn(LLM_TENSOR_ATTN_KV_B, "weight", i), {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)}, 0);
  1768. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_head * ( n_embd_head_v), n_embd}, 0);
  1769. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  1770. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  1771. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  1772. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  1773. layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), { n_embd_head_qk_rope/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
  1774. layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), { n_embd_head_qk_rope/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
  1775. }
  1776. } break;
  1777. case LLM_ARCH_GROK:
  1778. {
  1779. if (n_expert == 0) {
  1780. throw std::runtime_error("Grok model cannot have zero experts");
  1781. }
  1782. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  1783. // output
  1784. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  1785. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  1786. // if output is NULL, init from the input tok embed
  1787. if (output == NULL) {
  1788. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  1789. }
  1790. for (int i = 0; i < n_layer; ++i) {
  1791. auto & layer = layers[i];
  1792. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  1793. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  1794. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  1795. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  1796. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  1797. layer.attn_out_norm = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}, 0);
  1798. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  1799. layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
  1800. layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, TENSOR_NOT_REQUIRED);
  1801. layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert}, 0);
  1802. layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
  1803. layer.layer_out_norm = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd}, 0);
  1804. }
  1805. } break;
  1806. case LLM_ARCH_DBRX:
  1807. {
  1808. if (n_expert == 0) {
  1809. throw std::runtime_error("DBRX model cannot have zero experts");
  1810. }
  1811. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  1812. // output
  1813. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  1814. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  1815. for (int i = 0; i < n_layer; ++i) {
  1816. auto & layer = layers[i];
  1817. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  1818. layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
  1819. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  1820. layer.attn_out_norm = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}, 0);
  1821. layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
  1822. layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
  1823. layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff, n_embd, n_expert}, 0);
  1824. layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
  1825. }
  1826. } break;
  1827. case LLM_ARCH_BAICHUAN:
  1828. {
  1829. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  1830. {
  1831. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  1832. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  1833. }
  1834. for (int i = 0; i < n_layer; ++i) {
  1835. auto & layer = layers[i];
  1836. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  1837. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  1838. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  1839. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  1840. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  1841. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  1842. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  1843. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  1844. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  1845. }
  1846. } break;
  1847. case LLM_ARCH_FALCON:
  1848. {
  1849. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  1850. // output
  1851. {
  1852. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  1853. output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
  1854. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  1855. if (!output) {
  1856. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); // needs to be on GPU
  1857. }
  1858. }
  1859. for (int i = 0; i < n_layer; ++i) {
  1860. auto & layer = layers[i];
  1861. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  1862. layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
  1863. layer.attn_norm_2 = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
  1864. layer.attn_norm_2_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  1865. layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
  1866. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  1867. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  1868. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  1869. }
  1870. } break;
  1871. case LLM_ARCH_STARCODER:
  1872. {
  1873. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  1874. pos_embd = create_tensor(tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, n_ctx_train}, 0);
  1875. // output
  1876. {
  1877. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  1878. output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
  1879. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  1880. if (!output) {
  1881. // needs to be on GPU
  1882. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  1883. }
  1884. }
  1885. for (int i = 0; i < n_layer; ++i) {
  1886. auto & layer = layers[i];
  1887. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  1888. layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
  1889. layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
  1890. layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
  1891. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  1892. layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
  1893. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  1894. layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
  1895. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
  1896. layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
  1897. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  1898. layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
  1899. }
  1900. } break;
  1901. case LLM_ARCH_BERT:
  1902. case LLM_ARCH_NOMIC_BERT:
  1903. case LLM_ARCH_NOMIC_BERT_MOE:
  1904. {
  1905. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  1906. type_embd = create_tensor(tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_token_types}, TENSOR_NOT_REQUIRED);
  1907. if (arch == LLM_ARCH_BERT) {
  1908. pos_embd = create_tensor(tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, n_ctx_train}, 0);
  1909. cls = create_tensor(tn(LLM_TENSOR_CLS, "weight"), {n_embd, n_embd}, TENSOR_NOT_REQUIRED);
  1910. cls_b = create_tensor(tn(LLM_TENSOR_CLS, "bias"), {n_embd}, TENSOR_NOT_REQUIRED);
  1911. cls_out = create_tensor(tn(LLM_TENSOR_CLS_OUT, "weight"), {n_embd, hparams.n_cls_out}, TENSOR_NOT_REQUIRED);
  1912. cls_out_b = create_tensor(tn(LLM_TENSOR_CLS_OUT, "bias"), {hparams.n_cls_out}, TENSOR_NOT_REQUIRED);
  1913. }
  1914. tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0);
  1915. tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}, 0);
  1916. for (int i = 0; i < n_layer; ++i) {
  1917. auto & layer = layers[i];
  1918. layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
  1919. layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
  1920. if (!layer.wqkv) {
  1921. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  1922. layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, 0);
  1923. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  1924. layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, 0);
  1925. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  1926. layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, 0);
  1927. }
  1928. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  1929. layer.attn_out_norm = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}, 0);
  1930. layer.attn_out_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "bias", i), {n_embd}, 0);
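// when moe_every_n_layers is set, every n-th layer (starting from layer 1) uses an expert FFN instead of the dense one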
  1931. if (hparams.moe_every_n_layers > 0 && i % hparams.moe_every_n_layers == 1) {
  1932. layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
  1933. layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff, n_expert}, 0);
  1934. layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert}, 0);
  1935. layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
  1936. } else {
  1937. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  1938. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
  1939. if (arch == LLM_ARCH_BERT || arch == LLM_ARCH_NOMIC_BERT_MOE) {
  1940. layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
  1941. layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
  1942. layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
  1943. } else {
  1944. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  1945. }
  1946. }
  1947. layer.layer_out_norm = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd}, 0);
  1948. layer.layer_out_norm_b = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "bias", i), {n_embd}, 0);
  1949. }
  1950. } break;
  1951. case LLM_ARCH_NEO_BERT:
  1952. {
  1953. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  1954. cls = create_tensor(tn(LLM_TENSOR_CLS, "weight"), {n_embd, n_embd}, TENSOR_NOT_REQUIRED);
  1955. cls_b = create_tensor(tn(LLM_TENSOR_CLS, "bias"), {n_embd}, TENSOR_NOT_REQUIRED);
  1956. cls_out = create_tensor(tn(LLM_TENSOR_CLS_OUT, "weight"), {n_embd, hparams.n_cls_out}, TENSOR_NOT_REQUIRED);
  1957. cls_out_b = create_tensor(tn(LLM_TENSOR_CLS_OUT, "bias"), {hparams.n_cls_out}, TENSOR_NOT_REQUIRED);
  1958. output_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_OUTPUT_NORM, "weight"), {n_embd}, 0);
  1959. for (int i = 0; i < n_layer; ++i) {
  1960. auto & layer = layers[i];
  1961. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  1962. layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
  1963. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  1964. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  1965. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff*2}, 0);
  1966. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
  1967. }
  1968. } break;
  1969. case LLM_ARCH_JINA_BERT_V2:
  1970. {
  1971. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); // word_embeddings
  1972. type_embd = create_tensor(tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_token_types}, 0); // token_type_embeddings
  1973. tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0); // LayerNorm
1974. tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}, 0); // LayerNorm bias
  1975. cls = create_tensor(tn(LLM_TENSOR_CLS, "weight"), {n_embd, 1}, TENSOR_NOT_REQUIRED);
  1976. cls_b = create_tensor(tn(LLM_TENSOR_CLS, "bias"), {1}, TENSOR_NOT_REQUIRED);
  1977. for (int i = 0; i < n_layer; ++i) {
  1978. auto & layer = layers[i]; // JinaBertLayer
  1979. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  1980. layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, 0);
  1981. layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
  1982. layer.attn_q_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  1983. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  1984. layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, 0);
  1985. layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
  1986. layer.attn_k_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  1987. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  1988. layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, 0);
1989. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0); // output_dense
1990. layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0); // output_dense
1991. layer.attn_out_norm = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}, 0); // output_norm
  1992. layer.attn_out_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "bias", i), {n_embd}, 0);
  1993. layer.attn_norm_2 = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
  1994. layer.attn_norm_2_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  1995. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED);
  1996. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, layer.ffn_gate ? n_ff : n_ff * 2}, 0);
  1997. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
  1998. layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
  1999. layer.layer_out_norm = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd}, 0);
  2000. layer.layer_out_norm_b = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "bias", i), {n_embd}, 0);
  2001. }
  2002. } break;
  2003. case LLM_ARCH_BLOOM:
  2004. {
  2005. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2006. tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0);
  2007. tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}, 0);
  2008. // output
  2009. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2010. output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
  2011. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  2012. // if output is NULL, init from the input tok embed
  2013. if (output == NULL) {
  2014. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  2015. }
  2016. for (int i = 0; i < n_layer; ++i) {
  2017. auto & layer = layers[i];
  2018. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2019. layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
  2020. layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
  2021. layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
  2022. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  2023. layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
  2024. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  2025. layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
  2026. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
  2027. layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
  2028. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  2029. layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
  2030. }
  2031. } break;
  2032. case LLM_ARCH_MPT:
  2033. {
  2034. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2035. pos_embd = create_tensor(tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, n_ctx_train}, TENSOR_NOT_REQUIRED);
  2036. // output
  2037. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2038. output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, TENSOR_NOT_REQUIRED);
  2039. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  2040. if (!output) {
  2041. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); // needs to be on GPU
  2042. }
  2043. for (int i = 0; i < n_layer; ++i) {
  2044. auto & layer = layers[i];
  2045. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2046. layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2047. layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
  2048. layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
  2049. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  2050. layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2051. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  2052. layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2053. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
  2054. layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2055. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  2056. layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
  2057. layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2058. layer.attn_q_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2059. layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2060. layer.attn_k_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2061. // AWQ ScaleActivation layer
  2062. layer.ffn_act = create_tensor(tn(LLM_TENSOR_FFN_ACT, "scales", i), {n_ff}, TENSOR_NOT_REQUIRED);
  2063. }
  2064. } break;
  2065. case LLM_ARCH_STABLELM:
  2066. {
  2067. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2068. // output
  2069. output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
  2070. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2071. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  2072. for (int i = 0; i < n_layer; ++i) {
  2073. auto & layer = layers[i];
  2074. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2075. layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
  2076. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  2077. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  2078. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  2079. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  2080. // optional bias tensors, present in Stable LM 2 1.6B
  2081. layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2082. layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
  2083. layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
  2084. // optional q and k layernorms, present in StableLM 2 12B
  2085. layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k, n_head}, TENSOR_NOT_REQUIRED);
  2086. layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k, n_head_kv}, TENSOR_NOT_REQUIRED);
  2087. // optional FFN norm, not present in StableLM 2 12B which uses parallel residual
  2088. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2089. layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2090. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  2091. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  2092. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  2093. }
  2094. } break;
  2095. case LLM_ARCH_QWEN:
  2096. {
  2097. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2098. // output
  2099. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2100. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  2101. for (int i = 0; i < n_layer; ++i) {
  2102. auto & layer = layers[i];
  2103. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2104. layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd*3}, 0);
  2105. layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd*3}, 0);
  2106. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  2107. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  2108. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff/2}, 0);
  2109. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff/2, n_embd}, 0);
  2110. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff/2}, 0);
  2111. }
  2112. } break;
  2113. case LLM_ARCH_QWEN2:
  2114. case LLM_ARCH_QWEN2VL:
  2115. {
  2116. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2117. // output
  2118. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2119. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  2120. // if output is NULL, init from the input tok embed
  2121. if (output == NULL) {
  2122. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  2123. }
  2124. for (int i = 0; i < n_layer; ++i) {
  2125. auto & layer = layers[i];
  2126. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2127. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  2128. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  2129. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  2130. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  2131. // optional bias tensors
  2132. layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, 0);
  2133. layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, 0);
  2134. layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, 0);
  2135. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  2136. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  2137. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  2138. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  2139. }
  2140. } break;
  2141. case LLM_ARCH_QWEN2MOE:
  2142. {
  2143. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2144. // output
  2145. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2146. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  2147. for (int i = 0; i < n_layer; ++i) {
  2148. auto & layer = layers[i];
  2149. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2150. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  2151. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  2152. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  2153. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  2154. // optional bias tensors
  2155. layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2156. layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
  2157. layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
  2158. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  2159. layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
  2160. if (n_expert == 0) {
  2161. throw std::runtime_error("n_expert must be > 0 for QWEN2MOE");
  2162. }
  2163. if (n_expert_used == 0) {
  2164. throw std::runtime_error("n_expert_used must be > 0 for QWEN2MOE");
  2165. }
  2166. // MoE branch
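// if the expert FFN width is not stored in the model, assume the total FFN width is split evenly
// across the active experts (e.g. n_ff = 5632 with 4 used experts -> 1408 per expert)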
  2167. const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
  2168. layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
  2169. layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
  2170. layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
  2171. // Shared expert branch
  2172. const int64_t n_ff_shexp = hparams.n_ff_shexp ? hparams.n_ff_shexp : n_ff;
  2173. layer.ffn_gate_inp_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP_SHEXP, "weight", i), {n_embd}, 0);
  2174. layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), { n_embd, n_ff_shexp}, 0);
  2175. layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp, n_embd}, 0);
  2176. layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), { n_embd, n_ff_shexp}, 0);
  2177. }
  2178. } break;
  2179. case LLM_ARCH_QWEN3:
  2180. {
  2181. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2182. // output
  2183. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2184. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  2185. // if output is NULL, init from the input tok embed
  2186. if (output == NULL) {
  2187. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  2188. }
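// note on the two loader flags used above: TENSOR_NOT_REQUIRED lets
// create_tensor return NULL instead of failing when the tensor is absent
// from the GGUF, and TENSOR_DUPLICATED appears to mark the returned tensor
// as a second view of an already-created one (here tok_embd), so tied
// lm_head weights can still be placed/offloaded independently.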
  2189. for (int i = 0; i < n_layer; ++i) {
  2190. auto & layer = layers[i];
  2191. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2192. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
  2193. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  2194. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  2195. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
  2196. layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
  2197. layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
  2198. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  2199. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  2200. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  2201. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  2202. }
  2203. } break;
  2204. case LLM_ARCH_QWEN3MOE:
  2205. {
  2206. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2207. // output
  2208. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2209. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  2210. // if output is NULL, init from the input tok embed
  2211. if (output == NULL) {
  2212. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  2213. }
  2214. for (int i = 0; i < n_layer; ++i) {
  2215. auto & layer = layers[i];
  2216. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2217. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
  2218. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  2219. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  2220. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
  2221. layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
  2222. layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
  2223. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  2224. layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
  2225. if (n_expert == 0) {
  2226. throw std::runtime_error("n_expert must be > 0 for QWEN3MOE");
  2227. }
  2228. if (n_expert_used == 0) {
  2229. throw std::runtime_error("n_expert_used must be > 0 for QWEN3MOE");
  2230. }
  2231. // MoE branch
  2232. const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
  2233. layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
  2234. layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
  2235. layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
  2236. }
  2237. } break;
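// note: PHI2 (and PHIMOE below) may store attention projections either as a
// single fused QKV tensor of shape {n_embd, n_embd + 2*n_embd_gqa} or as
// separate Q/K/V tensors; wqkv is therefore loaded as optional and the
// split tensors are only required when the fused one is missing.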
  2238. case LLM_ARCH_PHI2:
  2239. {
  2240. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2241. // output
  2242. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2243. output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
  2244. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  2245. output_b = create_tensor(tn(LLM_TENSOR_OUTPUT, "bias"), {n_vocab}, 0);
  2246. for (int i = 0; i < n_layer; ++i) {
  2247. auto & layer = layers[i];
  2248. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2249. layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
  2250. layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
  2251. layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
  2252. if (layer.wqkv == nullptr) {
  2253. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  2254. layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, 0);
  2255. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  2256. layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, 0);
  2257. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  2258. layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, 0);
  2259. }
  2260. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  2261. layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
  2262. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
  2263. layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
  2264. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  2265. layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
  2266. }
  2267. } break;
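// note: for PHI3 the up projection is loaded as {n_embd, 2*n_ff}, which
// suggests the gate and up halves are stored fused in one tensor. the
// rope_long/rope_short factor tensors are optional, and the
// TENSOR_DUPLICATED flag for i != 0 suggests a single shared factors tensor
// is referenced by every layer.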
  2268. case LLM_ARCH_PHI3:
  2269. {
  2270. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
  2271. // output
  2272. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
  2273. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  2274. // if output is NULL, init from the input tok embed
  2275. if (output == NULL) {
  2276. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  2277. }
  2278. for (int i = 0; i < n_layer; ++i) {
  2279. auto & layer = layers[i];
  2280. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);
  2281. layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), { n_embd, n_embd + 2 * n_embd_gqa }, TENSOR_NOT_REQUIRED);
  2282. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd, n_embd }, 0);
  2283. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd }, 0);
  2284. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd }, 0);
  2285. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, 2 * n_ff }, 0);
  2286. layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), { n_rot/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
  2287. layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), { n_rot/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
  2288. }
  2289. } break;
  2290. case LLM_ARCH_PHIMOE:
  2291. {
  2292. const int64_t n_embd_head = n_embd / n_head;
  2293. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
  2294. // output
  2295. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
  2296. output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
  2297. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab }, 0);
  2298. output_b = create_tensor(tn(LLM_TENSOR_OUTPUT, "bias"), { n_vocab }, 0);
  2299. for (int i = 0; i < n_layer; ++i) {
  2300. auto & layer = layers[i];
  2301. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);
  2302. layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), { n_embd }, 0);
  2303. layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), { n_embd, n_embd + 2 * n_embd_gqa }, TENSOR_NOT_REQUIRED);
  2304. if (layer.wqkv == nullptr) {
  2305. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  2306. layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, 0);
  2307. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  2308. layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, 0);
  2309. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  2310. layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, 0);
  2311. }
  2312. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd, n_embd }, 0);
  2313. layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), { n_embd }, 0);
  2314. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd }, 0);
  2315. layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), { n_embd }, 0);
  2316. layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
  2317. layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
  2318. layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff, n_embd, n_expert}, 0);
  2319. layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
  2320. layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), { n_embd_head/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
  2321. layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), { n_embd_head/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
  2322. }
  2323. } break;
  2324. case LLM_ARCH_PLAMO:
  2325. {
  2326. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2327. // output
  2328. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2329. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  2330. for (int i = 0; i < n_layer; ++i) {
  2331. auto & layer = layers[i];
  2332. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2333. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  2334. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  2335. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  2336. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  2337. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  2338. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  2339. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  2340. }
  2341. } break;
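// note: unlike most cases in this file, GPT2 below loads a learned absolute
// position embedding (pos_embd) sized {n_embd, n_ctx_train}, i.e. one row
// per position up to the training context length.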
  2342. case LLM_ARCH_GPT2:
  2343. {
  2344. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2345. pos_embd = create_tensor(tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, n_ctx_train}, 0);
  2346. // output
  2347. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2348. output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
  2349. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  2350. // if output is NULL, init from the input tok embed
  2351. if (output == NULL) {
  2352. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  2353. }
  2354. for (int i = 0; i < n_layer; ++i) {
  2355. auto & layer = layers[i];
  2356. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2357. layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
  2358. layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
  2359. layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
  2360. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  2361. layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
  2362. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  2363. layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
  2364. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
  2365. layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
  2366. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  2367. layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
  2368. }
  2369. } break;
  2370. case LLM_ARCH_CODESHELL:
  2371. {
  2372. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  2373. // if tok embd is NULL, init from output
  2374. if (tok_embd == NULL) {
  2375. tok_embd = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  2376. }
  2377. // output
  2378. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2379. output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
  2380. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  2381. for (int i = 0; i < n_layer; ++i) {
  2382. auto & layer = layers[i];
  2383. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2384. layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
  2385. layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
  2386. layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
  2387. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  2388. layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
  2389. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  2390. layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
  2391. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
  2392. layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
  2393. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  2394. layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
  2395. }
  2396. } break;
  2397. case LLM_ARCH_ORION:
  2398. {
  2399. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2400. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2401. output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
  2402. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  2403. for (int i = 0; i < n_layer; ++i) {
  2404. auto & layer = layers[i];
  2405. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2406. layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
  2407. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  2408. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  2409. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  2410. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  2411. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  2412. layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
  2413. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  2414. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  2415. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  2416. }
  2417. } break;
  2418. case LLM_ARCH_INTERNLM2:
  2419. {
  2420. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2421. // output
  2422. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2423. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  2424. for (int i = 0; i < n_layer; ++i) {
  2425. auto & layer = layers[i];
  2426. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2427. // layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
  2428. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  2429. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  2430. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  2431. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  2432. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  2433. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  2434. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  2435. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  2436. }
  2437. } break;
  2438. case LLM_ARCH_GEMMA:
  2439. {
  2440. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2441. // output
  2442. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2443. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); // same as tok_embd, duplicated to allow offloading
  2444. for (int i = 0; i < n_layer; ++i) {
  2445. auto & layer = layers[i];
  2446. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2447. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
  2448. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  2449. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
  2450. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
  2451. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  2452. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  2453. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  2454. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  2455. }
  2456. } break;
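// note: GEMMA and GEMMA2 tie the output head directly to tok_embd, while
// GEMMA3 below accepts a separate output tensor and only falls back to the
// token embedding when it is absent. GEMMA2 adds attn_post_norm and
// ffn_post_norm, and GEMMA3 additionally loads per-head q/k norms of size
// {n_embd_head_k}.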
  2457. case LLM_ARCH_GEMMA2:
  2458. {
  2459. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2460. // output
  2461. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2462. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); // same as tok_embd, duplicated to allow offloading
  2463. for (int i = 0; i < n_layer; ++i) {
  2464. auto & layer = layers[i];
  2465. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2466. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
  2467. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  2468. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
  2469. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
  2470. layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
  2471. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  2472. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  2473. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  2474. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  2475. layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
  2476. }
  2477. } break;
  2478. case LLM_ARCH_GEMMA3:
  2479. {
  2480. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2481. // output
  2482. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2483. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  2484. // if output is NULL, init from the input tok embed
  2485. if (output == NULL) {
  2486. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  2487. }
  2488. for (int i = 0; i < n_layer; ++i) {
  2489. auto & layer = layers[i];
  2490. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2491. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
  2492. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  2493. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
  2494. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
  2495. layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
  2496. layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
  2497. layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
  2498. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  2499. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  2500. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  2501. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  2502. layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
  2503. }
  2504. } break;
  2505. case LLM_ARCH_STARCODER2:
  2506. {
  2507. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2508. // output
  2509. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2510. output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
  2511. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  2512. // if output is NULL, init from the input tok embed
  2513. if (output == NULL) {
  2514. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  2515. }
  2516. for (int i = 0; i < n_layer; ++i) {
  2517. auto & layer = layers[i];
  2518. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2519. layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
  2520. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  2521. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  2522. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  2523. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
2524. // bias tensors
  2525. layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, 0);
  2526. layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, 0);
  2527. layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, 0);
  2528. layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
  2529. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  2530. layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
  2531. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  2532. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
2533. // bias tensors
  2534. layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
  2535. layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP , "bias", i), { n_ff}, 0);
  2536. }
  2537. } break;
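// note: the Mamba tensors below follow the SSM naming: ssm_in projects
// n_embd -> 2*d_inner (the x and gate streams), ssm_conv1d is the depthwise
// conv kernel of width d_conv, ssm_x projects d_inner -> dt_rank + 2*d_state
// (presumably the dt, B and C inputs of the selective scan), and ssm_dt
// expands dt_rank back to d_inner. ssm_a/ssm_d are the per-channel A and D
// state-space parameters and carry no "weight" suffix in the GGUF.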
  2538. case LLM_ARCH_MAMBA:
  2539. {
  2540. const int64_t d_conv = hparams.ssm_d_conv;
  2541. const int64_t d_inner = hparams.ssm_d_inner;
  2542. const int64_t d_state = hparams.ssm_d_state;
  2543. const int64_t dt_rank = hparams.ssm_dt_rank;
  2544. // only an expansion factor of 2 is supported for now
  2545. if (2 * n_embd != d_inner) {
  2546. throw std::runtime_error("only an expansion factor of 2 is supported for now");
  2547. }
  2548. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2549. // output
  2550. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2551. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  2552. // if output is NULL, init from the input tok embed, duplicated to allow offloading
  2553. if (output == NULL) {
  2554. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  2555. }
  2556. for (int i = 0; i < n_layer; ++i) {
  2557. auto & layer = layers[i];
  2558. // norm
  2559. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2560. layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), {n_embd, 2*d_inner}, 0);
  2561. layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {d_conv, d_inner}, 0);
  2562. layer.ssm_conv1d_b = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "bias", i), {d_inner}, 0);
  2563. layer.ssm_x = create_tensor(tn(LLM_TENSOR_SSM_X, "weight", i), {d_inner, dt_rank + 2*d_state}, 0);
  2564. layer.ssm_dt = create_tensor(tn(LLM_TENSOR_SSM_DT, "weight", i), {dt_rank, d_inner}, 0);
  2565. layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {d_inner}, 0);
  2566. // no "weight" suffix for these
  2567. layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {d_state, d_inner}, 0);
  2568. layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {d_inner}, 0);
  2569. // out_proj
  2570. layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {d_inner, n_embd}, 0);
  2571. }
  2572. } break;
  2573. case LLM_ARCH_XVERSE:
  2574. {
  2575. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2576. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2577. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  2578. for (int i = 0; i < n_layer; ++i) {
  2579. auto & layer = layers[i];
  2580. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2581. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  2582. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  2583. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  2584. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  2585. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  2586. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  2587. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  2588. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  2589. }
  2590. } break;
  2591. case LLM_ARCH_COMMAND_R:
  2592. {
  2593. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2594. // output
  2595. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2596. // init output from the input tok embed
  2597. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  2598. for (int i = 0; i < n_layer; ++i) {
  2599. auto & layer = layers[i];
  2600. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
2601. if (n_layer >= 64) {
  2602. layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k, n_head}, 0);
  2603. layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k, n_head_kv}, 0);
  2604. }
  2605. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  2606. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  2607. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  2608. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  2609. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  2610. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  2611. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  2612. }
  2613. } break;
  2614. case LLM_ARCH_COHERE2:
  2615. {
  2616. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
  2617. // output
  2618. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
  2619. // init output from the input tok embed
2620. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, TENSOR_DUPLICATED);
  2622. for (int i = 0; i < n_layer; ++i) {
  2623. auto & layer = layers[i];
  2624. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);
  2625. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd }, 0);
  2626. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_gqa }, 0);
  2627. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_gqa }, 0);
  2628. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd, n_embd }, 0);
  2629. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), { n_embd, n_ff }, 0);
  2630. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd }, 0);
  2631. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, n_ff }, 0);
  2632. }
2633. } break;
  2635. case LLM_ARCH_OLMO: // adapted from LLM_ARCH_LLAMA with norm params removed
  2636. {
  2637. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2638. // output
  2639. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  2640. // if output is NULL, init from the input tok embed
  2641. if (output == NULL) {
  2642. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  2643. }
  2644. for (int i = 0; i < n_layer; ++i) {
  2645. auto & layer = layers[i];
  2646. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  2647. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  2648. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  2649. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  2650. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  2651. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  2652. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  2653. }
  2654. } break;
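// note: OLMO2 below loads no per-layer attn_norm/ffn_norm; instead it uses
// attn_post_norm/ffn_post_norm (normalization applied after the block)
// together with q/k norms, which is why those tensors take the place of the
// usual pre-norms in that case.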
  2655. case LLM_ARCH_OLMO2:
  2656. {
  2657. const int64_t n_embd_head = n_embd / n_head;
  2658. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2659. // output
  2660. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2661. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  2662. for (int i = 0; i < n_layer; ++i) {
  2663. auto & layer = layers[i];
  2664. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  2665. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  2666. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  2667. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  2668. layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, 0);
  2669. layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_head_kv * n_embd_head}, 0);
  2670. layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
  2671. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  2672. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  2673. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  2674. layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
  2675. }
  2676. } break;
  2677. case LLM_ARCH_OLMOE:
  2678. {
  2679. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2680. // output
  2681. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2682. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  2683. for (int i = 0; i < n_layer; ++i) {
  2684. auto & layer = layers[i];
  2685. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2686. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  2687. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  2688. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  2689. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  2690. layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, 0);
  2691. layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, 0);
  2692. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  2693. layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
  2694. if (n_expert == 0) {
  2695. throw std::runtime_error("n_expert must be > 0");
  2696. }
  2697. if (n_expert_used == 0) {
  2698. throw std::runtime_error("n_expert_used must be > 0");
  2699. }
  2700. // MoE branch
  2701. layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
  2702. layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff, n_embd, n_expert}, 0);
  2703. layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
  2704. }
  2705. } break;
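// note: OPENELM uses layer-dependent hyperparameters, so n_head, n_head_kv
// and n_ff are re-read from hparams inside the loop below and the fused QKV
// width (n_head_qkv*n_embd_head_k) can differ from layer to layer.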
  2706. case LLM_ARCH_OPENELM:
  2707. {
  2708. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2709. // output
  2710. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2711. // init output from the input tok embed
  2712. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  2713. for (int i = 0; i < n_layer; ++i) {
  2714. const int64_t n_head = hparams.n_head(i);
  2715. const int64_t n_head_qkv = 2*hparams.n_head_kv(i) + n_head;
  2716. const int64_t n_ff = hparams.n_ff(i);
  2717. auto & layer = layers[i];
  2718. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2719. layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_head_qkv*n_embd_head_k}, 0);
  2720. layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
  2721. layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
  2722. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_head*n_embd_head_k, n_embd}, 0);
  2723. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  2724. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  2725. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
  2726. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  2727. }
  2728. } break;
  2729. case LLM_ARCH_GPTNEOX:
  2730. {
  2731. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2732. // output
  2733. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2734. output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
  2735. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  2736. for (int i = 0; i < n_layer; ++i) {
  2737. auto & layer = layers[i];
  2738. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2739. layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
  2740. layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
  2741. layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
  2742. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  2743. layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
  2744. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  2745. layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
  2746. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
  2747. layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
  2748. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  2749. layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
  2750. }
  2751. } break;
  2752. case LLM_ARCH_ARCTIC:
  2753. {
  2754. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2755. // output
  2756. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2757. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  2758. // if output is NULL, init from the input tok embed
  2759. if (output == NULL) {
  2760. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  2761. }
  2762. for (int i = 0; i < n_layer; ++i) {
  2763. auto & layer = layers[i];
  2764. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2765. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  2766. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  2767. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  2768. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  2769. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  2770. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_embd}, 0);
  2771. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_embd, n_embd}, 0);
  2772. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_embd}, 0);
  2773. layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
  2774. layer.ffn_norm_exps = create_tensor(tn(LLM_TENSOR_FFN_NORM_EXPS, "weight", i), {n_embd}, 0);
2775. layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
  2776. layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert}, 0);
  2777. layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
  2778. }
  2779. } break;
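// note: DEEPSEEK (and DEEPSEEK2 below) keep the first n_layer_dense_lead
// layers as plain dense FFN layers; the remaining layers use routed experts
// of width n_ff_exp plus a shared-expert branch whose width is
// n_ff_exp * n_expert_shared.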
  2780. case LLM_ARCH_DEEPSEEK:
  2781. {
  2782. const int64_t n_ff_exp = hparams.n_ff_exp;
  2783. const int64_t n_expert_shared = hparams.n_expert_shared;
  2784. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2785. // output
  2786. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2787. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  2788. for (int i = 0; i < n_layer; ++i) {
  2789. auto & layer = layers[i];
  2790. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2791. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  2792. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  2793. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  2794. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  2795. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  2796. if (i < (int) hparams.n_layer_dense_lead) {
  2797. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  2798. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  2799. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  2800. } else {
  2801. layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
  2802. if (n_expert == 0) {
  2803. throw std::runtime_error("n_expert must be > 0");
  2804. }
  2805. if (n_expert_used == 0) {
  2806. throw std::runtime_error("n_expert_used must be > 0");
  2807. }
  2808. // MoE branch
  2809. layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
  2810. layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
  2811. layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
  2812. // Shared expert branch
  2813. layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
  2814. layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_exp * n_expert_shared, n_embd}, 0);
  2815. layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
  2816. }
  2817. }
  2818. } break;
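// note: DEEPSEEK2 uses multi-head latent attention (MLA): queries and
// keys/values go through low-rank projections (q_lora_rank / kv_lora_rank,
// via wq_a/wq_b and wkv_a_mqa), and each head dimension is split into a
// RoPE part (n_embd_head_qk_rope) and a non-RoPE part (n_embd_head_qk_nope).
// newer GGUFs carry the decompression weights pre-split as wk_b/wv_b, while
// legacy files keep the fused wkv_b, hence the is_mla branch below.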
  2819. case LLM_ARCH_DEEPSEEK2:
  2820. {
  2821. const bool is_lite = (hparams.n_layer == 27);
  2822. const bool is_mla = (hparams.n_embd_head_k_mla != 0 && hparams.n_embd_head_v_mla != 0);
  2823. // note: these are the actual head sizes you get when treating as MHA or after "decompression" using wv_b for MLA
  2824. const int64_t n_embd_head_k_mla = is_mla ? hparams.n_embd_head_k_mla : hparams.n_embd_head_k;
  2825. const int64_t n_embd_head_v_mla = is_mla ? hparams.n_embd_head_v_mla : hparams.n_embd_head_v;
  2826. const int64_t n_embd_head_qk_rope = hparams.n_rot;
  2827. const int64_t n_embd_head_qk_nope = n_embd_head_k_mla - n_embd_head_qk_rope;
  2828. const int64_t q_lora_rank = hparams.n_lora_q;
  2829. const int64_t kv_lora_rank = hparams.n_lora_kv;
  2830. const int64_t n_ff_exp = hparams.n_ff_exp;
  2831. const int64_t n_expert_shared = hparams.n_expert_shared;
  2832. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2833. // output
  2834. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2835. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  2836. for (int i = 0; i < n_layer; ++i) {
  2837. auto & layer = layers[i];
  2838. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2839. if (!is_lite) {
  2840. layer.attn_q_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_A_NORM, "weight", i), {q_lora_rank}, 0);
  2841. }
  2842. layer.attn_kv_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_NORM, "weight", i), {kv_lora_rank}, 0);
  2843. if (!is_lite) {
  2844. layer.wq_a = create_tensor(tn(LLM_TENSOR_ATTN_Q_A, "weight", i), {n_embd, q_lora_rank}, 0);
  2845. layer.wq_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_B, "weight", i), {q_lora_rank, n_head * n_embd_head_k_mla}, 0);
  2846. } else {
  2847. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_head * n_embd_head_k_mla}, 0);
  2848. }
  2849. layer.wkv_a_mqa = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + n_embd_head_qk_rope}, 0);
2850. // note: only old, legacy GGUF files will contain the unsplit wkv_b tensor
  2851. if (is_mla) {
  2852. layer.wk_b = create_tensor(tn(LLM_TENSOR_ATTN_K_B, "weight", i), {n_embd_head_qk_nope, kv_lora_rank, n_head}, 0);
  2853. layer.wv_b = create_tensor(tn(LLM_TENSOR_ATTN_V_B, "weight", i), {kv_lora_rank, n_embd_head_v_mla, n_head}, 0);
  2854. } else {
  2855. layer.wkv_b = create_tensor(tn(LLM_TENSOR_ATTN_KV_B, "weight", i), {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v_mla)}, 0);
  2856. }
  2857. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_head * n_embd_head_v_mla, n_embd}, 0);
  2858. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  2859. if (i < (int) hparams.n_layer_dense_lead) {
  2860. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  2861. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  2862. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  2863. } else {
  2864. layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
  2865. layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, TENSOR_NOT_REQUIRED);
  2866. if (n_expert == 0) {
  2867. throw std::runtime_error("n_expert must be > 0");
  2868. }
  2869. if (n_expert_used == 0) {
  2870. throw std::runtime_error("n_expert_used must be > 0");
  2871. }
  2872. // MoE branch
  2873. layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
  2874. layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
  2875. layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
  2876. // Shared expert branch
  2877. layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
  2878. layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_exp * n_expert_shared, n_embd}, 0);
  2879. layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
  2880. }
  2881. }
  2882. } break;
  2883. case LLM_ARCH_PLM:
  2884. {
  2885. const int64_t n_embd_head_qk_rope = hparams.n_rot;
  2886. const int64_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
  2887. const int64_t kv_lora_rank = hparams.n_lora_kv;
  2888. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2889. // output
  2890. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2891. // output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  2892. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  2893. for (int i = 0; i < n_layer; ++i) {
  2894. auto & layer = layers[i];
  2895. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2896. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
  2897. layer.wkv_a_mqa = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + (n_embd_head_qk_rope)}, 0);
  2898. layer.attn_kv_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_NORM, "weight", i), {kv_lora_rank}, 0);
  2899. layer.wkv_b = create_tensor(tn(LLM_TENSOR_ATTN_KV_B, "weight", i), {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)}, 0);
  2900. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_head * ( n_embd_head_v), n_embd}, 0);
  2901. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  2902. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  2903. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  2904. }
  2905. } break;
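// note: for BITNET every projection has an optional companion "scale" tensor
// of a single element; these presumably hold the per-tensor scale applied to
// the low-bit (ternary) weights, and are simply skipped when absent.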
  2906. case LLM_ARCH_BITNET:
  2907. {
  2908. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2909. // output
  2910. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2911. for (int i = 0; i < n_layer; ++i) {
  2912. auto & layer = layers[i];
  2913. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2914. layer.attn_sub_norm = create_tensor(tn(LLM_TENSOR_ATTN_SUB_NORM, "weight", i), {n_embd}, 0);
  2915. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  2916. layer.wq_scale = create_tensor(tn(LLM_TENSOR_ATTN_Q, "scale", i), {1}, TENSOR_NOT_REQUIRED);
  2917. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  2918. layer.wk_scale = create_tensor(tn(LLM_TENSOR_ATTN_K, "scale", i), {1}, TENSOR_NOT_REQUIRED);
  2919. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  2920. layer.wv_scale = create_tensor(tn(LLM_TENSOR_ATTN_V, "scale", i), {1}, TENSOR_NOT_REQUIRED);
  2921. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  2922. layer.wo_scale = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "scale", i), {1}, TENSOR_NOT_REQUIRED);
  2923. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  2924. layer.ffn_sub_norm = create_tensor(tn(LLM_TENSOR_FFN_SUB_NORM, "weight", i), {n_ff}, 0);
  2925. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  2926. layer.ffn_gate_scale = create_tensor(tn(LLM_TENSOR_FFN_GATE, "scale", i), {1}, TENSOR_NOT_REQUIRED);
  2927. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
  2928. layer.ffn_down_scale = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "scale", i), {1}, TENSOR_NOT_REQUIRED);
  2929. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  2930. layer.ffn_up_scale = create_tensor(tn(LLM_TENSOR_FFN_UP, "scale", i), {1}, TENSOR_NOT_REQUIRED);
  2931. }
  2932. } break;
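// note: T5 loads two parallel tensor sets per layer: the *_enc fields come
// from the ENC_* (encoder) tensor names and the unsuffixed fields from the
// DEC_* (decoder) names, including the extra cross-attention weights; the
// attn_rel_b* tensors hold the relative-position bias buckets
// ({n_head, n_rel_attn_bkts}) and are optional.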
  2933. case LLM_ARCH_T5:
  2934. {
  2935. const auto n_rel_attn_bkts = hparams.n_rel_attn_bkts;
  2936. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2937. // output
  2938. output_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2939. output_norm = create_tensor(tn(LLM_TENSOR_DEC_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2940. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  2941. // if output is NULL, init from the input tok embed
  2942. if (output == NULL) {
  2943. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  2944. }
  2945. for (int i = 0; i < n_layer; ++i) {
  2946. auto & layer = layers[i];
  2947. layer.attn_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_NORM, "weight", i), {n_embd}, 0);
  2948. layer.attn_rel_b_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_REL_B, "weight", i), {n_head, n_rel_attn_bkts}, TENSOR_NOT_REQUIRED);
  2949. layer.wq_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_Q, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  2950. layer.wk_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  2951. layer.wv_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
  2952. layer.wo_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_OUT, "weight", i), {n_embd_v_gqa, n_embd}, 0);
  2953. layer.ffn_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_NORM, "weight", i), {n_embd}, 0);
  2954. layer.ffn_gate_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_GATE, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED);
  2955. layer.ffn_down_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  2956. layer.ffn_up_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  2957. layer.attn_norm = create_tensor(tn(LLM_TENSOR_DEC_ATTN_NORM, "weight", i), {n_embd}, 0);
  2958. layer.attn_rel_b = create_tensor(tn(LLM_TENSOR_DEC_ATTN_REL_B, "weight", i), {n_head, n_rel_attn_bkts}, TENSOR_NOT_REQUIRED);
  2959. layer.wq = create_tensor(tn(LLM_TENSOR_DEC_ATTN_Q, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  2960. layer.wk = create_tensor(tn(LLM_TENSOR_DEC_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  2961. layer.wv = create_tensor(tn(LLM_TENSOR_DEC_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
  2962. layer.wo = create_tensor(tn(LLM_TENSOR_DEC_ATTN_OUT, "weight", i), {n_embd_v_gqa, n_embd}, 0);
  2963. layer.attn_norm_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_NORM, "weight", i), {n_embd}, 0);
  2964. // this tensor seems to be unused in HF transformers implementation
  2965. layer.attn_rel_b_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_REL_B, "weight", i), {n_head, n_rel_attn_bkts}, TENSOR_NOT_REQUIRED);
  2966. layer.wq_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_Q, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  2967. layer.wk_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  2968. layer.wv_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
  2969. layer.wo_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_OUT, "weight", i), {n_embd_v_gqa, n_embd}, 0);
  2970. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_DEC_FFN_NORM, "weight", i), {n_embd}, 0);
  2971. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_DEC_FFN_GATE, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED);
  2972. layer.ffn_down = create_tensor(tn(LLM_TENSOR_DEC_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  2973. layer.ffn_up = create_tensor(tn(LLM_TENSOR_DEC_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  2974. }
  2975. } break;
  2976. case LLM_ARCH_T5ENCODER:
  2977. {
  2978. const auto n_rel_attn_bkts = hparams.n_rel_attn_bkts;
  2979. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2980. // output
  2981. output_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2982. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  2983. // if output is NULL, init from the input tok embed
  2984. if (output == NULL) {
  2985. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  2986. }
  2987. for (int i = 0; i < n_layer; ++i) {
  2988. auto & layer = layers[i];
  2989. layer.attn_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_NORM, "weight", i), {n_embd}, 0);
  2990. layer.attn_rel_b_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_REL_B, "weight", i), {n_head, n_rel_attn_bkts}, TENSOR_NOT_REQUIRED);
  2991. layer.wq_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_Q, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  2992. layer.wk_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  2993. layer.wv_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
  2994. layer.wo_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_OUT, "weight", i), {n_embd_v_gqa, n_embd}, 0);
  2995. layer.ffn_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_NORM, "weight", i), {n_embd}, 0);
  2996. layer.ffn_gate_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_GATE, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED);
  2997. layer.ffn_down_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  2998. layer.ffn_up_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  2999. }
  3000. } break;
  3001. case LLM_ARCH_JAIS:
  3002. {
  3003. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  3004. // output
  3005. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  3006. output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
  3007. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  3008. for (int i = 0; i < n_layer; ++i) {
  3009. auto & layer = layers[i];
  3010. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  3011. layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
  3012. layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
  3013. layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
  3014. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  3015. layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
  3016. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  3017. layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
  3018. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
  3019. layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
  3020. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  3021. layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, 0);
  3022. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  3023. layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
  3024. }
  3025. } break;
  3026. case LLM_ARCH_CHATGLM:
  3027. {
  3028. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  3029. // output
  3030. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  3031. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  3032. // if output is NULL, init from the input tok embed
  3033. if (output == NULL) {
  3034. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  3035. }
  3036. for (int i = 0; i < n_layer; ++i) {
  3037. auto & layer = layers[i];
  3038. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  3039. layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
  3040. layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
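// if the fused QKV tensor is not present, fall back to separate Q/K/V projections (with optional biases)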
  3041. if (layer.wqkv == nullptr) {
  3042. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
  3043. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  3044. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
  3045. layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  3046. layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
  3047. layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
  3048. }
  3049. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  3050. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  3051. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff * 2}, 0);
  3052. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
  3053. }
  3054. } break;
  3055. case LLM_ARCH_GLM4:
  3056. {
  3057. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  3058. // output
  3059. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  3060. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  3061. // if output is NULL, init from the input tok embed
  3062. if (output == NULL) {
  3063. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  3064. }
  3065. for (int i = 0; i < n_layer; ++i) {
  3066. auto & layer = layers[i];
  3067. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  3068. layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
  3069. layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
  3070. if (layer.wqkv == nullptr) {
  3071. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
  3072. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  3073. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
  3074. layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  3075. layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
  3076. layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
  3077. }
  3078. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  3079. layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
  3080. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  3081. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  3082. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff * 2}, 0);
  3083. layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
  3084. }
  3085. } break;
  3086. case LLM_ARCH_NEMOTRON:
  3087. {
  3088. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  3089. // output
  3090. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  3091. output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
  3092. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  3093. for (int i = 0; i < n_layer; ++i) {
  3094. auto & layer = layers[i];
  3095. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  3096. layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
  3097. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  3098. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  3099. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  3100. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  3101. // optional bias tensors
  3102. layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  3103. layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
  3104. layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
  3105. layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  3106. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  3107. layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
  3108. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  3109. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  3110. // optional MLP bias
  3111. layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  3112. layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
  3113. }
  3114. } break;
  3115. case LLM_ARCH_EXAONE:
  3116. {
  3117. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  3118. // output
  3119. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  3120. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  3121. // if output is NULL, init from the input tok embed
  3122. if (output == NULL) {
  3123. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  3124. }
  3125. for (int i = 0; i < n_layer; ++i) {
  3126. auto & layer = layers[i];
  3127. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  3128. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
  3129. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  3130. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
  3131. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
  3132. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
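// rope frequency factors are optional; when present they are loaded once and marked as duplicated for the layers after the first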
  3133. layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
  3134. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  3135. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  3136. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  3137. }
  3138. } break;
  3139. case LLM_ARCH_RWKV6:
  3140. {
  3141. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  3142. // Block 0, LN0
  3143. tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0);
  3144. tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}, 0);
  3145. // output
  3146. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  3147. output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
  3148. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  3149. const int time_mix_extra_dim = hparams.time_mix_extra_dim;
  3150. const int time_decay_extra_dim = hparams.time_decay_extra_dim;
  3151. const int head_size = hparams.wkv_head_size;
  3152. const int attn_hidden_size = n_embd;
  3153. const int ffn_size = hparams.n_ff_arr[0];
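// each RWKV6 block consists of a time-mix part (the attention analogue) and a channel-mix part (the FFN analogue)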
  3154. for (int i = 0; i < n_layer; ++i) {
  3155. auto & layer = layers[i];
  3156. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  3157. layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
  3158. layer.attn_norm_2 = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, 0);
  3159. layer.attn_norm_2_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, 0);
  3160. layer.time_mix_w1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W1, "weight", i), {n_embd, time_mix_extra_dim * 5}, 0);
  3161. layer.time_mix_w2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W2, "weight", i), {time_mix_extra_dim, n_embd, 5}, 0);
  3162. layer.time_mix_lerp_x = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_X, "weight", i), {n_embd, 1, 1}, 0);
  3163. layer.time_mix_lerp_w = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_W, "weight", i), {n_embd, 1, 1}, TENSOR_NOT_REQUIRED);
  3164. layer.time_mix_lerp_k = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_K, "weight", i), {n_embd, 1, 1}, TENSOR_NOT_REQUIRED);
  3165. layer.time_mix_lerp_v = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_V, "weight", i), {n_embd, 1, 1}, TENSOR_NOT_REQUIRED);
  3166. layer.time_mix_lerp_r = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_R, "weight", i), {n_embd, 1, 1}, TENSOR_NOT_REQUIRED);
  3167. layer.time_mix_lerp_g = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_G, "weight", i), {n_embd, 1, 1}, TENSOR_NOT_REQUIRED);
  3168. layer.time_mix_lerp_fused = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_FUSED, "weight", i), {n_embd, 1, 1, 5}, TENSOR_NOT_REQUIRED);
  3169. GGML_ASSERT(!(layer.time_mix_lerp_fused == NULL && layer.time_mix_lerp_w == NULL));
  3170. layer.time_mix_first = create_tensor(tn(LLM_TENSOR_TIME_MIX_FIRST, "weight", i), {head_size, n_embd / head_size}, 0);
  3171. layer.time_mix_decay = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY, "weight", i), {n_embd}, 0);
  3172. layer.time_mix_decay_w1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY_W1, "weight", i), {n_embd, time_decay_extra_dim}, 0);
  3173. layer.time_mix_decay_w2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY_W2, "weight", i), {time_decay_extra_dim, attn_hidden_size}, 0);
  3174. layer.time_mix_key = create_tensor(tn(LLM_TENSOR_TIME_MIX_KEY, "weight", i), {attn_hidden_size, n_embd}, 0);
  3175. layer.time_mix_value = create_tensor(tn(LLM_TENSOR_TIME_MIX_VALUE, "weight", i), {attn_hidden_size, n_embd}, 0);
  3176. layer.time_mix_receptance = create_tensor(tn(LLM_TENSOR_TIME_MIX_RECEPTANCE, "weight", i), {attn_hidden_size, n_embd}, 0);
  3177. layer.time_mix_gate = create_tensor(tn(LLM_TENSOR_TIME_MIX_GATE, "weight", i), {attn_hidden_size, n_embd}, 0);
  3178. layer.time_mix_ln = create_tensor(tn(LLM_TENSOR_TIME_MIX_LN, "weight", i), {n_embd}, 0);
  3179. layer.time_mix_ln_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_LN, "bias", i), {n_embd}, 0);
  3180. layer.time_mix_output = create_tensor(tn(LLM_TENSOR_TIME_MIX_OUTPUT, "weight", i), {n_embd, attn_hidden_size}, 0);
  3181. layer.channel_mix_lerp_k = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_LERP_K, "weight", i), {n_embd, 1, 1}, 0);
  3182. layer.channel_mix_lerp_r = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_LERP_R, "weight", i), {n_embd, 1, 1}, 0);
  3183. layer.channel_mix_key = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_KEY, "weight", i), {n_embd, ffn_size}, 0);
  3184. layer.channel_mix_value = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_VALUE, "weight", i), {ffn_size, n_embd}, 0);
  3185. layer.channel_mix_receptance = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_RECEPTANCE, "weight", i), {n_embd, n_embd}, 0);
  3186. }
  3187. } break;
  3188. case LLM_ARCH_RWKV6QWEN2:
  3189. {
  3190. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  3191. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  3192. output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, TENSOR_NOT_REQUIRED);
  3193. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  3194. const int time_mix_extra_dim = hparams.time_mix_extra_dim;
  3195. const int time_decay_extra_dim = hparams.time_decay_extra_dim;
  3196. const int head_size = hparams.wkv_head_size;
  3197. const int attn_hidden_size = n_embd;
  3198. const int n_head_kv = hparams.n_head_kv();
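// with grouped-query attention the key/value projections use n_head_kv * head_size instead of the full hidden size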
  3199. int attn_key_value_size;
  3200. if (n_head_kv == 0 || attn_hidden_size / head_size == n_head_kv) {
  3201. attn_key_value_size = attn_hidden_size;
  3202. } else {
  3203. attn_key_value_size = n_head_kv * head_size;
  3204. }
  3205. for (int i = 0; i < n_layer; ++i) {
  3206. auto & layer = layers[i];
  3207. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  3208. layer.time_mix_w1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W1, "weight", i), {n_embd, time_mix_extra_dim * 5}, 0);
  3209. layer.time_mix_w2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W2, "weight", i), {time_mix_extra_dim, n_embd, 5}, 0);
  3210. layer.time_mix_lerp_x = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_X, "weight", i), {n_embd, 1, 1}, 0);
  3211. layer.time_mix_lerp_fused = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_FUSED, "weight", i), {n_embd, 1, 1, 5}, 0);
  3212. layer.time_mix_first = create_tensor(tn(LLM_TENSOR_TIME_MIX_FIRST, "weight", i), {head_size, n_embd / head_size}, TENSOR_NOT_REQUIRED);
  3213. layer.time_mix_decay = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY, "weight", i), {n_embd}, 0);
  3214. layer.time_mix_decay_w1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY_W1, "weight", i), {n_embd, time_decay_extra_dim}, 0);
  3215. layer.time_mix_decay_w2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY_W2, "weight", i), {time_decay_extra_dim, attn_hidden_size}, 0);
  3216. layer.time_mix_key = create_tensor(tn(LLM_TENSOR_TIME_MIX_KEY, "weight", i), {n_embd, attn_key_value_size}, 0);
  3217. layer.time_mix_value = create_tensor(tn(LLM_TENSOR_TIME_MIX_VALUE, "weight", i), {n_embd, attn_key_value_size}, 0);
  3218. layer.time_mix_receptance = create_tensor(tn(LLM_TENSOR_TIME_MIX_RECEPTANCE, "weight", i), {attn_hidden_size, n_embd}, 0);
  3219. layer.time_mix_gate = create_tensor(tn(LLM_TENSOR_TIME_MIX_GATE, "weight", i), {attn_hidden_size, n_embd}, 0);
  3220. // optional bias tensors
  3221. layer.time_mix_key_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_KEY, "bias", i), {attn_key_value_size}, TENSOR_NOT_REQUIRED);
  3222. layer.time_mix_value_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_VALUE, "bias", i), {attn_key_value_size}, TENSOR_NOT_REQUIRED);
  3223. layer.time_mix_receptance_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_RECEPTANCE, "bias", i), {attn_hidden_size}, TENSOR_NOT_REQUIRED);
  3224. layer.time_mix_output = create_tensor(tn(LLM_TENSOR_TIME_MIX_OUTPUT, "weight", i), {n_embd, attn_hidden_size}, 0);
  3225. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  3226. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  3227. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  3228. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  3229. }
  3230. } break;
  3231. case LLM_ARCH_RWKV7:
  3232. {
  3233. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  3234. // Block 0, LN0
  3235. tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0);
  3236. tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}, 0);
  3237. // output
  3238. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  3239. output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
  3240. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  3241. const int n_lora_decay = hparams.n_lora_decay;
  3242. const int n_lora_iclr = hparams.n_lora_iclr;
  3243. const int n_lora_value_res_mix = hparams.n_lora_value_res_mix;
  3244. const int n_lora_gate = hparams.n_lora_gate;
  3245. const int attn_hidden_size = n_embd;
  3246. const int ffn_size = hparams.n_ff_arr[0];
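// RWKV7 parameterizes decay, in-context learning rate, value residual mix and gating with low-rank (w1/w2-style) projections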
  3247. for (int i = 0; i < n_layer; ++i) {
  3248. auto & layer = layers[i];
  3249. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  3250. layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
  3251. layer.attn_norm_2 = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, 0);
  3252. layer.attn_norm_2_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, 0);
  3253. layer.time_mix_w0 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W0, "weight", i), {n_embd}, 0);
  3254. layer.time_mix_w1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W1, "weight", i), {n_embd, n_lora_decay}, 0);
  3255. layer.time_mix_w2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W2, "weight", i), {n_lora_decay, n_embd}, 0);
  3256. layer.time_mix_a0 = create_tensor(tn(LLM_TENSOR_TIME_MIX_A0, "weight", i), {n_embd}, 0);
  3257. layer.time_mix_a1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_A1, "weight", i), {n_embd, n_lora_iclr}, 0);
  3258. layer.time_mix_a2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_A2, "weight", i), {n_lora_iclr, n_embd}, 0);
  3259. if (i == 0) {
3260. // the first layer's value-residual tensors are loaded but not used (there is no previous layer value to mix with)
  3261. layer.time_mix_v0 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V0, "weight", i), {n_embd}, 0);
  3262. layer.time_mix_v1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V1, "weight", i), {n_embd, n_lora_iclr}, 0);
  3263. layer.time_mix_v2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V2, "weight", i), {n_lora_iclr, n_embd}, 0);
  3264. } else {
  3265. layer.time_mix_v0 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V0, "weight", i), {n_embd}, 0);
  3266. layer.time_mix_v1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V1, "weight", i), {n_embd, n_lora_value_res_mix}, 0);
  3267. layer.time_mix_v2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V2, "weight", i), {n_lora_value_res_mix, n_embd}, 0);
  3268. }
  3269. layer.time_mix_g1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_G1, "weight", i), {n_embd, n_lora_gate}, 0);
  3270. layer.time_mix_g2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_G2, "weight", i), {n_lora_gate, n_embd}, 0);
  3271. layer.time_mix_lerp_fused = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_FUSED, "weight", i), {n_embd, 1, 1, 6}, 0);
  3272. layer.time_mix_k_k = create_tensor(tn(LLM_TENSOR_TIME_MIX_K_K, "weight", i), {attn_hidden_size}, 0);
  3273. layer.time_mix_k_a = create_tensor(tn(LLM_TENSOR_TIME_MIX_K_A, "weight", i), {attn_hidden_size}, 0);
  3274. layer.time_mix_r_k = create_tensor(tn(LLM_TENSOR_TIME_MIX_R_K, "weight", i), {attn_hidden_size}, 0);
  3275. layer.time_mix_key = create_tensor(tn(LLM_TENSOR_TIME_MIX_KEY, "weight", i), {attn_hidden_size, n_embd}, 0);
  3276. layer.time_mix_value = create_tensor(tn(LLM_TENSOR_TIME_MIX_VALUE, "weight", i), {attn_hidden_size, n_embd}, 0);
  3277. layer.time_mix_receptance = create_tensor(tn(LLM_TENSOR_TIME_MIX_RECEPTANCE, "weight", i), {attn_hidden_size, n_embd}, 0);
  3278. layer.time_mix_ln = create_tensor(tn(LLM_TENSOR_TIME_MIX_LN, "weight", i), {n_embd}, 0);
  3279. layer.time_mix_ln_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_LN, "bias", i), {n_embd}, 0);
  3280. layer.time_mix_output = create_tensor(tn(LLM_TENSOR_TIME_MIX_OUTPUT, "weight", i), {n_embd, attn_hidden_size}, 0);
  3281. layer.channel_mix_lerp_k = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_LERP_K, "weight", i), {n_embd, 1, 1}, 0);
  3282. layer.channel_mix_key = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_KEY, "weight", i), {n_embd, ffn_size}, 0);
  3283. layer.channel_mix_value = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_VALUE, "weight", i), {ffn_size, n_embd}, 0);
  3284. }
  3285. } break;
  3286. case LLM_ARCH_ARWKV7:
  3287. {
  3288. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  3289. // output
  3290. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  3291. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  3292. const int n_lora_decay = hparams.n_lora_decay;
  3293. const int n_lora_iclr = hparams.n_lora_iclr;
  3294. const int n_lora_value_res_mix = hparams.n_lora_value_res_mix;
  3295. const int n_lora_gate = hparams.n_lora_gate;
  3296. const int attn_hidden_size = n_embd;
  3297. for (int i = 0; i < n_layer; ++i) {
  3298. auto & layer = layers[i];
  3299. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  3300. layer.time_mix_w0 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W0, "weight", i), {n_embd}, 0);
  3301. layer.time_mix_w1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W1, "weight", i), {n_embd, n_lora_decay}, 0);
  3302. layer.time_mix_w2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W2, "weight", i), {n_lora_decay, n_embd}, 0);
  3303. layer.time_mix_a0 = create_tensor(tn(LLM_TENSOR_TIME_MIX_A0, "weight", i), {n_embd}, 0);
  3304. layer.time_mix_a1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_A1, "weight", i), {n_embd, n_lora_iclr}, 0);
  3305. layer.time_mix_a2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_A2, "weight", i), {n_lora_iclr, n_embd}, 0);
  3306. if (i == 0) {
  3307. // actually not used
  3308. layer.time_mix_v0 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V0, "weight", i), {n_embd}, 0);
  3309. layer.time_mix_v1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V1, "weight", i), {n_embd, n_lora_iclr}, 0);
  3310. layer.time_mix_v2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V2, "weight", i), {n_lora_iclr, n_embd}, 0);
  3311. } else {
  3312. layer.time_mix_v0 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V0, "weight", i), {n_embd}, 0);
  3313. layer.time_mix_v1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V1, "weight", i), {n_embd, n_lora_value_res_mix}, 0);
  3314. layer.time_mix_v2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V2, "weight", i), {n_lora_value_res_mix, n_embd}, 0);
  3315. }
  3316. layer.time_mix_g1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_G1, "weight", i), {n_embd, n_lora_gate}, TENSOR_NOT_REQUIRED);
  3317. layer.time_mix_g2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_G2, "weight", i), {n_lora_gate, n_embd}, TENSOR_NOT_REQUIRED);
  3318. try {
  3319. layer.time_mix_lerp_fused = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_FUSED, "weight", i), {n_embd, 1, 1, 6}, 0);
  3320. } catch(std::runtime_error & e) {
3321. // ARWKV models may not have gate tensors, in which case the fused lerp tensor has 5 components instead of 6
  3322. layer.time_mix_lerp_fused = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_FUSED, "weight", i), {n_embd, 1, 1, 5}, 0);
  3323. }
  3324. layer.time_mix_k_k = create_tensor(tn(LLM_TENSOR_TIME_MIX_K_K, "weight", i), {attn_hidden_size}, 0);
  3325. layer.time_mix_k_a = create_tensor(tn(LLM_TENSOR_TIME_MIX_K_A, "weight", i), {attn_hidden_size}, 0);
  3326. layer.time_mix_r_k = create_tensor(tn(LLM_TENSOR_TIME_MIX_R_K, "weight", i), {attn_hidden_size}, 0);
  3327. layer.time_mix_key = create_tensor(tn(LLM_TENSOR_TIME_MIX_KEY, "weight", i), {attn_hidden_size, n_embd}, 0);
  3328. layer.time_mix_value = create_tensor(tn(LLM_TENSOR_TIME_MIX_VALUE, "weight", i), {attn_hidden_size, n_embd}, 0);
  3329. layer.time_mix_receptance = create_tensor(tn(LLM_TENSOR_TIME_MIX_RECEPTANCE, "weight", i), {attn_hidden_size, n_embd}, 0);
  3330. layer.time_mix_ln = create_tensor(tn(LLM_TENSOR_TIME_MIX_LN, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
  3331. layer.time_mix_ln_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_LN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  3332. layer.time_mix_output = create_tensor(tn(LLM_TENSOR_TIME_MIX_OUTPUT, "weight", i), {n_embd, attn_hidden_size}, 0);
  3333. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  3334. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  3335. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  3336. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  3337. }
  3338. } break;
  3339. case LLM_ARCH_CHAMELEON:
  3340. {
  3341. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  3342. // output
  3343. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  3344. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  3345. // if output is NULL, init from the input tok embed
  3346. if (output == NULL) {
  3347. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  3348. }
  3349. for (int i = 0; i < n_layer; ++i) {
  3350. auto & layer = layers[i];
  3351. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
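// q/k normalization is applied per attention head, hence the {n_embd_head_k, n_head} shapes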
  3352. layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k, n_head}, 0);
  3353. layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k, n_head_kv}, 0);
  3354. layer.attn_q_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {n_embd_head_k, n_head}, TENSOR_NOT_REQUIRED);
  3355. layer.attn_k_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {n_embd_head_k, n_head_kv}, TENSOR_NOT_REQUIRED);
  3356. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  3357. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  3358. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  3359. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  3360. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  3361. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  3362. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  3363. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  3364. }
  3365. } break;
  3366. case LLM_ARCH_WAVTOKENIZER_DEC:
  3367. {
  3368. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {hparams.n_embd_features, n_vocab}, 0);
  3369. conv1d = create_tensor(tn(LLM_TENSOR_CONV1D, "weight"), {7, hparams.n_embd_features, hparams.posnet.n_embd}, 0);
  3370. conv1d_b = create_tensor(tn(LLM_TENSOR_CONV1D, "bias"), {1, hparams.posnet.n_embd}, 0);
  3371. // posnet
  3372. {
  3373. const int64_t n_embd = hparams.posnet.n_embd;
  3374. for (uint32_t i = 0; i < hparams.posnet.n_layer; ++i) {
  3375. auto & layer = layers[i].posnet;
  3376. // posnet:
  3377. //
  3378. // - resnet
  3379. // - resnet
  3380. // - attn
  3381. // - resnet
  3382. // - resnet
  3383. // - norm
  3384. //
  3385. switch (i) {
  3386. case 0:
  3387. case 1:
  3388. case 3:
  3389. case 4:
  3390. {
  3391. layer.norm1 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "weight", i), {1, n_embd}, 0);
  3392. layer.norm1_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "bias", i), {1, n_embd}, 0);
  3393. layer.conv1 = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "weight", i), {3, n_embd, n_embd}, 0);
  3394. layer.conv1_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "bias", i), {1, n_embd}, 0);
  3395. layer.norm2 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "weight", i), {1, n_embd}, 0);
  3396. layer.norm2_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "bias", i), {1, n_embd}, 0);
  3397. layer.conv2 = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "weight", i), {3, n_embd, n_embd}, 0);
  3398. layer.conv2_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "bias", i), {1, n_embd}, 0);
  3399. } break;
  3400. case 2:
  3401. {
  3402. layer.attn_norm = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "weight", i), {1, n_embd}, 0);
  3403. layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "bias", i), {1, n_embd}, 0);
  3404. layer.attn_q = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_Q, "weight", i), {1, n_embd, n_embd}, 0);
  3405. layer.attn_q_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_Q, "bias", i), {1, n_embd}, 0);
  3406. layer.attn_k = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_K, "weight", i), {1, n_embd, n_embd}, 0);
  3407. layer.attn_k_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_K, "bias", i), {1, n_embd}, 0);
  3408. layer.attn_v = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_V, "weight", i), {1, n_embd, n_embd}, 0);
  3409. layer.attn_v_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_V, "bias", i), {1, n_embd}, 0);
  3410. layer.attn_o = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_OUT, "weight", i), {1, n_embd, n_embd}, 0);
  3411. layer.attn_o_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_OUT, "bias", i), {1, n_embd}, 0);
  3412. } break;
  3413. case 5:
  3414. {
  3415. layer.norm = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "weight", i), {1, n_embd}, 0);
  3416. layer.norm_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "bias", i), {1, n_embd}, 0);
  3417. } break;
  3418. default: GGML_ABORT("unknown posnet layer");
3419. }
  3420. }
  3421. }
  3422. GGML_ASSERT(hparams.posnet.n_embd == hparams.convnext.n_embd);
  3423. tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {hparams.posnet.n_embd}, 0);
  3424. tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {hparams.posnet.n_embd}, 0);
  3425. // convnext
  3426. {
  3427. const int64_t n_embd = hparams.convnext.n_embd;
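// each ConvNeXt block: depthwise conv (kernel size 7) -> norm -> pointwise MLP (pw1/pw2) -> learnable gamma scale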
  3428. for (uint32_t i = 0; i < hparams.convnext.n_layer; ++i) {
  3429. auto & layer = layers[i].convnext;
  3430. layer.dw = create_tensor(tn(LLM_TENSOR_CONVNEXT_DW, "weight", i), {7, 1, n_embd}, 0);
  3431. layer.dw_b = create_tensor(tn(LLM_TENSOR_CONVNEXT_DW, "bias", i), {1, n_embd}, 0);
  3432. layer.norm = create_tensor(tn(LLM_TENSOR_CONVNEXT_NORM, "weight", i), {n_embd}, 0);
  3433. layer.norm_b = create_tensor(tn(LLM_TENSOR_CONVNEXT_NORM, "bias", i), {n_embd}, 0);
  3434. layer.pw1 = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW1, "weight", i), {n_embd, n_ff}, 0);
  3435. layer.pw1_b = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW1, "bias", i), {n_ff}, 0);
  3436. layer.pw2 = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW2, "weight", i), {n_ff, n_embd}, 0);
  3437. layer.pw2_b = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW2, "bias", i), {n_embd}, 0);
  3438. layer.gamma = create_tensor(tn(LLM_TENSOR_CONVNEXT_GAMMA, "weight", i), {n_embd}, 0);
  3439. }
  3440. // output
  3441. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  3442. output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
  3443. }
  3444. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {hparams.convnext.n_embd, n_embd}, 0);
  3445. output_b = create_tensor(tn(LLM_TENSOR_OUTPUT, "bias"), {n_embd}, 0);
  3446. } break;
  3447. case LLM_ARCH_BAILINGMOE:
  3448. {
  3449. const int64_t n_ff_exp = hparams.n_ff_exp;
  3450. const int64_t n_expert_shared = hparams.n_expert_shared;
  3451. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  3452. // output
  3453. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  3454. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  3455. for (int i = 0; i < n_layer; ++i) {
  3456. auto & layer = layers[i];
  3457. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  3458. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_head * n_rot}, 0);
  3459. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_head_kv * n_rot}, 0);
  3460. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_head_kv * n_rot}, 0);
  3461. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_head * n_rot, n_embd}, 0);
  3462. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  3463. layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
  3464. if (n_expert == 0) {
  3465. throw std::runtime_error("n_expert must be > 0");
  3466. }
  3467. if (n_expert_used == 0) {
  3468. throw std::runtime_error("n_expert_used must be > 0");
  3469. }
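// routed experts, followed by the shared experts (fused along the feed-forward dimension)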
  3470. layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
  3471. layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
  3472. layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
  3473. layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
  3474. layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_exp * n_expert_shared, n_embd}, 0);
  3475. layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
  3476. }
  3477. } break;
  3478. case LLM_ARCH_DOTS1:
  3479. {
  3480. const int64_t n_ff_exp = hparams.n_ff_exp;
  3481. const int64_t n_expert_shared = hparams.n_expert_shared;
  3482. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  3483. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  3484. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  3485. for (int i = 0; i < n_layer; ++i) {
  3486. auto & layer = layers[i];
  3487. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  3488. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
  3489. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
  3490. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
  3491. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
  3492. layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
  3493. layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
  3494. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
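// the first n_layer_dense_lead layers use a dense FFN, the remaining layers use MoE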
  3495. if (i < (int) hparams.n_layer_dense_lead) {
  3496. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  3497. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  3498. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  3499. } else {
  3500. layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
  3501. layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, TENSOR_NOT_REQUIRED);
  3502. if (n_expert == 0) {
  3503. throw std::runtime_error("n_expert must be > 0");
  3504. }
  3505. if (n_expert_used == 0) {
  3506. throw std::runtime_error("n_expert_used must be > 0");
  3507. }
  3508. // MoE branch
  3509. layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
  3510. layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
  3511. layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
  3512. // Shared expert branch
  3513. layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
  3514. layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_exp * n_expert_shared, n_embd}, 0);
  3515. layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
  3516. }
  3517. }
  3518. } break;
  3519. case LLM_ARCH_ARCEE:
  3520. {
  3521. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  3522. // output
  3523. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  3524. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  3525. // if output is NULL, init from the input tok embed
  3526. if (output == NULL) {
  3527. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  3528. }
  3529. for (int i = 0; i < n_layer; ++i) {
  3530. auto & layer = layers[i];
  3531. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  3532. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
  3533. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  3534. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
  3535. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
  3536. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  3537. layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
  3538. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  3539. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  3540. }
  3541. } break;
  3542. default:
  3543. throw std::runtime_error("unknown architecture");
  3544. }
  3545. if (n_moved_tensors > 0) {
  3546. LLAMA_LOG_DEBUG("%s: tensor '%s' (%s) (and %d others) cannot be used with preferred buffer type %s, using %s instead\n",
  3547. __func__, first_moved_tensor->name, ggml_type_name(first_moved_tensor->type), n_moved_tensors - 1,
  3548. ggml_backend_buft_name(first_moved_from_buft), ggml_backend_buft_name(first_moved_to_buft));
  3549. }
  3550. }
  3551. ml.done_getting_tensors();
  3552. ml.init_mappings(true, use_mlock ? &pimpl->mlock_mmaps : nullptr);
  3553. pimpl->mappings.reserve(ml.mappings.size());
  3554. // create the backend buffers
  3555. std::vector<std::pair<ggml_context *, llama_buf_map>> ctx_bufs;
  3556. ctx_bufs.reserve(ctx_map.size());
3557. // Ensure we have enough capacity for the maximum number of backend buffers we could potentially create
  3558. const size_t n_max_backend_buffer = ctx_map.size() * ml.files.size();
  3559. pimpl->bufs.reserve(n_max_backend_buffer);
  3560. for (auto & it : ctx_map) {
  3561. ggml_backend_buffer_type_t buft = it.first;
  3562. ggml_context * ctx = it.second;
  3563. // skip contexts without tensors
  3564. if (ggml_get_first_tensor(ctx) == nullptr) {
  3565. continue;
  3566. }
  3567. llama_buf_map buf_map;
  3568. buf_map.reserve(n_max_backend_buffer);
  3569. // check if it is possible to use buffer_from_host_ptr with this buffer type
  3570. ggml_backend_dev_t dev = ggml_backend_buft_get_device(buft);
  3571. if (!dev) {
  3572. // FIXME: workaround for CPU backend buft having a NULL device
  3573. dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
  3574. if (!dev) {
  3575. throw std::runtime_error(format("%s: no CPU backend found", __func__));
  3576. }
  3577. }
  3578. ggml_backend_dev_props props;
  3579. ggml_backend_dev_get_props(dev, &props);
  3580. bool buffer_from_host_ptr_supported = props.caps.buffer_from_host_ptr;
  3581. bool is_default_buft = buft == ggml_backend_dev_buffer_type(dev);
  3582. if (ml.use_mmap && use_mmap_buffer && buffer_from_host_ptr_supported && is_default_buft) {
  3583. for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
  3584. // only the mmap region containing the tensors in the model is mapped to the backend buffer
  3585. // this is important for metal with apple silicon: if the entire model could be mapped to a metal buffer, then we could just use metal for all layers
  3586. // this allows using partial offloading when the model size exceeds the metal buffer size, but not the RAM size
  3587. void * addr = nullptr;
  3588. size_t first, last; // NOLINT
  3589. ml.get_mapping_range(&first, &last, &addr, idx, ctx);
  3590. if (first >= last) {
  3591. continue;
  3592. }
  3593. const size_t max_size = ggml_get_max_tensor_size(ctx);
  3594. ggml_backend_buffer_t buf = ggml_backend_dev_buffer_from_host_ptr(dev, (char *) addr + first, last - first, max_size);
  3595. if (buf == nullptr) {
  3596. throw std::runtime_error(format("unable to allocate %s buffer", ggml_backend_buft_name(buft)));
  3597. }
  3598. pimpl->bufs.emplace_back(buf);
  3599. buf_map.emplace(idx, buf);
  3600. }
  3601. }
  3602. else {
  3603. ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
  3604. if (buf == nullptr) {
  3605. throw std::runtime_error(format("unable to allocate %s buffer", ggml_backend_buft_name(buft)));
  3606. }
  3607. pimpl->bufs.emplace_back(buf);
  3608. if (use_mlock && ggml_backend_buffer_is_host(buf)) {
  3609. pimpl->mlock_bufs.emplace_back(new llama_mlock);
  3610. auto & mlock_buf = pimpl->mlock_bufs.back();
  3611. mlock_buf->init (ggml_backend_buffer_get_base(buf));
  3612. mlock_buf->grow_to(ggml_backend_buffer_get_size(buf));
  3613. }
  3614. for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
  3615. buf_map.emplace(idx, buf);
  3616. }
  3617. }
  3618. if (pimpl->bufs.empty()) {
  3619. throw std::runtime_error("failed to allocate buffer");
  3620. }
  3621. for (auto & buf : buf_map) {
  3622. // indicate that this buffer contains weights
  3623. // this is used by ggml_backend_sched to improve op scheduling: ops that use a weight are preferably scheduled to the backend that contains the weight
  3624. ggml_backend_buffer_set_usage(buf.second, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
  3625. }
  3626. ctx_bufs.emplace_back(ctx, buf_map);
  3627. }
  3628. if (llama_supports_gpu_offload()) {
  3629. const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
  3630. LLAMA_LOG_INFO("%s: offloading %d repeating layers to GPU\n", __func__, n_gpu);
  3631. if (n_gpu_layers > (int) hparams.n_layer) {
  3632. LLAMA_LOG_INFO("%s: offloading output layer to GPU\n", __func__);
  3633. }
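// the +1 accounts for the non-repeating output layer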
  3634. const int max_backend_supported_layers = hparams.n_layer + 1;
  3635. const int max_offloadable_layers = hparams.n_layer + 1;
  3636. LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n", __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers);
  3637. }
  3638. // print memory requirements per buffer type
  3639. for (auto & buf : pimpl->bufs) {
  3640. LLAMA_LOG_INFO("%s: %12s model buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf.get()), ggml_backend_buffer_get_size(buf.get()) / 1024.0 / 1024.0);
  3641. }
  3642. // populate tensors_by_name
  3643. for (auto & ctx : pimpl->ctxs) {
  3644. for (auto * cur = ggml_get_first_tensor(ctx.get()); cur != NULL; cur = ggml_get_next_tensor(ctx.get(), cur)) {
  3645. tensors_by_name.emplace_back(ggml_get_name(cur), cur);
  3646. }
  3647. }
  3648. // load tensor data
  3649. for (auto & it : ctx_bufs) {
  3650. ggml_context * ctx = it.first;
  3651. auto & bufs = it.second;
  3652. if (!ml.load_all_data(ctx, bufs, use_mlock ? &pimpl->mlock_mmaps : NULL, params.progress_callback, params.progress_callback_user_data)) {
  3653. return false;
  3654. }
  3655. }
  3656. if (use_mmap_buffer) {
  3657. for (auto & mapping : ml.mappings) {
  3658. pimpl->mappings.emplace_back(std::move(mapping));
  3659. }
  3660. }
  3661. return true;
  3662. }
  3663. std::string llama_model::arch_name() const {
  3664. return llm_arch_name(arch);
  3665. }
  3666. std::string llama_model::type_name() const {
  3667. return llm_type_name(type);
  3668. }
  3669. std::string llama_model::desc() const {
  3670. return pimpl->desc_str;
  3671. }
  3672. size_t llama_model::size() const {
  3673. return pimpl->n_bytes;
  3674. }
  3675. size_t llama_model::n_tensors() const {
  3676. return tensors_by_name.size();
  3677. }
  3678. size_t llama_model::n_devices() const {
  3679. return devices.size();
  3680. }
  3681. uint64_t llama_model::n_elements() const {
  3682. return pimpl->n_elements;
  3683. }
  3684. void llama_model::print_info() const {
  3685. const std::string rope_scaling_type = llama_rope_scaling_type_name(hparams.rope_scaling_type_train);
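// helper: prints a single value if f(il) is constant across layers, otherwise a per-layer list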
  3686. auto print_f = [](const std::function<uint32_t(uint32_t)> & f, uint32_t n) {
  3687. bool is_var = false;
  3688. std::vector<uint32_t> v;
  3689. for (uint32_t i = 0; i < n; ++i) {
  3690. v.push_back(f(i));
  3691. if (v[i] != v[0]) {
  3692. is_var = true;
  3693. }
  3694. }
  3695. std::stringstream ss;
  3696. if (is_var) {
  3697. ss << "[";
  3698. for (uint32_t i = 0; i < n; ++i) {
  3699. ss << v[i];
  3700. if (i < n - 1) {
  3701. ss << ", ";
  3702. }
  3703. }
  3704. ss << "]";
  3705. } else {
  3706. ss << v[0];
  3707. }
  3708. return ss.str();
  3709. };
  3710. // hparams
  3711. LLAMA_LOG_INFO("%s: arch = %s\n", __func__, arch_name().c_str());
  3712. LLAMA_LOG_INFO("%s: vocab_only = %d\n", __func__, hparams.vocab_only);
  3713. if (!hparams.vocab_only) {
  3714. LLAMA_LOG_INFO("%s: n_ctx_train = %u\n", __func__, hparams.n_ctx_train);
  3715. LLAMA_LOG_INFO("%s: n_embd = %u\n", __func__, hparams.n_embd);
  3716. LLAMA_LOG_INFO("%s: n_layer = %u\n", __func__, hparams.n_layer);
  3717. LLAMA_LOG_INFO("%s: n_head = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_head(il); }, hparams.n_layer).c_str());
  3718. LLAMA_LOG_INFO("%s: n_head_kv = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_head_kv(il); }, hparams.n_layer).c_str());
  3719. LLAMA_LOG_INFO("%s: n_rot = %u\n", __func__, hparams.n_rot);
  3720. LLAMA_LOG_INFO("%s: n_swa = %u\n", __func__, hparams.n_swa);
  3721. LLAMA_LOG_INFO("%s: is_swa_any = %u\n", __func__, hparams.is_swa_any());
  3722. LLAMA_LOG_INFO("%s: n_embd_head_k = %u\n", __func__, hparams.n_embd_head_k);
  3723. LLAMA_LOG_INFO("%s: n_embd_head_v = %u\n", __func__, hparams.n_embd_head_v);
  3724. LLAMA_LOG_INFO("%s: n_gqa = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_gqa(il); }, hparams.n_layer).c_str());
  3725. LLAMA_LOG_INFO("%s: n_embd_k_gqa = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_embd_k_gqa(il); }, hparams.n_layer).c_str());
  3726. LLAMA_LOG_INFO("%s: n_embd_v_gqa = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_embd_v_gqa(il); }, hparams.n_layer).c_str());
  3727. LLAMA_LOG_INFO("%s: f_norm_eps = %.1e\n", __func__, hparams.f_norm_eps);
  3728. LLAMA_LOG_INFO("%s: f_norm_rms_eps = %.1e\n", __func__, hparams.f_norm_rms_eps);
  3729. LLAMA_LOG_INFO("%s: f_clamp_kqv = %.1e\n", __func__, hparams.f_clamp_kqv);
  3730. LLAMA_LOG_INFO("%s: f_max_alibi_bias = %.1e\n", __func__, hparams.f_max_alibi_bias);
  3731. LLAMA_LOG_INFO("%s: f_logit_scale = %.1e\n", __func__, hparams.f_logit_scale);
  3732. LLAMA_LOG_INFO("%s: f_attn_scale = %.1e\n", __func__, hparams.f_attention_scale);
  3733. LLAMA_LOG_INFO("%s: n_ff = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_ff(il); }, hparams.n_layer).c_str());
  3734. LLAMA_LOG_INFO("%s: n_expert = %u\n", __func__, hparams.n_expert);
  3735. LLAMA_LOG_INFO("%s: n_expert_used = %u\n", __func__, hparams.n_expert_used);
  3736. LLAMA_LOG_INFO("%s: causal attn = %d\n", __func__, hparams.causal_attn);
  3737. LLAMA_LOG_INFO("%s: pooling type = %d\n", __func__, hparams.pooling_type);
  3738. LLAMA_LOG_INFO("%s: rope type = %d\n", __func__, hparams.rope_type);
  3739. LLAMA_LOG_INFO("%s: rope scaling = %s\n", __func__, rope_scaling_type.c_str());
  3740. LLAMA_LOG_INFO("%s: freq_base_train = %.1f\n", __func__, hparams.rope_freq_base_train);
  3741. LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train);
  3742. LLAMA_LOG_INFO("%s: n_ctx_orig_yarn = %u\n", __func__, hparams.n_ctx_orig_yarn);
  3743. LLAMA_LOG_INFO("%s: rope_finetuned = %s\n", __func__, hparams.rope_finetuned ? "yes" : "unknown");
  3744. LLAMA_LOG_INFO("%s: ssm_d_conv = %u\n", __func__, hparams.ssm_d_conv);
  3745. LLAMA_LOG_INFO("%s: ssm_d_inner = %u\n", __func__, hparams.ssm_d_inner);
  3746. LLAMA_LOG_INFO("%s: ssm_d_state = %u\n", __func__, hparams.ssm_d_state);
  3747. LLAMA_LOG_INFO("%s: ssm_dt_rank = %u\n", __func__, hparams.ssm_dt_rank);
  3748. LLAMA_LOG_INFO("%s: ssm_dt_b_c_rms = %d\n", __func__, hparams.ssm_dt_b_c_rms);
  3749. if (!classifier_labels.empty()) {
  3750. LLAMA_LOG_INFO("%s: n_cls_out = %u\n", __func__, hparams.n_cls_out);
  3751. size_t i = 0;
  3752. for (auto label : classifier_labels) {
  3753. LLAMA_LOG_INFO("%s: cls_label[%2zu] = %s\n", __func__, i++, label.c_str());
  3754. }
  3755. }
  3756. }
  3757. LLAMA_LOG_INFO("%s: model type = %s\n", __func__, type_name().c_str());
  3758. if (pimpl->n_elements >= 1e12) {
  3759. LLAMA_LOG_INFO("%s: model params = %.2f T\n", __func__, pimpl->n_elements*1e-12);
  3760. } else if (pimpl->n_elements >= 1e9) {
  3761. LLAMA_LOG_INFO("%s: model params = %.2f B\n", __func__, pimpl->n_elements*1e-9);
  3762. } else if (pimpl->n_elements >= 1e6) {
  3763. LLAMA_LOG_INFO("%s: model params = %.2f M\n", __func__, pimpl->n_elements*1e-6);
  3764. } else {
  3765. LLAMA_LOG_INFO("%s: model params = %.2f K\n", __func__, pimpl->n_elements*1e-3);
  3766. }
  3767. // general kv
  3768. LLAMA_LOG_INFO("%s: general.name = %s\n", __func__, name.c_str());
  3769. if (arch == LLM_ARCH_DEEPSEEK) {
  3770. LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
  3771. LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
  3772. LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
  3773. LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
  3774. }
  3775. if (arch == LLM_ARCH_DEEPSEEK2) {
  3776. LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
  3777. LLAMA_LOG_INFO("%s: n_lora_q = %d\n", __func__, hparams.n_lora_q);
  3778. LLAMA_LOG_INFO("%s: n_lora_kv = %d\n", __func__, hparams.n_lora_kv);
  3779. LLAMA_LOG_INFO("%s: n_embd_head_k_mla = %d\n", __func__, hparams.n_embd_head_k_mla);
  3780. LLAMA_LOG_INFO("%s: n_embd_head_v_mla = %d\n", __func__, hparams.n_embd_head_v_mla);
  3781. LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
  3782. LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
  3783. LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
  3784. LLAMA_LOG_INFO("%s: expert_weights_norm = %d\n", __func__, hparams.expert_weights_norm);
  3785. LLAMA_LOG_INFO("%s: expert_gating_func = %s\n", __func__, llama_expert_gating_func_name((llama_expert_gating_func_type) hparams.expert_gating_func));
  3786. LLAMA_LOG_INFO("%s: rope_yarn_log_mul = %.4f\n", __func__, hparams.rope_yarn_log_mul);
  3787. }
  3788. if (arch == LLM_ARCH_QWEN2MOE) {
  3789. LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
  3790. LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
  3791. }
  3792. if (arch == LLM_ARCH_QWEN3MOE) {
  3793. LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
  3794. }
  3795. if (arch == LLM_ARCH_MINICPM ||
  3796. arch == LLM_ARCH_GRANITE ||
  3797. arch == LLM_ARCH_GRANITE_MOE) {
  3798. LLAMA_LOG_INFO("%s: f_embedding_scale = %f\n", __func__, hparams.f_embedding_scale);
  3799. LLAMA_LOG_INFO("%s: f_residual_scale = %f\n", __func__, hparams.f_residual_scale);
  3800. LLAMA_LOG_INFO("%s: f_attention_scale = %f\n", __func__, hparams.f_attention_scale);
  3801. LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
  3802. }
  3803. if (arch == LLM_ARCH_BAILINGMOE) {
  3804. LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
  3805. LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
  3806. LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
  3807. LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
  3808. LLAMA_LOG_INFO("%s: expert_weights_norm = %d\n", __func__, hparams.expert_weights_norm);
  3809. }
  3810. vocab.print_info();
  3811. }
  3812. ggml_backend_dev_t llama_model::dev_layer(int il) const {
  3813. return pimpl->dev_layer.at(il).dev;
  3814. }
  3815. ggml_backend_dev_t llama_model::dev_output() const {
  3816. return pimpl->dev_output.dev;
  3817. }
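// check whether the device can run the op built by fn when its sources are placed in a buffer of type buft
// a zero-size dummy buffer is assigned to the sources, so nothing is actually allocated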
  3818. template<typename F>
  3819. static bool buft_supported(ggml_backend_buffer_type_t buft, ggml_backend_dev_t dev, F & fn) {
  3820. ggml_init_params params = {
  3821. /*.mem_size =*/ ggml_tensor_overhead()*8,
  3822. /*.mem_buffer =*/ NULL,
  3823. /*.no_alloc =*/ true,
  3824. };
  3825. ggml_context_ptr ctx { ggml_init(params) };
  3826. if (!ctx) {
  3827. throw std::runtime_error(format("failed to create ggml context"));
  3828. }
  3829. ggml_backend_buffer_ptr buf { ggml_backend_buft_alloc_buffer(buft, 0) };
  3830. ggml_tensor * op_tensor = fn(ctx.get());
  3831. for (int i = 0; i < GGML_MAX_SRC; i++) {
  3832. if (op_tensor->src[i] != nullptr) {
  3833. assert(op_tensor->src[i]->buffer == nullptr);
  3834. op_tensor->src[i]->buffer = buf.get();
  3835. }
  3836. }
  3837. bool op_supported = ggml_backend_dev_supports_op(dev, op_tensor);
  3838. return op_supported;
  3839. }
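// Walk the priority-ordered (device, buffer type) list and return the first buffer
// type whose device supports the op built by `fn`; throws if no candidate qualifies.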
  3840. template<typename F>
  3841. static ggml_backend_buffer_type_t select_buft(const buft_list_t & buft_list, const F & fn) {
  3842. for (const auto & cur : buft_list) {
  3843. ggml_backend_dev_t cur_dev = cur.first;
  3844. ggml_backend_buffer_type_t cur_buft = cur.second;
  3845. if (buft_supported(cur_buft, cur_dev, fn)) {
  3846. return cur_buft;
  3847. }
  3848. }
  3849. throw std::runtime_error(format("no suitable buffer type found"));
  3850. }
  3851. ggml_backend_buffer_type_t llama_model::select_buft(int il) const {
  3852. return ::select_buft(
  3853. *pimpl->dev_layer.at(il).buft_list,
  3854. [&](ggml_context * ctx) {
  3855. ggml_tensor * cur = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hparams.n_embd);
  3856. ggml_tensor * layer_dir = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hparams.n_embd);
  3857. return ggml_add(ctx, cur, layer_dir);
  3858. });
  3859. }
  3860. bool llama_model::has_tensor_overrides() const {
  3861. return pimpl->has_tensor_overrides;
  3862. }
  3863. const ggml_tensor * llama_model::get_tensor(const char * name) const {
  3864. auto it = std::find_if(tensors_by_name.begin(), tensors_by_name.end(),
  3865. [name](const std::pair<std::string, ggml_tensor *> & it) {
  3866. return it.first == name;
  3867. });
  3868. if (it == tensors_by_name.end()) {
  3869. return nullptr;
  3870. }
  3871. return it->second;
  3872. }
  3873. float llama_model::get_rope_freq_base (const llama_cparams & cparams, int il) const {
  3874. return hparams.is_swa(il) ? hparams.rope_freq_base_train_swa : cparams.rope_freq_base;
  3875. }
  3876. float llama_model::get_rope_freq_scale(const llama_cparams & cparams, int il) const {
  3877. return hparams.is_swa(il) ? hparams.rope_freq_scale_train_swa : cparams.rope_freq_scale;
  3878. }
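// Per-layer RoPE frequency factors: prefer the model-supplied rope_freqs tensor when
// present (e.g. Llama 3.x rope scaling); otherwise choose between the long/short
// factors (Phi-3 style) based on whether the per-sequence context exceeds the
// original YaRN training context.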
  3879. ggml_tensor * llama_model::get_rope_factors(const llama_cparams & cparams, int il) const {
  3880. const uint32_t n_ctx_per_seq = cparams.n_ctx / cparams.n_seq_max;
  3881. // choose long/short freq factors based on the context size
  3882. if (layers[il].rope_freqs != nullptr) {
  3883. return layers[il].rope_freqs;
  3884. }
  3885. if (n_ctx_per_seq > hparams.n_ctx_orig_yarn) {
  3886. return layers[il].rope_long;
  3887. }
  3888. return layers[il].rope_short;
  3889. }
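// Standard LLaMA-style decoder graph: RMSNorm -> QKV (optional biases) -> RoPE ->
// KV-cached attention -> residual -> RMSNorm -> SwiGLU FFN or routed MoE -> residual,
// followed by the final RMSNorm and the lm_head projection.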
  3890. struct llm_build_llama : public llm_graph_context {
  3891. llm_build_llama(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
  3892. const int64_t n_embd_head = hparams.n_embd_head_v;
  3893. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  3894. GGML_ASSERT(n_embd_head == hparams.n_rot);
  3895. ggml_tensor * cur;
  3896. ggml_tensor * inpL;
  3897. inpL = build_inp_embd(model.tok_embd);
  3898. // inp_pos - contains the positions
  3899. ggml_tensor * inp_pos = build_inp_pos();
  3900. auto * inp_attn = build_attn_inp_kv_unified();
  3901. const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
  3902. ggml_tensor * inp_out_ids = build_inp_out_ids();
  3903. for (int il = 0; il < n_layer; ++il) {
  3904. ggml_tensor * inpSA = inpL;
  3905. // norm
  3906. cur = build_norm(inpL,
  3907. model.layers[il].attn_norm, NULL,
  3908. LLM_NORM_RMS, il);
  3909. cb(cur, "attn_norm", il);
  3910. // self-attention
  3911. {
  3912. // rope freq factors for llama3; may return nullptr for llama2 and other models
  3913. ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
  3914. // compute Q and K and RoPE them
  3915. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  3916. cb(Qcur, "Qcur", il);
  3917. if (model.layers[il].bq) {
  3918. Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
  3919. cb(Qcur, "Qcur", il);
  3920. }
  3921. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  3922. cb(Kcur, "Kcur", il);
  3923. if (model.layers[il].bk) {
  3924. Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
  3925. cb(Kcur, "Kcur", il);
  3926. }
  3927. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  3928. cb(Vcur, "Vcur", il);
  3929. if (model.layers[il].bv) {
  3930. Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
  3931. cb(Vcur, "Vcur", il);
  3932. }
  3933. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  3934. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  3935. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  3936. Qcur = ggml_rope_ext(
  3937. ctx0, Qcur, inp_pos, rope_factors,
  3938. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  3939. ext_factor, attn_factor, beta_fast, beta_slow
  3940. );
  3941. Kcur = ggml_rope_ext(
  3942. ctx0, Kcur, inp_pos, rope_factors,
  3943. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  3944. ext_factor, attn_factor, beta_fast, beta_slow
  3945. );
  3946. cb(Qcur, "Qcur", il);
  3947. cb(Kcur, "Kcur", il);
  3948. cb(Vcur, "Vcur", il);
  3949. cur = build_attn(inp_attn, gf,
  3950. model.layers[il].wo, model.layers[il].bo,
  3951. Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
  3952. cb(cur, "attn_out", il);
  3953. }
  3954. if (il == n_layer - 1 && inp_out_ids) {
  3955. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  3956. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  3957. }
  3958. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  3959. cb(ffn_inp, "ffn_inp", il);
  3960. // feed-forward network (non-MoE)
  3961. if (model.layers[il].ffn_gate_inp == nullptr) {
  3962. cur = build_norm(ffn_inp,
  3963. model.layers[il].ffn_norm, NULL,
  3964. LLM_NORM_RMS, il);
  3965. cb(cur, "ffn_norm", il);
  3966. cur = build_ffn(cur,
  3967. model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
  3968. model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
  3969. model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
  3970. NULL,
  3971. LLM_FFN_SILU, LLM_FFN_PAR, il);
  3972. cb(cur, "ffn_out", il);
  3973. } else {
  3974. // MoE branch
  3975. cur = build_norm(ffn_inp,
  3976. model.layers[il].ffn_norm, NULL,
  3977. LLM_NORM_RMS, il);
  3978. cb(cur, "ffn_norm", il);
  3979. cur = build_moe_ffn(cur,
  3980. model.layers[il].ffn_gate_inp,
  3981. model.layers[il].ffn_up_exps,
  3982. model.layers[il].ffn_gate_exps,
  3983. model.layers[il].ffn_down_exps,
  3984. nullptr,
  3985. n_expert, n_expert_used,
  3986. LLM_FFN_SILU, true,
  3987. false, 0.0,
  3988. LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
  3989. il);
  3990. cb(cur, "ffn_moe_out", il);
  3991. }
  3992. cur = ggml_add(ctx0, cur, ffn_inp);
  3993. cb(cur, "ffn_out", il);
  3994. cur = build_cvec(cur, il);
  3995. cb(cur, "l_out", il);
  3996. // input for next layer
  3997. inpL = cur;
  3998. }
  3999. cur = inpL;
  4000. cur = build_norm(cur,
  4001. model.output_norm, NULL,
  4002. LLM_NORM_RMS, -1);
  4003. cb(cur, "result_norm", -1);
  4004. res->t_embd = cur;
  4005. // lm_head
  4006. cur = build_lora_mm(model.output, cur);
  4007. cb(cur, "result_output", -1);
  4008. res->t_logits = cur;
  4009. ggml_build_forward_expand(gf, cur);
  4010. }
  4011. };
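// Llama 4-style variant with interleaved sliding-window attention (iSWA): every
// n_no_rope_layer_step-th layer skips RoPE and instead scales Q by the
// temperature-tuning input; RoPE layers may additionally RMS-normalize Q/K.
// MoE layers combine routed experts (sigmoid gating) with a shared-expert FFN.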
  4012. struct llm_build_llama_iswa : public llm_graph_context {
  4013. llm_build_llama_iswa(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
  4014. const int64_t n_embd_head = hparams.n_embd_head_v;
  4015. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  4016. GGML_ASSERT(n_embd_head == hparams.n_rot);
  4017. ggml_tensor * cur;
  4018. ggml_tensor * inpL;
  4019. inpL = build_inp_embd(model.tok_embd);
  4020. // inp_pos - contains the positions
  4021. ggml_tensor * inp_pos = build_inp_pos();
  4022. // temperature tuning
  4023. ggml_tensor * inp_attn_scale = nullptr;
  4024. inp_attn_scale = build_inp_attn_scale();
  4025. auto * inp_attn = build_attn_inp_kv_unified_iswa();
  4026. const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
  4027. ggml_tensor * inp_out_ids = build_inp_out_ids();
  4028. for (int il = 0; il < n_layer; ++il) {
  4029. ggml_tensor * inpSA = inpL;
  4030. const bool use_rope = (il + 1) % hparams.n_no_rope_layer_step != 0;
  4031. // norm
  4032. cur = build_norm(inpL,
  4033. model.layers[il].attn_norm, NULL,
  4034. LLM_NORM_RMS, il);
  4035. cb(cur, "attn_norm", il);
  4036. // self-attention
  4037. {
  4038. // rope freq factors for llama3; may return nullptr for llama2 and other models
  4039. ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
  4040. // compute Q and K and RoPE them
  4041. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  4042. cb(Qcur, "Qcur", il);
  4043. if (model.layers[il].bq) {
  4044. Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
  4045. cb(Qcur, "Qcur", il);
  4046. }
  4047. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  4048. cb(Kcur, "Kcur", il);
  4049. if (model.layers[il].bk) {
  4050. Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
  4051. cb(Kcur, "Kcur", il);
  4052. }
  4053. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  4054. cb(Vcur, "Vcur", il);
  4055. if (model.layers[il].bv) {
  4056. Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
  4057. cb(Vcur, "Vcur", il);
  4058. }
  4059. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  4060. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  4061. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  4062. if (use_rope) {
  4063. Qcur = ggml_rope_ext(
  4064. ctx0, Qcur, inp_pos, rope_factors,
  4065. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  4066. ext_factor, attn_factor, beta_fast, beta_slow
  4067. );
  4068. Kcur = ggml_rope_ext(
  4069. ctx0, Kcur, inp_pos, rope_factors,
  4070. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  4071. ext_factor, attn_factor, beta_fast, beta_slow
  4072. );
  4073. } else if (inp_attn_scale) {
  4074. Qcur = ggml_mul(ctx0, Qcur, inp_attn_scale);
  4075. }
  4076. cb(Qcur, "Qcur", il);
  4077. cb(Kcur, "Kcur", il);
  4078. cb(Vcur, "Vcur", il);
  4079. if (use_rope && hparams.use_kq_norm) {
  4080. // Llama4TextL2Norm
  4081. Qcur = ggml_rms_norm(ctx0, Qcur, hparams.f_norm_rms_eps);
  4082. Kcur = ggml_rms_norm(ctx0, Kcur, hparams.f_norm_rms_eps);
  4083. cb(Qcur, "Qcur_normed", il);
  4084. cb(Kcur, "Kcur_normed", il);
  4085. }
  4086. cur = build_attn(inp_attn, gf,
  4087. model.layers[il].wo, model.layers[il].bo,
  4088. Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
  4089. cb(cur, "attn_out", il);
  4090. }
  4091. if (il == n_layer - 1 && inp_out_ids) {
  4092. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  4093. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  4094. }
  4095. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  4096. cb(ffn_inp, "ffn_inp", il);
  4097. // feed-forward network (non-MoE)
  4098. if (model.layers[il].ffn_gate_inp == nullptr) {
  4099. cur = build_norm(ffn_inp,
  4100. model.layers[il].ffn_norm, NULL,
  4101. LLM_NORM_RMS, il);
  4102. cb(cur, "ffn_norm", il);
  4103. cur = build_ffn(cur,
  4104. model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
  4105. model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
  4106. model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
  4107. NULL,
  4108. LLM_FFN_SILU, LLM_FFN_PAR, il);
  4109. cb(cur, "ffn_out", il);
  4110. } else {
  4111. ggml_tensor * ffn_inp_normed = build_norm(ffn_inp,
  4112. model.layers[il].ffn_norm, NULL,
  4113. LLM_NORM_RMS, il);
4114. cb(ffn_inp_normed, "ffn_norm", il);
  4115. ggml_tensor * moe_out = build_moe_ffn(ffn_inp_normed,
  4116. model.layers[il].ffn_gate_inp,
  4117. model.layers[il].ffn_up_exps,
  4118. model.layers[il].ffn_gate_exps,
  4119. model.layers[il].ffn_down_exps,
  4120. nullptr,
  4121. n_expert, n_expert_used,
  4122. LLM_FFN_SILU, false,
  4123. false, 0.0,
  4124. LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID,
  4125. il);
  4126. // Shared experts
  4127. ggml_tensor * shexp_out = build_ffn(ffn_inp_normed,
  4128. model.layers[il].ffn_up_shexp, NULL, NULL,
  4129. model.layers[il].ffn_gate_shexp, NULL, NULL,
  4130. model.layers[il].ffn_down_shexp, NULL, NULL,
  4131. NULL,
  4132. LLM_FFN_SILU, LLM_FFN_PAR, il);
  4133. cb(shexp_out, "ffn_moe_shexp", il);
  4134. cur = ggml_add(ctx0, moe_out, shexp_out);
  4135. cb(cur, "ffn_moe_out_merged", il);
  4136. }
  4137. cur = ggml_add(ctx0, cur, ffn_inp);
  4138. cb(cur, "ffn_out", il);
  4139. cur = build_cvec(cur, il);
  4140. cb(cur, "l_out", il);
  4141. // input for next layer
  4142. inpL = cur;
  4143. }
  4144. cur = inpL;
  4145. cur = build_norm(cur,
  4146. model.output_norm, NULL,
  4147. LLM_NORM_RMS, -1);
  4148. cb(cur, "result_norm", -1);
  4149. res->t_embd = cur;
  4150. // lm_head
  4151. cur = build_lora_mm(model.output, cur);
  4152. cb(cur, "result_output", -1);
  4153. res->t_logits = cur;
  4154. ggml_build_forward_expand(gf, cur);
  4155. }
  4156. };
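// DeciLM / Llama-3_1-Nemotron graph: head counts and FFN sizes vary per layer.
// n_head == 0 marks an attention-free layer, n_head_kv == 0 a "linear attention"
// layer (output projection only), and n_ff == 0 an FFN-free layer.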
  4157. struct llm_build_deci : public llm_graph_context {
  4158. llm_build_deci(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
  4159. const int64_t n_embd_head = hparams.n_embd_head_v;
  4160. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  4161. GGML_ASSERT(n_embd_head == hparams.n_rot);
  4162. ggml_tensor * cur;
  4163. ggml_tensor * inpL;
  4164. inpL = build_inp_embd(model.tok_embd);
  4165. // inp_pos - contains the positions
  4166. ggml_tensor * inp_pos = build_inp_pos();
  4167. auto * inp_attn = build_attn_inp_kv_unified();
  4168. const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
  4169. ggml_tensor * inp_out_ids = build_inp_out_ids();
  4170. for (int il = 0; il < n_layer; ++il) {
  4171. ggml_tensor * inpSA = inpL;
  4172. const int64_t n_head_kv = hparams.n_head_kv(il);
  4173. const int64_t n_head = hparams.n_head(il);
  4174. const int64_t n_ff = hparams.n_ff(il);
  4175. if (n_head == 0) {
  4176. // attention-free layer of Llama-3_1-Nemotron-51B
  4177. cur = inpL;
  4178. } else {
  4179. // norm
  4180. cur = build_norm(inpL,
  4181. model.layers[il].attn_norm, NULL,
  4182. LLM_NORM_RMS, il);
  4183. cb(cur, "attn_norm", il);
  4184. }
  4185. if (n_head > 0 && n_head_kv == 0) {
  4186. // "linear attention" of Llama-3_1-Nemotron-51B
  4187. cur = build_lora_mm(model.layers[il].wo, cur);
  4188. cb(cur, "wo", il);
  4189. } else if (n_head > 0) {
  4190. // self-attention
  4191. // rope freq factors for llama3; may return nullptr for llama2 and other models
  4192. ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
  4193. // compute Q and K and RoPE them
  4194. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  4195. cb(Qcur, "Qcur", il);
  4196. if (model.layers[il].bq) {
  4197. Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
  4198. cb(Qcur, "Qcur", il);
  4199. }
  4200. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  4201. cb(Kcur, "Kcur", il);
  4202. if (model.layers[il].bk) {
  4203. Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
  4204. cb(Kcur, "Kcur", il);
  4205. }
  4206. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  4207. cb(Vcur, "Vcur", il);
  4208. if (model.layers[il].bv) {
  4209. Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
  4210. cb(Vcur, "Vcur", il);
  4211. }
  4212. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  4213. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  4214. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  4215. Qcur = ggml_rope_ext(
  4216. ctx0, Qcur, inp_pos, rope_factors,
  4217. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  4218. ext_factor, attn_factor, beta_fast, beta_slow
  4219. );
  4220. Kcur = ggml_rope_ext(
  4221. ctx0, Kcur, inp_pos, rope_factors,
  4222. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  4223. ext_factor, attn_factor, beta_fast, beta_slow
  4224. );
  4225. cb(Qcur, "Qcur", il);
  4226. cb(Kcur, "Kcur", il);
  4227. cb(Vcur, "Vcur", il);
  4228. cur = build_attn(inp_attn, gf,
  4229. model.layers[il].wo, model.layers[il].bo,
  4230. Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
  4231. }
  4232. if (il == n_layer - 1 && inp_out_ids) {
  4233. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  4234. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  4235. }
  4236. // FFN-free layer of Llama-3_1-Nemotron-Ultra-253B
  4237. if (n_ff == 0) {
  4238. continue;
  4239. }
  4240. // modified to support attention-free layer of Llama-3_1-Nemotron-51B
  4241. ggml_tensor * ffn_inp = cur;
  4242. if (n_head > 0) {
  4243. ffn_inp = ggml_add(ctx0, cur, inpSA);
  4244. cb(ffn_inp, "ffn_inp", il);
  4245. }
  4246. // feed-forward network
  4247. if (model.layers[il].ffn_gate_inp == nullptr) {
  4248. cur = build_norm(ffn_inp,
  4249. model.layers[il].ffn_norm, NULL,
  4250. LLM_NORM_RMS, il);
  4251. cb(cur, "ffn_norm", il);
  4252. cur = build_ffn(cur,
  4253. model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
  4254. model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
  4255. model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
  4256. NULL,
  4257. LLM_FFN_SILU, LLM_FFN_PAR, il);
  4258. cb(cur, "ffn_out", il);
  4259. }
  4260. cur = ggml_add(ctx0, cur, ffn_inp);
  4261. cb(cur, "ffn_out", il);
  4262. cur = build_cvec(cur, il);
  4263. cb(cur, "l_out", il);
  4264. // input for next layer
  4265. inpL = cur;
  4266. }
  4267. cur = inpL;
  4268. cur = build_norm(cur,
  4269. model.output_norm, NULL,
  4270. LLM_NORM_RMS, -1);
  4271. cb(cur, "result_norm", -1);
  4272. res->t_embd = cur;
  4273. // lm_head
  4274. cur = build_lora_mm(model.output, cur);
  4275. cb(cur, "result_output", -1);
  4276. res->t_logits = cur;
  4277. ggml_build_forward_expand(gf, cur);
  4278. }
  4279. };
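// Baichuan: the 7B variant applies RoPE, while the 13B variant performs no rotary
// embedding here and relies on ALiBi biases applied inside the attention.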
  4280. struct llm_build_baichuan : public llm_graph_context {
  4281. llm_build_baichuan(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
  4282. const int64_t n_embd_head = hparams.n_embd_head_v;
  4283. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  4284. GGML_ASSERT(n_embd_head == hparams.n_rot);
  4285. ggml_tensor * cur;
  4286. ggml_tensor * inpL;
  4287. inpL = build_inp_embd(model.tok_embd);
  4288. // inp_pos - contains the positions
  4289. ggml_tensor * inp_pos = model.type == LLM_TYPE_7B ? build_inp_pos() : nullptr;
  4290. auto * inp_attn = build_attn_inp_kv_unified();
  4291. ggml_tensor * inp_out_ids = build_inp_out_ids();
  4292. for (int il = 0; il < n_layer; ++il) {
  4293. ggml_tensor * inpSA = inpL;
  4294. cur = build_norm(inpL,
  4295. model.layers[il].attn_norm, NULL,
  4296. LLM_NORM_RMS, il);
  4297. cb(cur, "attn_norm", il);
  4298. // self-attention
  4299. {
  4300. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  4301. cb(Qcur, "Qcur", il);
  4302. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  4303. cb(Kcur, "Kcur", il);
  4304. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  4305. cb(Vcur, "Vcur", il);
  4306. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  4307. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  4308. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  4309. switch (model.type) {
  4310. case LLM_TYPE_7B:
  4311. Qcur = ggml_rope_ext(
  4312. ctx0, Qcur, inp_pos, nullptr,
  4313. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  4314. ext_factor, attn_factor, beta_fast, beta_slow
  4315. );
  4316. Kcur = ggml_rope_ext(
  4317. ctx0, Kcur, inp_pos, nullptr,
  4318. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  4319. ext_factor, attn_factor, beta_fast, beta_slow
  4320. );
  4321. break;
  4322. case LLM_TYPE_13B:
  4323. break;
  4324. default:
  4325. GGML_ABORT("fatal error");
  4326. }
  4327. cb(Qcur, "Qcur", il);
  4328. cb(Kcur, "Kcur", il);
  4329. cb(Vcur, "Vcur", il);
  4330. cur = build_attn(inp_attn, gf,
  4331. model.layers[il].wo, NULL,
  4332. Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  4333. }
  4334. if (il == n_layer - 1 && inp_out_ids) {
  4335. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  4336. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  4337. }
  4338. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  4339. cb(ffn_inp, "ffn_inp", il);
  4340. // feed-forward network
  4341. {
  4342. cur = build_norm(ffn_inp,
  4343. model.layers[il].ffn_norm, NULL,
  4344. LLM_NORM_RMS, il);
  4345. cb(cur, "ffn_norm", il);
  4346. cur = build_ffn(cur,
  4347. model.layers[il].ffn_up, NULL, NULL,
  4348. model.layers[il].ffn_gate, NULL, NULL,
  4349. model.layers[il].ffn_down, NULL, NULL,
  4350. NULL,
  4351. LLM_FFN_SILU, LLM_FFN_PAR, il);
  4352. cb(cur, "ffn_out", il);
  4353. }
  4354. cur = ggml_add(ctx0, cur, ffn_inp);
  4355. cur = build_cvec(cur, il);
  4356. cb(cur, "l_out", il);
  4357. // input for next layer
  4358. inpL = cur;
  4359. }
  4360. cur = inpL;
  4361. cur = build_norm(cur,
  4362. model.output_norm, NULL,
  4363. LLM_NORM_RMS, -1);
  4364. cb(cur, "result_norm", -1);
  4365. res->t_embd = cur;
  4366. // lm_head
  4367. cur = build_lora_mm(model.output, cur);
  4368. cb(cur, "result_output", -1);
  4369. res->t_logits = cur;
  4370. ggml_build_forward_expand(gf, cur);
  4371. }
  4372. };
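// XVERSE: LLaMA-like decoder (RMSNorm, RoPE, SwiGLU FFN) without attention biases.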
  4373. struct llm_build_xverse : public llm_graph_context {
  4374. llm_build_xverse(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
  4375. const int64_t n_embd_head = hparams.n_embd_head_v;
  4376. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  4377. GGML_ASSERT(n_embd_head == hparams.n_rot);
  4378. ggml_tensor * cur;
  4379. ggml_tensor * inpL;
  4380. inpL = build_inp_embd(model.tok_embd);
  4381. // inp_pos - contains the positions
  4382. ggml_tensor * inp_pos = build_inp_pos();
  4383. auto * inp_attn = build_attn_inp_kv_unified();
  4384. ggml_tensor * inp_out_ids = build_inp_out_ids();
  4385. for (int il = 0; il < n_layer; ++il) {
  4386. ggml_tensor * inpSA = inpL;
  4387. cur = build_norm(inpL,
  4388. model.layers[il].attn_norm, NULL,
  4389. LLM_NORM_RMS, il);
  4390. cb(cur, "attn_norm", il);
  4391. // self-attention
  4392. {
  4393. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  4394. cb(Qcur, "Qcur", il);
  4395. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  4396. cb(Kcur, "Kcur", il);
  4397. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  4398. cb(Vcur, "Vcur", il);
  4399. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  4400. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  4401. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  4402. Qcur = ggml_rope_ext(
  4403. ctx0, Qcur, inp_pos, nullptr,
  4404. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  4405. ext_factor, attn_factor, beta_fast, beta_slow
  4406. );
  4407. Kcur = ggml_rope_ext(
  4408. ctx0, Kcur, inp_pos, nullptr,
  4409. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  4410. ext_factor, attn_factor, beta_fast, beta_slow
  4411. );
  4412. cb(Qcur, "Qcur", il);
  4413. cb(Kcur, "Kcur", il);
  4414. cb(Vcur, "Vcur", il);
  4415. cur = build_attn(inp_attn, gf,
  4416. model.layers[il].wo, NULL,
  4417. Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  4418. }
  4419. if (il == n_layer - 1 && inp_out_ids) {
  4420. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  4421. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  4422. }
  4423. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  4424. cb(ffn_inp, "ffn_inp", il);
  4425. // feed-forward network
  4426. {
  4427. cur = build_norm(ffn_inp,
  4428. model.layers[il].ffn_norm, NULL,
  4429. LLM_NORM_RMS, il);
  4430. cb(cur, "ffn_norm", il);
  4431. cur = build_ffn(cur,
  4432. model.layers[il].ffn_up, NULL, NULL,
  4433. model.layers[il].ffn_gate, NULL, NULL,
  4434. model.layers[il].ffn_down, NULL, NULL,
  4435. NULL,
  4436. LLM_FFN_SILU, LLM_FFN_PAR, il);
  4437. cb(cur, "ffn_out", il);
  4438. }
  4439. cur = ggml_add(ctx0, cur, ffn_inp);
  4440. cur = build_cvec(cur, il);
  4441. cb(cur, "l_out", il);
  4442. // input for next layer
  4443. inpL = cur;
  4444. }
  4445. cur = inpL;
  4446. cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
  4447. cb(cur, "result_norm", -1);
  4448. res->t_embd = cur;
  4449. // lm_head
  4450. cur = build_lora_mm(model.output, cur);
  4451. cb(cur, "result_output", -1);
  4452. res->t_logits = cur;
  4453. ggml_build_forward_expand(gf, cur);
  4454. }
  4455. };
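// Falcon: fused QKV projection with NeoX-style RoPE and a parallel attention/MLP
// block - the FFN reads the attention-norm output and both the attention and FFN
// results are added to the residual stream. Falcon-40B uses a separate second norm
// for the attention input.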
  4456. struct llm_build_falcon : public llm_graph_context {
  4457. llm_build_falcon(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
  4458. const int64_t n_embd_head = hparams.n_embd_head_v;
  4459. const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
  4460. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  4461. GGML_ASSERT(n_embd_head == hparams.n_rot);
  4462. ggml_tensor * cur;
  4463. ggml_tensor * inpL;
  4464. inpL = build_inp_embd(model.tok_embd);
  4465. // inp_pos - contains the positions
  4466. ggml_tensor * inp_pos = build_inp_pos();
  4467. auto * inp_attn = build_attn_inp_kv_unified();
  4468. ggml_tensor * inp_out_ids = build_inp_out_ids();
  4469. for (int il = 0; il < n_layer; ++il) {
  4470. ggml_tensor * attn_norm;
  4471. attn_norm = build_norm(inpL,
  4472. model.layers[il].attn_norm,
  4473. model.layers[il].attn_norm_b,
  4474. LLM_NORM, il);
  4475. cb(attn_norm, "attn_norm", il);
  4476. // self-attention
  4477. {
  4478. if (model.layers[il].attn_norm_2) {
  4479. // Falcon-40B
  4480. cur = build_norm(inpL,
  4481. model.layers[il].attn_norm_2,
  4482. model.layers[il].attn_norm_2_b,
  4483. LLM_NORM, il);
  4484. cb(cur, "attn_norm_2", il);
  4485. } else {
  4486. cur = attn_norm;
  4487. }
  4488. cur = build_lora_mm(model.layers[il].wqkv, cur);
  4489. cb(cur, "wqkv", il);
  4490. ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
  4491. ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
  4492. ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
  4493. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  4494. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  4495. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  4496. // using mode = 2 for neox mode
  4497. Qcur = ggml_rope_ext(
  4498. ctx0, Qcur, inp_pos, nullptr,
  4499. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  4500. ext_factor, attn_factor, beta_fast, beta_slow
  4501. );
  4502. Kcur = ggml_rope_ext(
  4503. ctx0, Kcur, inp_pos, nullptr,
  4504. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  4505. ext_factor, attn_factor, beta_fast, beta_slow
  4506. );
  4507. cb(Qcur, "Qcur", il);
  4508. cb(Kcur, "Kcur", il);
  4509. cb(Vcur, "Vcur", il);
  4510. cur = build_attn(inp_attn, gf,
  4511. model.layers[il].wo, NULL,
  4512. Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  4513. }
  4514. if (il == n_layer - 1 && inp_out_ids) {
  4515. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  4516. inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
  4517. attn_norm = ggml_get_rows(ctx0, attn_norm, inp_out_ids);
  4518. }
  4519. ggml_tensor * ffn_inp = cur;
  4520. // feed forward
  4521. {
  4522. cur = build_ffn(attn_norm, // !! use the attn norm, not the result
  4523. model.layers[il].ffn_up, NULL, NULL,
  4524. NULL, NULL, NULL,
  4525. model.layers[il].ffn_down, NULL, NULL,
  4526. NULL,
  4527. LLM_FFN_GELU, LLM_FFN_SEQ, il);
  4528. cb(cur, "ffn_out", il);
  4529. }
  4530. cur = ggml_add(ctx0, cur, ffn_inp);
  4531. cur = ggml_add(ctx0, cur, inpL);
  4532. cur = build_cvec(cur, il);
  4533. cb(cur, "l_out", il);
  4534. // input for next layer
  4535. inpL = cur;
  4536. }
  4537. cur = inpL;
  4538. // norm
  4539. cur = build_norm(cur,
  4540. model.output_norm,
  4541. model.output_norm_b,
  4542. LLM_NORM, -1);
  4543. cb(cur, "result_norm", -1);
  4544. res->t_embd = cur;
  4545. cur = build_lora_mm(model.output, cur);
  4546. cb(cur, "result_output", -1);
  4547. res->t_logits = cur;
  4548. ggml_build_forward_expand(gf, cur);
  4549. }
  4550. };
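// Grok-1: token embeddings are scaled by embedding_multiplier_scale, the attention
// and MoE outputs get their own RMS norms when present, the MoE uses GELU experts,
// and the final logits are scaled by output_multiplier_scale (~1/sqrt(3)).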
  4551. struct llm_build_grok : public llm_graph_context {
  4552. llm_build_grok(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
  4553. const int64_t n_embd_head = hparams.n_embd_head_v;
  4554. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  4555. GGML_ASSERT(n_embd_head == hparams.n_rot);
  4556. ggml_tensor * cur;
  4557. ggml_tensor * inpL;
  4558. inpL = build_inp_embd(model.tok_embd);
  4559. // multiply by embedding_multiplier_scale of 78.38367176906169
  4560. inpL = ggml_scale(ctx0, inpL, 78.38367176906169f);
  4561. // inp_pos - contains the positions
  4562. ggml_tensor * inp_pos = build_inp_pos();
  4563. auto * inp_attn = build_attn_inp_kv_unified();
  4564. ggml_tensor * inp_out_ids = build_inp_out_ids();
  4565. for (int il = 0; il < n_layer; ++il) {
  4566. ggml_tensor * inpSA = inpL;
  4567. // norm
  4568. cur = build_norm(inpL,
  4569. model.layers[il].attn_norm, NULL,
  4570. LLM_NORM_RMS, il);
  4571. cb(cur, "attn_norm", il);
  4572. // self-attention
  4573. {
  4574. // compute Q and K and RoPE them
  4575. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  4576. cb(Qcur, "Qcur", il);
  4577. if (model.layers[il].bq) {
  4578. Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
  4579. cb(Qcur, "Qcur", il);
  4580. }
  4581. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  4582. cb(Kcur, "Kcur", il);
  4583. if (model.layers[il].bk) {
  4584. Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
  4585. cb(Kcur, "Kcur", il);
  4586. }
  4587. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  4588. cb(Vcur, "Vcur", il);
  4589. if (model.layers[il].bv) {
  4590. Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
  4591. cb(Vcur, "Vcur", il);
  4592. }
  4593. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  4594. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  4595. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  4596. Qcur = ggml_rope_ext(
  4597. ctx0, Qcur, inp_pos, nullptr,
  4598. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  4599. ext_factor, attn_factor, beta_fast, beta_slow
  4600. );
  4601. Kcur = ggml_rope_ext(
  4602. ctx0, Kcur, inp_pos, nullptr,
  4603. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  4604. ext_factor, attn_factor, beta_fast, beta_slow
  4605. );
  4606. cb(Qcur, "Qcur", il);
  4607. cb(Kcur, "Kcur", il);
  4608. cb(Vcur, "Vcur", il);
  4609. cur = build_attn(inp_attn, gf,
  4610. model.layers[il].wo, model.layers[il].bo,
  4611. Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
  4612. }
  4613. if (il == n_layer - 1 && inp_out_ids) {
  4614. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  4615. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  4616. }
  4617. // Grok
  4618. // if attn_out_norm is present then apply it before adding the input
  4619. if (model.layers[il].attn_out_norm) {
  4620. cur = build_norm(cur,
  4621. model.layers[il].attn_out_norm, NULL,
  4622. LLM_NORM_RMS, il);
  4623. cb(cur, "attn_out_norm", il);
  4624. }
  4625. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  4626. cb(ffn_inp, "ffn_inp", il);
  4627. // feed-forward network
  4628. // MoE branch
  4629. cur = build_norm(ffn_inp,
  4630. model.layers[il].ffn_norm, NULL,
  4631. LLM_NORM_RMS, il);
  4632. cb(cur, "ffn_norm", il);
  4633. cur = build_moe_ffn(cur,
  4634. model.layers[il].ffn_gate_inp,
  4635. model.layers[il].ffn_up_exps,
  4636. model.layers[il].ffn_gate_exps,
  4637. model.layers[il].ffn_down_exps,
  4638. nullptr,
  4639. n_expert, n_expert_used,
  4640. LLM_FFN_GELU, true,
  4641. false, 0.0,
  4642. LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
  4643. il);
  4644. cb(cur, "ffn_moe_out", il);
  4645. // Grok
  4646. // if layer_out_norm is present then apply it before adding the input
  4647. // Idea: maybe ffn_out_norm is a better name
  4648. if (model.layers[il].layer_out_norm) {
  4649. cur = build_norm(cur,
  4650. model.layers[il].layer_out_norm, NULL,
  4651. LLM_NORM_RMS, il);
  4652. cb(cur, "layer_out_norm", il);
  4653. }
  4654. cur = ggml_add(ctx0, cur, ffn_inp);
  4655. cb(cur, "ffn_out", il);
  4656. cur = build_cvec(cur, il);
  4657. cb(cur, "l_out", il);
  4658. // input for next layer
  4659. inpL = cur;
  4660. }
  4661. cur = inpL;
  4662. cur = build_norm(cur,
  4663. model.output_norm, NULL,
  4664. LLM_NORM_RMS, -1);
  4665. cb(cur, "result_norm", -1);
  4666. res->t_embd = cur;
  4667. // lm_head
  4668. cur = build_lora_mm(model.output, cur);
  4669. // Grok
  4670. // multiply logits by output_multiplier_scale of 0.5773502691896257
  4671. cur = ggml_scale(ctx0, cur, 0.5773502691896257f);
  4672. cb(cur, "result_output", -1);
  4673. res->t_logits = cur;
  4674. ggml_build_forward_expand(gf, cur);
  4675. }
  4676. };
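// DBRX: non-RMS LayerNorm, fused QKV projection clamped to +/- f_clamp_kqv, and a
// SiLU-activated MoE FFN with attn_out_norm as the pre-FFN norm.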
  4677. struct llm_build_dbrx : public llm_graph_context {
  4678. llm_build_dbrx(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
  4679. const int64_t n_embd_head = hparams.n_embd_head_v;
  4680. const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
  4681. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  4682. GGML_ASSERT(n_embd_head == hparams.n_rot);
  4683. ggml_tensor * cur;
  4684. ggml_tensor * inpL;
  4685. inpL = build_inp_embd(model.tok_embd);
  4686. // inp_pos - contains the positions
  4687. ggml_tensor * inp_pos = build_inp_pos();
  4688. auto * inp_attn = build_attn_inp_kv_unified();
  4689. ggml_tensor * inp_out_ids = build_inp_out_ids();
  4690. for (int il = 0; il < n_layer; ++il) {
  4691. ggml_tensor * inpSA = inpL;
  4692. // norm
  4693. cur = build_norm(inpL,
  4694. model.layers[il].attn_norm, NULL,
  4695. LLM_NORM, il);
  4696. cb(cur, "attn_norm", il);
  4697. // self-attention
  4698. {
  4699. ggml_tensor * Qcur = nullptr;
  4700. ggml_tensor * Kcur = nullptr;
  4701. ggml_tensor * Vcur = nullptr;
  4702. cur = build_lora_mm(model.layers[il].wqkv, cur);
  4703. cb(cur, "wqkv", il);
  4704. cur = ggml_clamp(ctx0, cur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
  4705. cb(cur, "wqkv_clamped", il);
  4706. Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
  4707. Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
  4708. Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
  4709. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  4710. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  4711. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  4712. Qcur = ggml_rope_ext(
  4713. ctx0, Qcur, inp_pos, nullptr,
  4714. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  4715. ext_factor, attn_factor, beta_fast, beta_slow
  4716. );
  4717. Kcur = ggml_rope_ext(
  4718. ctx0, Kcur, inp_pos, nullptr,
  4719. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  4720. ext_factor, attn_factor, beta_fast, beta_slow
  4721. );
  4722. cb(Qcur, "Qcur", il);
  4723. cb(Kcur, "Kcur", il);
  4724. cb(Vcur, "Vcur", il);
  4725. cur = build_attn(inp_attn, gf,
  4726. model.layers[il].wo, NULL,
  4727. Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  4728. }
  4729. if (il == n_layer - 1 && inp_out_ids) {
  4730. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  4731. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  4732. }
  4733. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  4734. cb(ffn_inp, "ffn_inp", il);
  4735. // feed-forward network
  4736. // MoE branch
  4737. cur = build_norm(ffn_inp,
  4738. model.layers[il].attn_out_norm, NULL,
  4739. LLM_NORM, il);
  4740. cb(cur, "attn_out_norm", il);
  4741. cur = build_moe_ffn(cur,
  4742. model.layers[il].ffn_gate_inp,
  4743. model.layers[il].ffn_up_exps,
  4744. model.layers[il].ffn_gate_exps,
  4745. model.layers[il].ffn_down_exps,
  4746. nullptr,
  4747. n_expert, n_expert_used,
  4748. LLM_FFN_SILU, true,
  4749. false, 0.0,
  4750. LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
  4751. il);
  4752. cb(cur, "ffn_moe_out", il);
  4753. cur = ggml_add(ctx0, cur, ffn_inp);
  4754. cb(cur, "ffn_out", il);
  4755. cur = build_cvec(cur, il);
  4756. cb(cur, "l_out", il);
  4757. // input for next layer
  4758. inpL = cur;
  4759. }
  4760. cur = inpL;
  4761. cur = build_norm(cur,
  4762. model.output_norm, NULL,
  4763. LLM_NORM, -1);
  4764. cb(cur, "result_norm", -1);
  4765. res->t_embd = cur;
  4766. // lm_head
  4767. cur = build_lora_mm(model.output, cur);
  4768. cb(cur, "result_output", -1);
  4769. res->t_logits = cur;
  4770. ggml_build_forward_expand(gf, cur);
  4771. }
  4772. };
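// StarCoder (GPT-2 style): learned absolute position embeddings added to the token
// embeddings, fused QKV with bias, LayerNorm, and a GELU MLP; no RoPE.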
  4773. struct llm_build_starcoder : public llm_graph_context {
  4774. llm_build_starcoder(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
  4775. const int64_t n_embd_head = hparams.n_embd_head_v;
  4776. const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
  4777. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  4778. ggml_tensor * cur;
  4779. ggml_tensor * inpL;
  4780. inpL = build_inp_embd(model.tok_embd);
  4781. // inp_pos - contains the positions
  4782. ggml_tensor * inp_pos = build_inp_pos();
  4783. auto * inp_attn = build_attn_inp_kv_unified();
  4784. ggml_tensor * pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos);
  4785. cb(pos, "pos_embd", -1);
  4786. inpL = ggml_add(ctx0, inpL, pos);
  4787. cb(inpL, "inpL", -1);
  4788. ggml_tensor * inp_out_ids = build_inp_out_ids();
  4789. for (int il = 0; il < n_layer; ++il) {
  4790. cur = build_norm(inpL,
  4791. model.layers[il].attn_norm,
  4792. model.layers[il].attn_norm_b,
  4793. LLM_NORM, il);
  4794. cb(cur, "attn_norm", il);
  4795. // self-attention
  4796. {
  4797. cur = build_lora_mm(model.layers[il].wqkv, cur);
  4798. cb(cur, "wqkv", il);
  4799. cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
  4800. cb(cur, "bqkv", il);
  4801. ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
  4802. ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
  4803. ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
  4804. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  4805. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  4806. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  4807. cb(Qcur, "Qcur", il);
  4808. cb(Kcur, "Kcur", il);
  4809. cb(Vcur, "Vcur", il);
  4810. cur = build_attn(inp_attn, gf,
  4811. model.layers[il].wo, model.layers[il].bo,
  4812. Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  4813. }
  4814. if (il == n_layer - 1 && inp_out_ids) {
  4815. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  4816. inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
  4817. }
  4818. // add the input
  4819. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
  4820. cb(ffn_inp, "ffn_inp", il);
  4821. // FF
  4822. {
  4823. cur = build_norm(ffn_inp,
  4824. model.layers[il].ffn_norm,
  4825. model.layers[il].ffn_norm_b,
  4826. LLM_NORM, il);
  4827. cb(cur, "ffn_norm", il);
  4828. cur = build_ffn(cur,
  4829. model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
  4830. NULL, NULL, NULL,
  4831. model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
  4832. NULL,
  4833. LLM_FFN_GELU, LLM_FFN_SEQ, il);
  4834. cb(cur, "ffn_out", il);
  4835. }
  4836. cur = ggml_add(ctx0, cur, ffn_inp);
  4837. cur = build_cvec(cur, il);
  4838. cb(cur, "l_out", il);
  4839. // input for next layer
  4840. inpL = cur;
  4841. }
  4842. cur = build_norm(inpL,
  4843. model.output_norm,
  4844. model.output_norm_b,
  4845. LLM_NORM, -1);
  4846. cb(cur, "result_norm", -1);
  4847. res->t_embd = cur;
  4848. cur = build_lora_mm(model.output, cur);
  4849. cb(cur, "result_output", -1);
  4850. res->t_logits = cur;
  4851. ggml_build_forward_expand(gf, cur);
  4852. }
  4853. };
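// Refact: LLaMA-like layers without rotary embedding in the graph; positions are
// handled via ALiBi in the attention.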
  4854. struct llm_build_refact : public llm_graph_context {
  4855. llm_build_refact(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
  4856. const int64_t n_embd_head = hparams.n_embd_head_v;
  4857. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  4858. ggml_tensor * cur;
  4859. ggml_tensor * inpL;
  4860. inpL = build_inp_embd(model.tok_embd);
  4861. auto * inp_attn = build_attn_inp_kv_unified();
  4862. ggml_tensor * inp_out_ids = build_inp_out_ids();
  4863. for (int il = 0; il < n_layer; ++il) {
  4864. ggml_tensor * inpSA = inpL;
  4865. cur = build_norm(inpL,
  4866. model.layers[il].attn_norm, NULL,
  4867. LLM_NORM_RMS, il);
  4868. cb(cur, "attn_norm", il);
  4869. // self-attention
  4870. {
  4871. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  4872. cb(Qcur, "Qcur", il);
  4873. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  4874. cb(Kcur, "Kcur", il);
  4875. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  4876. cb(Vcur, "Vcur", il);
  4877. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  4878. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  4879. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  4880. cb(Qcur, "Qcur", il);
  4881. cb(Kcur, "Kcur", il);
  4882. cb(Vcur, "Vcur", il);
  4883. cur = build_attn(inp_attn, gf,
  4884. model.layers[il].wo, NULL,
  4885. Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  4886. }
  4887. if (il == n_layer - 1 && inp_out_ids) {
  4888. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  4889. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  4890. }
  4891. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  4892. cb(ffn_inp, "ffn_inp", il);
  4893. // feed-forward network
  4894. {
  4895. cur = build_norm(ffn_inp,
  4896. model.layers[il].ffn_norm, NULL,
  4897. LLM_NORM_RMS, il);
  4898. cb(cur, "ffn_norm", il);
  4899. cur = build_ffn(cur,
  4900. model.layers[il].ffn_up, NULL, NULL,
  4901. model.layers[il].ffn_gate, NULL, NULL,
  4902. model.layers[il].ffn_down, NULL, NULL,
  4903. NULL,
  4904. LLM_FFN_SILU, LLM_FFN_PAR, il);
  4905. cb(cur, "ffn_out", il);
  4906. }
  4907. cur = ggml_add(ctx0, cur, ffn_inp);
  4908. cur = build_cvec(cur, il);
  4909. cb(cur, "l_out", il);
  4910. // input for next layer
  4911. inpL = cur;
  4912. }
  4913. cur = inpL;
  4914. cur = build_norm(cur,
  4915. model.output_norm, NULL,
  4916. LLM_NORM_RMS, -1);
  4917. cb(cur, "result_norm", -1);
  4918. res->t_embd = cur;
  4919. // lm_head
  4920. cur = build_lora_mm(model.output, cur);
  4921. cb(cur, "result_output", -1);
  4922. res->t_logits = cur;
  4923. ggml_build_forward_expand(gf, cur);
  4924. }
  4925. };
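// BERT-family encoder: no KV cache, post-norm residuals, and embeddings-only output
// (no lm_head). BERT proper adds token-type and learned position embeddings, the
// NOMIC_BERT variants apply RoPE, and JINA_BERT_V2 skips the position input and uses
// a gated FFN; MoE layers are used every hparams.moe_every_n_layers when configured.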
  4926. struct llm_build_bert : public llm_graph_context {
  4927. llm_build_bert(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
  4928. const int64_t n_embd_head = hparams.n_embd_head_v;
  4929. const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
  4930. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  4931. ggml_tensor * cur;
  4932. ggml_tensor * inpL;
  4933. ggml_tensor * inp_pos = nullptr;
  4934. if (model.arch != LLM_ARCH_JINA_BERT_V2) {
  4935. inp_pos = build_inp_pos();
  4936. }
  4937. // construct input embeddings (token, type, position)
  4938. inpL = build_inp_embd(model.tok_embd);
  4939. // token types are hardcoded to zero ("Sentence A")
  4940. if (model.type_embd) {
  4941. ggml_tensor * type_row0 = ggml_view_1d(ctx0, model.type_embd, n_embd, 0);
  4942. inpL = ggml_add(ctx0, inpL, type_row0);
  4943. }
  4944. if (model.arch == LLM_ARCH_BERT) {
  4945. inpL = ggml_add(ctx0, ggml_get_rows(ctx0, model.pos_embd, inp_pos), inpL);
  4946. }
  4947. cb(inpL, "inp_embd", -1);
  4948. // embed layer norm
  4949. inpL = build_norm(inpL, model.tok_norm, model.tok_norm_b, LLM_NORM, -1);
  4950. cb(inpL, "inp_norm", -1);
  4951. auto * inp_attn = build_attn_inp_no_cache();
  4952. ggml_tensor * inp_out_ids = build_inp_out_ids();
  4953. for (int il = 0; il < n_layer; ++il) {
  4954. ggml_tensor * cur = inpL;
  4955. {
  4956. ggml_tensor * Qcur;
  4957. ggml_tensor * Kcur;
  4958. ggml_tensor * Vcur;
  4959. // self-attention
  4960. if (model.layers[il].wqkv) {
  4961. cur = build_lora_mm(model.layers[il].wqkv, cur);
  4962. cb(cur, "wqkv", il);
  4963. if (model.layers[il].bqkv) {
  4964. cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
  4965. cb(cur, "bqkv", il);
  4966. }
  4967. Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
  4968. Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
  4969. Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
  4970. } else {
  4971. Qcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wq, cur), model.layers[il].bq);
  4972. Kcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wk, cur), model.layers[il].bk);
  4973. Vcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wv, cur), model.layers[il].bv);
  4974. }
  4975. if (model.layers[il].attn_q_norm) {
  4976. Qcur = build_norm(Qcur,
  4977. model.layers[il].attn_q_norm,
  4978. model.layers[il].attn_q_norm_b,
  4979. LLM_NORM, il);
  4980. }
  4981. if (model.layers[il].attn_k_norm) {
  4982. Kcur = build_norm(Kcur,
  4983. model.layers[il].attn_k_norm,
  4984. model.layers[il].attn_k_norm_b,
  4985. LLM_NORM, il);
  4986. }
  4987. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  4988. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  4989. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  4990. // RoPE
  4991. if (model.arch == LLM_ARCH_NOMIC_BERT || model.arch == LLM_ARCH_NOMIC_BERT_MOE) {
  4992. Qcur = ggml_rope_ext(
  4993. ctx0, Qcur, inp_pos, nullptr,
  4994. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  4995. ext_factor, attn_factor, beta_fast, beta_slow
  4996. );
  4997. Kcur = ggml_rope_ext(
  4998. ctx0, Kcur, inp_pos, nullptr,
  4999. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  5000. ext_factor, attn_factor, beta_fast, beta_slow
  5001. );
  5002. }
  5003. cb(Qcur, "Qcur", il);
  5004. cb(Kcur, "Kcur", il);
  5005. cb(Vcur, "Vcur", il);
  5006. cur = build_attn(inp_attn, gf,
  5007. model.layers[il].wo, model.layers[il].bo,
  5008. Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  5009. cb(cur, "kqv_out", il);
  5010. }
  5011. if (il == n_layer - 1 && inp_out_ids) {
  5012. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  5013. inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
  5014. }
  5015. // re-add the layer input
  5016. cur = ggml_add(ctx0, cur, inpL);
  5017. // attention layer norm
  5018. cur = build_norm(cur, model.layers[il].attn_out_norm, model.layers[il].attn_out_norm_b, LLM_NORM, il);
  5019. if (model.layers[il].attn_norm_2 != nullptr) {
  5020. cur = ggml_add(ctx0, cur, inpL); // re-add the layer input
  5021. cur = build_norm(cur, model.layers[il].attn_norm_2, model.layers[il].attn_norm_2_b, LLM_NORM, il);
  5022. }
  5023. ggml_tensor * ffn_inp = cur;
  5024. cb(ffn_inp, "ffn_inp", il);
  5025. // feed-forward network
  5026. if (hparams.moe_every_n_layers > 0 && il % hparams.moe_every_n_layers == 1) {
  5027. // MoE branch
  5028. cur = build_moe_ffn(cur,
  5029. model.layers[il].ffn_gate_inp,
  5030. model.layers[il].ffn_up_exps,
  5031. nullptr,
  5032. model.layers[il].ffn_down_exps,
  5033. nullptr,
  5034. hparams.n_expert,
  5035. hparams.n_expert_used,
  5036. LLM_FFN_GELU,
  5037. false, false,
  5038. 0.0f,
  5039. LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, il);
  5040. cb(cur, "ffn_moe_out", il);
  5041. } else if (model.arch == LLM_ARCH_BERT || model.arch == LLM_ARCH_NOMIC_BERT_MOE) {
  5042. cur = build_ffn(cur,
  5043. model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
  5044. NULL, NULL, NULL,
  5045. model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
  5046. NULL,
  5047. LLM_FFN_GELU, LLM_FFN_SEQ, il);
  5048. cb(cur, "ffn_out", il);
  5049. } else if (model.arch == LLM_ARCH_JINA_BERT_V2) {
  5050. cur = build_ffn(cur,
  5051. model.layers[il].ffn_up, NULL, NULL,
  5052. model.layers[il].ffn_gate, NULL, NULL,
  5053. model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
  5054. NULL,
  5055. model.layers[il].ffn_gate ? LLM_FFN_GELU : LLM_FFN_GEGLU, LLM_FFN_PAR, il);
  5056. cb(cur, "ffn_out", il);
  5057. } else {
  5058. cur = build_ffn(cur,
  5059. model.layers[il].ffn_up, NULL, NULL,
  5060. model.layers[il].ffn_gate, NULL, NULL,
  5061. model.layers[il].ffn_down, NULL, NULL,
  5062. NULL,
  5063. LLM_FFN_SILU, LLM_FFN_PAR, il);
  5064. cb(cur, "ffn_out", il);
  5065. }
5066. // residual around the FFN: add back the attention output (ffn_inp)
  5067. cur = ggml_add(ctx0, cur, ffn_inp);
  5068. // output layer norm
  5069. cur = build_norm(cur, model.layers[il].layer_out_norm, model.layers[il].layer_out_norm_b, LLM_NORM, il);
  5070. // input for next layer
  5071. inpL = cur;
  5072. }
  5073. cur = inpL;
  5074. cb(cur, "result_embd", -1);
  5075. res->t_embd = cur;
  5076. ggml_build_forward_expand(gf, cur);
  5077. }
  5078. };
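// NeoBERT: pre-norm encoder with RoPE, fused QKV, and a SwiGLU FFN; outputs
// embeddings only, normalized by output_norm_enc.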
  5079. struct llm_build_neo_bert : public llm_graph_context {
  5080. llm_build_neo_bert(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
  5081. const int64_t n_embd_head = hparams.n_embd_head_v;
  5082. const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
  5083. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  5084. ggml_tensor * cur;
  5085. ggml_tensor * inpL;
  5086. ggml_tensor * inp_pos = build_inp_pos();
  5087. // construct input embeddings (token, type, position)
  5088. inpL = build_inp_embd(model.tok_embd);
  5089. cb(inpL, "inp_embd", -1);
  5090. auto * inp_attn = build_attn_inp_no_cache();
  5091. ggml_tensor * inp_out_ids = build_inp_out_ids();
  5092. for (int il = 0; il < n_layer; ++il) {
  5093. ggml_tensor * cur = inpL;
  5094. // pre-norm
  5095. cur = build_norm(inpL,
  5096. model.layers[il].attn_norm, NULL,
  5097. LLM_NORM_RMS, il);
  5098. {
  5099. ggml_tensor * Qcur;
  5100. ggml_tensor * Kcur;
  5101. ggml_tensor * Vcur;
  5102. // self-attention
  5103. cur = build_lora_mm(model.layers[il].wqkv, cur);
  5104. cb(cur, "wqkv", il);
  5105. Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
  5106. Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
  5107. Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
  5108. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  5109. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  5110. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  5111. // RoPE
  5112. Qcur = ggml_rope_ext(
  5113. ctx0, Qcur, inp_pos, nullptr,
  5114. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  5115. ext_factor, attn_factor, beta_fast, beta_slow
  5116. );
  5117. Kcur = ggml_rope_ext(
  5118. ctx0, Kcur, inp_pos, nullptr,
  5119. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  5120. ext_factor, attn_factor, beta_fast, beta_slow
  5121. );
  5122. cb(Qcur, "Qcur", il);
  5123. cb(Kcur, "Kcur", il);
  5124. cb(Vcur, "Vcur", il);
  5125. cur = build_attn(inp_attn, gf,
  5126. model.layers[il].wo, nullptr,
  5127. Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  5128. cb(cur, "kqv_out", il);
  5129. }
  5130. if (il == n_layer - 1 && inp_out_ids) {
  5131. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  5132. inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
  5133. }
  5134. // re-add the layer input
  5135. cur = ggml_add(ctx0, cur, inpL);
  5136. ggml_tensor * ffn_inp = cur;
  5137. cb(ffn_inp, "ffn_inp", il);
  5138. // pre-norm
  5139. cur = build_norm(ffn_inp,
  5140. model.layers[il].ffn_norm, NULL,
  5141. LLM_NORM_RMS, il);
  5142. cb(cur, "ffn_norm", il);
  5143. // feed-forward network
  5144. cur = build_ffn(cur,
  5145. model.layers[il].ffn_up,
  5146. NULL, NULL, NULL, NULL, NULL,
  5147. model.layers[il].ffn_down,
  5148. NULL, NULL, NULL,
  5149. LLM_FFN_SWIGLU, LLM_FFN_SEQ, il);
5150. // residual around the FFN: add back the attention output (ffn_inp)
  5151. cur = ggml_add(ctx0, cur, ffn_inp);
  5152. // input for next layer
  5153. inpL = cur;
  5154. }
  5155. cur = inpL;
  5156. cur = build_norm(cur,
  5157. model.output_norm_enc, NULL,
  5158. LLM_NORM_RMS, -1);
  5159. cb(cur, "result_embd", -1);
  5160. res->t_embd = cur;
  5161. ggml_build_forward_expand(gf, cur);
  5162. }
  5163. };
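// BLOOM: LayerNorm applied to the token embeddings (tok_norm), fused QKV with bias,
// GELU MLP, and no RoPE - positions come from ALiBi inside the attention.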
  5164. struct llm_build_bloom : public llm_graph_context {
  5165. llm_build_bloom(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
  5166. const int64_t n_embd_head = hparams.n_embd_head_v;
  5167. const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
  5168. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  5169. ggml_tensor * cur;
  5170. ggml_tensor * inpL;
  5171. inpL = build_inp_embd(model.tok_embd);
  5172. auto * inp_attn = build_attn_inp_kv_unified();
  5173. inpL = build_norm(inpL,
  5174. model.tok_norm,
  5175. model.tok_norm_b,
  5176. LLM_NORM, -1);
  5177. cb(inpL, "inp_norm", -1);
  5178. ggml_tensor * inp_out_ids = build_inp_out_ids();
  5179. for (int il = 0; il < n_layer; ++il) {
  5180. cur = build_norm(inpL,
  5181. model.layers[il].attn_norm,
  5182. model.layers[il].attn_norm_b,
  5183. LLM_NORM, il);
  5184. cb(cur, "attn_norm", il);
  5185. // self-attention
  5186. {
  5187. cur = build_lora_mm(model.layers[il].wqkv, cur);
  5188. cb(cur, "wqkv", il);
  5189. cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
  5190. cb(cur, "bqkv", il);
  5191. ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
  5192. ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
  5193. ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
  5194. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  5195. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  5196. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  5197. cb(Qcur, "Qcur", il);
  5198. cb(Kcur, "Kcur", il);
  5199. cb(Vcur, "Vcur", il);
  5200. cur = build_attn(inp_attn, gf,
  5201. model.layers[il].wo, model.layers[il].bo,
  5202. Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  5203. }
  5204. if (il == n_layer - 1 && inp_out_ids) {
  5205. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  5206. inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
  5207. }
  5208. // Add the input
  5209. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
  5210. cb(ffn_inp, "ffn_inp", il);
  5211. // FF
  5212. {
  5213. cur = build_norm(ffn_inp,
  5214. model.layers[il].ffn_norm,
  5215. model.layers[il].ffn_norm_b,
  5216. LLM_NORM, il);
  5217. cb(cur, "ffn_norm", il);
  5218. cur = build_ffn(cur,
  5219. model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
  5220. NULL, NULL, NULL,
  5221. model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
  5222. NULL,
  5223. LLM_FFN_GELU, LLM_FFN_SEQ, il);
  5224. cb(cur, "ffn_out", il);
  5225. }
  5226. cur = ggml_add(ctx0, cur, ffn_inp);
  5227. cur = build_cvec(cur, il);
  5228. cb(cur, "l_out", il);
  5229. // input for next layer
  5230. inpL = cur;
  5231. }
  5232. cur = build_norm(inpL,
  5233. model.output_norm,
  5234. model.output_norm_b,
  5235. LLM_NORM, -1);
  5236. cb(cur, "result_norm", -1);
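// expose the final hidden states as t_embd and, after the LM head projection below,
// the vocabulary logits as t_logits; ggml_build_forward_expand registers the result in the graph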
  5237. res->t_embd = cur;
  5238. cur = build_lora_mm(model.output, cur);
  5239. cb(cur, "result_output", -1);
  5240. res->t_logits = cur;
  5241. ggml_build_forward_expand(gf, cur);
  5242. }
  5243. };
  5244. struct llm_build_mpt : public llm_graph_context {
  5245. llm_build_mpt(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
  5246. const int64_t n_embd_head = hparams.n_embd_head_v;
  5247. const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
  5248. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  5249. ggml_tensor * cur;
  5250. ggml_tensor * pos;
  5251. ggml_tensor * inpL;
  5252. inpL = build_inp_embd(model.tok_embd);
  5253. auto * inp_attn = build_attn_inp_kv_unified();
  5254. if (model.pos_embd) {
  5255. // inp_pos - contains the positions
  5256. ggml_tensor * inp_pos = build_inp_pos();
  5257. pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos);
  5258. cb(pos, "pos_embd", -1);
  5259. inpL = ggml_add(ctx0, inpL, pos);
  5260. cb(inpL, "inpL", -1);
  5261. }
  5262. ggml_tensor * inp_out_ids = build_inp_out_ids();
  5263. for (int il = 0; il < n_layer; ++il) {
  5264. ggml_tensor * attn_norm;
  5265. attn_norm = build_norm(inpL,
  5266. model.layers[il].attn_norm,
  5267. model.layers[il].attn_norm_b,
  5268. LLM_NORM, il);
  5269. cb(attn_norm, "attn_norm", il);
  5270. // self-attention
  5271. {
  5272. cur = attn_norm;
  5273. cur = build_lora_mm(model.layers[il].wqkv, cur);
  5274. cb(cur, "wqkv", il);
5275. if (model.layers[il].bqkv) {
  5276. cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
  5277. cb(cur, "bqkv", il);
  5278. }
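// optionally clamp the fused QKV activations to [-f_clamp_kqv, f_clamp_kqv]
// (MPT's clip_qkv setting)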
  5279. if (hparams.f_clamp_kqv > 0.0f) {
  5280. cur = ggml_clamp(ctx0, cur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
  5281. cb(cur, "wqkv_clamped", il);
  5282. }
  5283. ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
  5284. ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
  5285. ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
  5286. cb(Qcur, "Qcur", il);
  5287. cb(Kcur, "Kcur", il);
  5288. cb(Vcur, "Vcur", il);
  5289. // Q/K Layernorm
  5290. if (model.layers[il].attn_q_norm) {
  5291. Qcur = build_norm(Qcur,
  5292. model.layers[il].attn_q_norm,
  5293. model.layers[il].attn_q_norm_b,
  5294. LLM_NORM, il);
  5295. cb(Qcur, "Qcur", il);
  5296. Kcur = build_norm(Kcur,
  5297. model.layers[il].attn_k_norm,
  5298. model.layers[il].attn_k_norm_b,
  5299. LLM_NORM, il);
  5300. cb(Kcur, "Kcur", il);
  5301. }
  5302. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  5303. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  5304. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  5305. cb(Qcur, "Qcur", il);
  5306. cb(Kcur, "Kcur", il);
  5307. cb(Vcur, "Vcur", il);
  5308. cur = build_attn(inp_attn, gf,
  5309. model.layers[il].wo, model.layers[il].bo,
  5310. Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  5311. }
  5312. if (il == n_layer - 1 && inp_out_ids) {
  5313. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  5314. inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
  5315. }
  5316. // Add the input
  5317. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
  5318. cb(ffn_inp, "ffn_inp", il);
  5319. // feed forward
  5320. {
  5321. cur = build_norm(ffn_inp,
  5322. model.layers[il].ffn_norm,
  5323. model.layers[il].ffn_norm_b,
  5324. LLM_NORM, il);
  5325. cb(cur, "ffn_norm", il);
  5326. cur = build_ffn(cur,
  5327. model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
  5328. NULL, NULL, NULL,
  5329. model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
  5330. model.layers[il].ffn_act,
  5331. LLM_FFN_GELU, LLM_FFN_SEQ, il);
  5332. cb(cur, "ffn_out", il);
  5333. }
  5334. cur = ggml_add(ctx0, cur, ffn_inp);
  5335. cur = build_cvec(cur, il);
  5336. cb(cur, "l_out", il);
  5337. // input for next layer
  5338. inpL = cur;
  5339. }
  5340. cur = inpL;
  5341. cur = build_norm(cur,
  5342. model.output_norm,
  5343. model.output_norm_b,
  5344. LLM_NORM, -1);
  5345. cb(cur, "result_norm", -1);
  5346. res->t_embd = cur;
  5347. cur = build_lora_mm(model.output, cur);
  5348. cb(cur, "result_output", -1);
  5349. res->t_logits = cur;
  5350. ggml_build_forward_expand(gf, cur);
  5351. }
  5352. };
  5353. struct llm_build_stablelm : public llm_graph_context {
  5354. llm_build_stablelm(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
  5355. const int64_t n_embd_head = hparams.n_embd_head_v;
  5356. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  5357. ggml_tensor * cur;
  5358. ggml_tensor * inpL;
  5359. inpL = build_inp_embd(model.tok_embd);
  5360. // inp_pos - contains the positions
  5361. ggml_tensor * inp_pos = build_inp_pos();
  5362. auto * inp_attn = build_attn_inp_kv_unified();
  5363. ggml_tensor * inp_out_ids = build_inp_out_ids();
  5364. for (int il = 0; il < n_layer; ++il) {
  5365. // norm
  5366. cur = build_norm(inpL,
  5367. model.layers[il].attn_norm,
  5368. model.layers[il].attn_norm_b,
  5369. LLM_NORM, il);
  5370. cb(cur, "attn_norm", il);
  5371. ggml_tensor * inpSA = cur;
  5372. // self-attention
  5373. {
  5374. // compute Q and K and RoPE them
  5375. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  5376. cb(Qcur, "Qcur", il);
  5377. if (model.layers[il].bq) {
  5378. Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
  5379. cb(Qcur, "Qcur", il);
  5380. }
  5381. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  5382. cb(Kcur, "Kcur", il);
  5383. if (model.layers[il].bk) {
  5384. Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
  5385. cb(Kcur, "Kcur", il);
  5386. }
  5387. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  5388. cb(Vcur, "Vcur", il);
  5389. if (model.layers[il].bv) {
  5390. Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
  5391. cb(Vcur, "Vcur", il);
  5392. }
  5393. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  5394. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  5395. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  5396. if (model.layers[il].attn_q_norm) {
  5397. Qcur = build_norm(Qcur,
  5398. model.layers[il].attn_q_norm,
  5399. NULL,
  5400. LLM_NORM, il);
  5401. cb(Qcur, "Qcur", il);
  5402. }
  5403. if (model.layers[il].attn_k_norm) {
  5404. Kcur = build_norm(Kcur,
  5405. model.layers[il].attn_k_norm,
  5406. NULL,
  5407. LLM_NORM, il);
  5408. cb(Kcur, "Kcur", il);
  5409. }
  5410. Qcur = ggml_rope_ext(
  5411. ctx0, Qcur, inp_pos, nullptr,
  5412. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  5413. ext_factor, attn_factor, beta_fast, beta_slow
  5414. );
  5415. Kcur = ggml_rope_ext(
  5416. ctx0, Kcur, inp_pos, nullptr,
  5417. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  5418. ext_factor, attn_factor, beta_fast, beta_slow
  5419. );
  5420. cb(Qcur, "Qcur", il);
  5421. cb(Kcur, "Kcur", il);
  5422. cb(Vcur, "Vcur", il);
  5423. cur = build_attn(inp_attn, gf,
  5424. model.layers[il].wo, NULL,
  5425. Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  5426. }
  5427. if (il == n_layer - 1 && inp_out_ids) {
  5428. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  5429. inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
  5430. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  5431. }
  5432. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
  5433. cb(ffn_inp, "ffn_inp", il);
  5434. // feed-forward network
  5435. {
  5436. if (model.layers[il].ffn_norm) {
  5437. cur = build_norm(ffn_inp,
  5438. model.layers[il].ffn_norm,
  5439. model.layers[il].ffn_norm_b,
  5440. LLM_NORM, il);
  5441. cb(cur, "ffn_norm", il);
  5442. } else {
  5443. // parallel residual
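// no separate FFN norm: reuse the attention-norm output (inpSA) so the FFN runs
// in parallel with attention off the same normalized input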
  5444. cur = inpSA;
  5445. }
  5446. cur = build_ffn(cur,
  5447. model.layers[il].ffn_up, NULL, NULL,
  5448. model.layers[il].ffn_gate, NULL, NULL,
  5449. model.layers[il].ffn_down, NULL, NULL,
  5450. NULL,
  5451. LLM_FFN_SILU, LLM_FFN_PAR, il);
  5452. cb(cur, "ffn_out", il);
  5453. }
  5454. cur = ggml_add(ctx0, cur, ffn_inp);
  5455. cur = build_cvec(cur, il);
  5456. cb(cur, "l_out", il);
  5457. // input for next layer
  5458. inpL = cur;
  5459. }
  5460. cur = inpL;
  5461. cur = build_norm(cur,
  5462. model.output_norm,
  5463. model.output_norm_b,
  5464. LLM_NORM, -1);
  5465. cb(cur, "result_norm", -1);
  5466. res->t_embd = cur;
  5467. // lm_head
  5468. cur = build_lora_mm(model.output, cur);
  5469. cb(cur, "result_output", -1);
  5470. res->t_logits = cur;
  5471. ggml_build_forward_expand(gf, cur);
  5472. }
  5473. };
  5474. struct llm_build_qwen : public llm_graph_context {
  5475. llm_build_qwen(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
  5476. const int64_t n_embd_head = hparams.n_embd_head_v;
  5477. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  5478. ggml_tensor * cur;
  5479. ggml_tensor * inpL;
  5480. inpL = build_inp_embd(model.tok_embd);
  5481. // inp_pos - contains the positions
  5482. ggml_tensor * inp_pos = build_inp_pos();
  5483. auto * inp_attn = build_attn_inp_kv_unified();
  5484. ggml_tensor * inp_out_ids = build_inp_out_ids();
  5485. for (int il = 0; il < n_layer; ++il) {
  5486. ggml_tensor * inpSA = inpL;
  5487. cur = build_norm(inpL,
  5488. model.layers[il].attn_norm, NULL,
  5489. LLM_NORM_RMS, il);
  5490. cb(cur, "attn_norm", il);
  5491. // self-attention
  5492. {
  5493. cur = build_lora_mm(model.layers[il].wqkv, cur);
  5494. cb(cur, "wqkv", il);
  5495. cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
  5496. cb(cur, "bqkv", il);
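// Qwen packs Q, K and V as three equal n_embd-wide blocks in the fused wqkv output,
// hence the offsets of 0, 1 and 2 times n_embd*sizeof(float) below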
  5497. ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
  5498. ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
  5499. ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 2*sizeof(float)*(n_embd)));
  5500. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  5501. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  5502. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
5503. // NeoX-style RoPE (formerly ggml_rope mode = 2)
  5504. Qcur = ggml_rope_ext(
  5505. ctx0, Qcur, inp_pos, nullptr,
  5506. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  5507. ext_factor, attn_factor, beta_fast, beta_slow
  5508. );
  5509. Kcur = ggml_rope_ext(
  5510. ctx0, Kcur, inp_pos, nullptr,
  5511. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  5512. ext_factor, attn_factor, beta_fast, beta_slow
  5513. );
  5514. cb(Qcur, "Qcur", il);
  5515. cb(Kcur, "Kcur", il);
  5516. cb(Vcur, "Vcur", il);
  5517. cur = build_attn(inp_attn, gf,
  5518. model.layers[il].wo, NULL,
  5519. Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  5520. }
  5521. if (il == n_layer - 1 && inp_out_ids) {
  5522. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  5523. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  5524. }
  5525. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  5526. cb(ffn_inp, "ffn_inp", il);
5527. // feed-forward network
  5528. {
  5529. cur = build_norm(ffn_inp,
  5530. model.layers[il].ffn_norm, NULL,
  5531. LLM_NORM_RMS, il);
  5532. cb(cur, "ffn_norm", il);
  5533. cur = build_ffn(cur,
  5534. model.layers[il].ffn_up, NULL, NULL,
  5535. model.layers[il].ffn_gate, NULL, NULL,
  5536. model.layers[il].ffn_down, NULL, NULL,
  5537. NULL,
  5538. LLM_FFN_SILU, LLM_FFN_PAR, il);
  5539. cb(cur, "ffn_out", il);
  5540. }
  5541. cur = ggml_add(ctx0, cur, ffn_inp);
  5542. cur = build_cvec(cur, il);
  5543. cb(cur, "l_out", il);
  5544. // input for next layer
  5545. inpL = cur;
  5546. }
  5547. cur = inpL;
  5548. cur = build_norm(cur,
  5549. model.output_norm, NULL,
  5550. LLM_NORM_RMS, -1);
  5551. cb(cur, "result_norm", -1);
  5552. res->t_embd = cur;
  5553. // lm_head
  5554. cur = build_lora_mm(model.output, cur);
  5555. cb(cur, "result_output", -1);
  5556. res->t_logits = cur;
  5557. ggml_build_forward_expand(gf, cur);
  5558. }
  5559. };
  5560. struct llm_build_qwen2 : public llm_graph_context {
  5561. llm_build_qwen2(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
  5562. const int64_t n_embd_head = hparams.n_embd_head_v;
  5563. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  5564. GGML_ASSERT(n_embd_head == hparams.n_rot);
  5565. ggml_tensor * cur;
  5566. ggml_tensor * inpL;
  5567. inpL = build_inp_embd(model.tok_embd);
  5568. // inp_pos - contains the positions
  5569. ggml_tensor * inp_pos = build_inp_pos();
  5570. auto * inp_attn = build_attn_inp_kv_unified();
  5571. ggml_tensor * inp_out_ids = build_inp_out_ids();
  5572. for (int il = 0; il < n_layer; ++il) {
  5573. ggml_tensor * inpSA = inpL;
  5574. // norm
  5575. cur = build_norm(inpL,
  5576. model.layers[il].attn_norm, NULL,
  5577. LLM_NORM_RMS, il);
  5578. cb(cur, "attn_norm", il);
  5579. // self-attention
  5580. {
  5581. // compute Q and K and RoPE them
  5582. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  5583. Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
  5584. cb(Qcur, "Qcur", il);
  5585. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  5586. Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
  5587. cb(Kcur, "Kcur", il);
  5588. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  5589. Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
  5590. cb(Vcur, "Vcur", il);
  5591. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  5592. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  5593. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  5594. Qcur = ggml_rope_ext(
  5595. ctx0, Qcur, inp_pos, nullptr,
  5596. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  5597. ext_factor, attn_factor, beta_fast, beta_slow
  5598. );
  5599. Kcur = ggml_rope_ext(
  5600. ctx0, Kcur, inp_pos, nullptr,
  5601. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  5602. ext_factor, attn_factor, beta_fast, beta_slow
  5603. );
  5604. cb(Qcur, "Qcur", il);
  5605. cb(Kcur, "Kcur", il);
  5606. cb(Vcur, "Vcur", il);
  5607. cur = build_attn(inp_attn, gf,
  5608. model.layers[il].wo, model.layers[il].bo,
  5609. Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  5610. }
  5611. if (il == n_layer - 1 && inp_out_ids) {
  5612. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  5613. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  5614. }
  5615. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  5616. cb(ffn_inp, "ffn_inp", il);
  5617. // feed-forward network
  5618. cur = build_norm(ffn_inp,
  5619. model.layers[il].ffn_norm, NULL,
  5620. LLM_NORM_RMS, il);
  5621. cb(cur, "ffn_norm", il);
  5622. cur = build_ffn(cur,
  5623. model.layers[il].ffn_up, NULL, NULL,
  5624. model.layers[il].ffn_gate, NULL, NULL,
  5625. model.layers[il].ffn_down, NULL, NULL,
  5626. NULL,
  5627. LLM_FFN_SILU, LLM_FFN_PAR, il);
  5628. cb(cur, "ffn_out", il);
  5629. cur = ggml_add(ctx0, cur, ffn_inp);
  5630. cur = build_cvec(cur, il);
  5631. cb(cur, "l_out", il);
  5632. // input for next layer
  5633. inpL = cur;
  5634. }
  5635. cur = inpL;
  5636. cur = build_norm(cur,
  5637. model.output_norm, NULL,
  5638. LLM_NORM_RMS, -1);
  5639. cb(cur, "result_norm", -1);
  5640. res->t_embd = cur;
  5641. // lm_head
  5642. cur = build_lora_mm(model.output, cur);
  5643. cb(cur, "result_output", -1);
  5644. res->t_logits = cur;
  5645. ggml_build_forward_expand(gf, cur);
  5646. }
  5647. };
  5648. struct llm_build_qwen2vl : public llm_graph_context {
  5649. llm_build_qwen2vl(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
  5650. const int64_t n_embd_head = hparams.n_embd_head_v;
  5651. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  5652. GGML_ASSERT(n_embd_head == hparams.n_rot);
  5653. ggml_tensor * cur;
  5654. ggml_tensor * inpL;
  5655. inpL = build_inp_embd(model.tok_embd);
  5656. // inp_pos - contains the positions
  5657. ggml_tensor * inp_pos = build_inp_pos();
  5658. auto * inp_attn = build_attn_inp_kv_unified();
  5659. int sections[4];
  5660. std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections);
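// hparams.rope_sections partitions the rotary dimensions into 4 sections;
// ggml_rope_multi applies a separate position component per section (M-RoPE),
// e.g. temporal/height/width positions for Qwen2-VL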
  5661. ggml_tensor * inp_out_ids = build_inp_out_ids();
  5662. for (int il = 0; il < n_layer; ++il) {
  5663. ggml_tensor * inpSA = inpL;
  5664. // norm
  5665. cur = build_norm(inpL,
  5666. model.layers[il].attn_norm, NULL,
  5667. LLM_NORM_RMS, il);
  5668. cb(cur, "attn_norm", il);
  5669. // self-attention
  5670. {
  5671. // compute Q and K and RoPE them
  5672. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  5673. Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
  5674. cb(Qcur, "Qcur", il);
  5675. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  5676. Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
  5677. cb(Kcur, "Kcur", il);
  5678. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  5679. Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
  5680. cb(Vcur, "Vcur", il);
  5681. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  5682. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  5683. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  5684. Qcur = ggml_rope_multi(
  5685. ctx0, Qcur, inp_pos, nullptr,
  5686. n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
  5687. ext_factor, attn_factor, beta_fast, beta_slow
  5688. );
  5689. Kcur = ggml_rope_multi(
  5690. ctx0, Kcur, inp_pos, nullptr,
  5691. n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
  5692. ext_factor, attn_factor, beta_fast, beta_slow
  5693. );
  5694. cb(Qcur, "Qcur", il);
  5695. cb(Kcur, "Kcur", il);
  5696. cb(Vcur, "Vcur", il);
  5697. cur = build_attn(inp_attn, gf,
  5698. model.layers[il].wo, model.layers[il].bo,
  5699. Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  5700. }
  5701. if (il == n_layer - 1 && inp_out_ids) {
  5702. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  5703. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  5704. }
  5705. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  5706. cb(ffn_inp, "ffn_inp", il);
  5707. // feed-forward network
  5708. cur = build_norm(ffn_inp,
  5709. model.layers[il].ffn_norm, NULL,
  5710. LLM_NORM_RMS, il);
  5711. cb(cur, "ffn_norm", il);
  5712. cur = build_ffn(cur,
  5713. model.layers[il].ffn_up, NULL, NULL,
  5714. model.layers[il].ffn_gate, NULL, NULL,
  5715. model.layers[il].ffn_down, NULL, NULL,
  5716. NULL,
  5717. LLM_FFN_SILU, LLM_FFN_PAR, il);
  5718. cb(cur, "ffn_out", il);
  5719. cur = ggml_add(ctx0, cur, ffn_inp);
  5720. cur = build_cvec(cur, il);
  5721. cb(cur, "l_out", il);
  5722. // input for next layer
  5723. inpL = cur;
  5724. }
  5725. cur = inpL;
  5726. cur = build_norm(cur,
  5727. model.output_norm, NULL,
  5728. LLM_NORM_RMS, -1);
  5729. cb(cur, "result_norm", -1);
  5730. res->t_embd = cur;
  5731. // lm_head
  5732. cur = build_lora_mm(model.output, cur);
  5733. cb(cur, "result_output", -1);
  5734. res->t_logits = cur;
  5735. ggml_build_forward_expand(gf, cur);
  5736. }
  5737. };
  5738. struct llm_build_qwen2moe : public llm_graph_context {
  5739. llm_build_qwen2moe(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
  5740. const int64_t n_embd_head = hparams.n_embd_head_v;
  5741. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  5742. GGML_ASSERT(n_embd_head == hparams.n_rot);
  5743. ggml_tensor * cur;
  5744. ggml_tensor * inpL;
  5745. inpL = build_inp_embd(model.tok_embd);
  5746. // inp_pos - contains the positions
  5747. ggml_tensor * inp_pos = build_inp_pos();
  5748. auto * inp_attn = build_attn_inp_kv_unified();
  5749. ggml_tensor * inp_out_ids = build_inp_out_ids();
  5750. for (int il = 0; il < n_layer; ++il) {
  5751. ggml_tensor * inpSA = inpL;
  5752. // norm
  5753. cur = build_norm(inpL,
  5754. model.layers[il].attn_norm, NULL,
  5755. LLM_NORM_RMS, il);
  5756. cb(cur, "attn_norm", il);
5757. // self-attention
  5758. {
  5759. // compute Q and K and RoPE them
  5760. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  5761. cb(Qcur, "Qcur", il);
  5762. if (model.layers[il].bq) {
  5763. Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
  5764. cb(Qcur, "Qcur", il);
  5765. }
  5766. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  5767. cb(Kcur, "Kcur", il);
  5768. if (model.layers[il].bk) {
  5769. Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
  5770. cb(Kcur, "Kcur", il);
  5771. }
  5772. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  5773. cb(Vcur, "Vcur", il);
  5774. if (model.layers[il].bv) {
  5775. Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
  5776. cb(Vcur, "Vcur", il);
  5777. }
  5778. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  5779. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  5780. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  5781. Qcur = ggml_rope_ext(
  5782. ctx0, Qcur, inp_pos, nullptr,
  5783. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  5784. ext_factor, attn_factor, beta_fast, beta_slow
  5785. );
  5786. Kcur = ggml_rope_ext(
  5787. ctx0, Kcur, inp_pos, nullptr,
  5788. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  5789. ext_factor, attn_factor, beta_fast, beta_slow
  5790. );
  5791. cb(Qcur, "Qcur", il);
  5792. cb(Kcur, "Kcur", il);
  5793. cb(Vcur, "Vcur", il);
  5794. cur = build_attn(inp_attn, gf,
  5795. model.layers[il].wo, model.layers[il].bo,
  5796. Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  5797. }
  5798. if (il == n_layer - 1 && inp_out_ids) {
  5799. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  5800. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  5801. }
  5802. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  5803. cb(ffn_inp, "ffn_inp", il);
  5804. // MoE branch
  5805. cur = build_norm(ffn_inp,
  5806. model.layers[il].ffn_norm, NULL,
  5807. LLM_NORM_RMS, il);
  5808. cb(cur, "ffn_norm", il);
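// routed experts: ffn_gate_inp scores the experts per token, the top n_expert_used
// of n_expert are evaluated and their outputs are combined with softmax gate weights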
  5809. ggml_tensor * moe_out =
  5810. build_moe_ffn(cur,
  5811. model.layers[il].ffn_gate_inp,
  5812. model.layers[il].ffn_up_exps,
  5813. model.layers[il].ffn_gate_exps,
  5814. model.layers[il].ffn_down_exps,
  5815. nullptr,
  5816. n_expert, n_expert_used,
  5817. LLM_FFN_SILU, false,
  5818. false, 0.0,
  5819. LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
  5820. il);
  5821. cb(moe_out, "ffn_moe_out", il);
  5822. // FFN shared expert
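// the shared expert runs for every token regardless of routing; its output is scaled
// by a per-token sigmoid gate and added to the routed experts' output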
  5823. {
  5824. ggml_tensor * cur_gate_inp = build_lora_mm(model.layers[il].ffn_gate_inp_shexp, cur);
  5825. cb(cur_gate_inp, "ffn_shexp_gate_inp", il);
5826. // sigmoid gate: silu(x)/x = (x * sigmoid(x)) / x = sigmoid(x)
  5827. ggml_tensor * cur_gate = ggml_div(ctx0, ggml_silu(ctx0, cur_gate_inp), cur_gate_inp);
  5828. cb(cur_gate, "ffn_shexp_gate", il);
  5829. ggml_tensor * cur_ffn = build_ffn(cur,
  5830. model.layers[il].ffn_up_shexp, NULL, NULL,
  5831. model.layers[il].ffn_gate_shexp, NULL, NULL,
  5832. model.layers[il].ffn_down_shexp, NULL, NULL,
  5833. NULL,
  5834. LLM_FFN_SILU, LLM_FFN_PAR, il);
  5835. cb(cur_ffn, "ffn_shexp", il);
  5836. ggml_tensor * ffn_shexp_out = ggml_mul(ctx0, cur_ffn, cur_gate);
  5837. cb(ffn_shexp_out, "ffn_shexp_out", il);
  5838. moe_out = ggml_add(ctx0, moe_out, ffn_shexp_out);
  5839. cb(moe_out, "ffn_out", il);
  5840. cur = moe_out;
  5841. }
  5842. cur = ggml_add(ctx0, cur, ffn_inp);
  5843. cur = build_cvec(cur, il);
  5844. cb(cur, "l_out", il);
  5845. // input for next layer
  5846. inpL = cur;
  5847. }
  5848. cur = inpL;
  5849. cur = build_norm(cur,
  5850. model.output_norm, NULL,
  5851. LLM_NORM_RMS, -1);
  5852. cb(cur, "result_norm", -1);
  5853. res->t_embd = cur;
  5854. // lm_head
  5855. cur = build_lora_mm(model.output, cur);
  5856. cb(cur, "result_output", -1);
  5857. res->t_logits = cur;
  5858. ggml_build_forward_expand(gf, cur);
  5859. }
  5860. };
  5861. struct llm_build_qwen3 : public llm_graph_context {
  5862. llm_build_qwen3(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
  5863. const int64_t n_embd_head = hparams.n_embd_head_v;
  5864. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  5865. GGML_ASSERT(n_embd_head == hparams.n_rot);
  5866. ggml_tensor * cur;
  5867. ggml_tensor * inpL;
  5868. inpL = build_inp_embd(model.tok_embd);
  5869. // inp_pos - contains the positions
  5870. ggml_tensor * inp_pos = build_inp_pos();
  5871. auto * inp_attn = build_attn_inp_kv_unified();
  5872. ggml_tensor * inp_out_ids = build_inp_out_ids();
  5873. for (int il = 0; il < n_layer; ++il) {
  5874. ggml_tensor * inpSA = inpL;
  5875. // norm
  5876. cur = build_norm(inpL,
  5877. model.layers[il].attn_norm, NULL,
  5878. LLM_NORM_RMS, il);
  5879. cb(cur, "attn_norm", il);
  5880. // self-attention
  5881. {
  5882. // compute Q and K and RoPE them
  5883. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  5884. cb(Qcur, "Qcur", il);
  5885. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  5886. cb(Kcur, "Kcur", il);
  5887. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  5888. cb(Vcur, "Vcur", il);
  5889. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  5890. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  5891. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
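// Qwen3 applies RMSNorm to Q and K per attention head (over n_embd_head) before RoPE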
  5892. Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
  5893. cb(Qcur, "Qcur_normed", il);
  5894. Qcur = ggml_rope_ext(
  5895. ctx0, Qcur, inp_pos, nullptr,
  5896. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  5897. ext_factor, attn_factor, beta_fast, beta_slow
  5898. );
  5899. Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
  5900. cb(Kcur, "Kcur_normed", il);
  5901. Kcur = ggml_rope_ext(
  5902. ctx0, Kcur, inp_pos, nullptr,
  5903. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  5904. ext_factor, attn_factor, beta_fast, beta_slow
  5905. );
  5906. cb(Qcur, "Qcur", il);
  5907. cb(Kcur, "Kcur", il);
  5908. cb(Vcur, "Vcur", il);
  5909. cur = build_attn(inp_attn, gf,
  5910. model.layers[il].wo, model.layers[il].bo,
  5911. Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  5912. }
  5913. if (il == n_layer - 1 && inp_out_ids) {
  5914. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  5915. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  5916. }
  5917. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  5918. cb(ffn_inp, "ffn_inp", il);
  5919. // feed-forward network
  5920. cur = build_norm(ffn_inp,
  5921. model.layers[il].ffn_norm, NULL,
  5922. LLM_NORM_RMS, il);
  5923. cb(cur, "ffn_norm", il);
  5924. cur = build_ffn(cur,
  5925. model.layers[il].ffn_up, NULL, NULL,
  5926. model.layers[il].ffn_gate, NULL, NULL,
  5927. model.layers[il].ffn_down, NULL, NULL,
  5928. NULL,
  5929. LLM_FFN_SILU, LLM_FFN_PAR, il);
  5930. cb(cur, "ffn_out", il);
  5931. cur = ggml_add(ctx0, cur, ffn_inp);
  5932. cur = build_cvec(cur, il);
  5933. cb(cur, "l_out", il);
  5934. // input for next layer
  5935. inpL = cur;
  5936. }
  5937. cur = inpL;
  5938. cur = build_norm(cur,
  5939. model.output_norm, NULL,
  5940. LLM_NORM_RMS, -1);
  5941. cb(cur, "result_norm", -1);
  5942. res->t_embd = cur;
  5943. // lm_head
  5944. cur = build_lora_mm(model.output, cur);
  5945. cb(cur, "result_output", -1);
  5946. res->t_logits = cur;
  5947. ggml_build_forward_expand(gf, cur);
  5948. }
  5949. };
  5950. struct llm_build_qwen3moe : public llm_graph_context {
  5951. llm_build_qwen3moe(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
  5952. const int64_t n_embd_head = hparams.n_embd_head_v;
  5953. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  5954. GGML_ASSERT(n_embd_head == hparams.n_rot);
  5955. ggml_tensor * cur;
  5956. ggml_tensor * inpL;
  5957. inpL = build_inp_embd(model.tok_embd);
  5958. // inp_pos - contains the positions
  5959. ggml_tensor * inp_pos = build_inp_pos();
  5960. auto * inp_attn = build_attn_inp_kv_unified();
  5961. ggml_tensor * inp_out_ids = build_inp_out_ids();
  5962. for (int il = 0; il < n_layer; ++il) {
  5963. ggml_tensor * inpSA = inpL;
  5964. // norm
  5965. cur = build_norm(inpL,
  5966. model.layers[il].attn_norm, NULL,
  5967. LLM_NORM_RMS, il);
  5968. cb(cur, "attn_norm", il);
5969. // self-attention
  5970. {
  5971. // compute Q and K and RoPE them
  5972. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  5973. cb(Qcur, "Qcur", il);
  5974. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  5975. cb(Kcur, "Kcur", il);
  5976. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  5977. cb(Vcur, "Vcur", il);
  5978. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  5979. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  5980. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  5981. Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
  5982. cb(Qcur, "Qcur_normed", il);
  5983. Qcur = ggml_rope_ext(
  5984. ctx0, Qcur, inp_pos, nullptr,
  5985. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  5986. ext_factor, attn_factor, beta_fast, beta_slow
  5987. );
  5988. Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
  5989. cb(Kcur, "Kcur_normed", il);
  5990. Kcur = ggml_rope_ext(
  5991. ctx0, Kcur, inp_pos, nullptr,
  5992. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  5993. ext_factor, attn_factor, beta_fast, beta_slow
  5994. );
  5995. cb(Qcur, "Qcur", il);
  5996. cb(Kcur, "Kcur", il);
  5997. cb(Vcur, "Vcur", il);
  5998. cur = build_attn(inp_attn, gf,
  5999. model.layers[il].wo, model.layers[il].bo,
  6000. Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  6001. }
  6002. if (il == n_layer - 1 && inp_out_ids) {
  6003. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  6004. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  6005. }
  6006. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  6007. cb(ffn_inp, "ffn_inp", il);
  6008. // MoE branch
  6009. cur = build_norm(ffn_inp,
  6010. model.layers[il].ffn_norm, NULL,
  6011. LLM_NORM_RMS, il);
  6012. cb(cur, "ffn_norm", il);
  6013. ggml_tensor * moe_out =
  6014. build_moe_ffn(cur,
  6015. model.layers[il].ffn_gate_inp,
  6016. model.layers[il].ffn_up_exps,
  6017. model.layers[il].ffn_gate_exps,
  6018. model.layers[il].ffn_down_exps,
  6019. nullptr,
  6020. n_expert, n_expert_used,
  6021. LLM_FFN_SILU, true,
  6022. false, 0.0,
  6023. LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
  6024. il);
  6025. cb(moe_out, "ffn_moe_out", il);
  6026. cur = moe_out;
  6027. cur = ggml_add(ctx0, cur, ffn_inp);
  6028. cur = build_cvec(cur, il);
  6029. cb(cur, "l_out", il);
  6030. // input for next layer
  6031. inpL = cur;
  6032. }
  6033. cur = inpL;
  6034. cur = build_norm(cur,
  6035. model.output_norm, NULL,
  6036. LLM_NORM_RMS, -1);
  6037. cb(cur, "result_norm", -1);
  6038. res->t_embd = cur;
  6039. // lm_head
  6040. cur = build_lora_mm(model.output, cur);
  6041. cb(cur, "result_output", -1);
  6042. res->t_logits = cur;
  6043. ggml_build_forward_expand(gf, cur);
  6044. }
  6045. };
  6046. struct llm_build_phi2 : public llm_graph_context {
  6047. llm_build_phi2(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
  6048. const int64_t n_embd_head = hparams.n_embd_head_v;
  6049. const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
  6050. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  6051. ggml_tensor * cur;
  6052. ggml_tensor * attn_norm_output;
  6053. ggml_tensor * ffn_output;
  6054. ggml_tensor * inpL;
  6055. inpL = build_inp_embd(model.tok_embd);
  6056. // inp_pos - contains the positions
  6057. ggml_tensor * inp_pos = build_inp_pos();
  6058. auto * inp_attn = build_attn_inp_kv_unified();
  6059. ggml_tensor * inp_out_ids = build_inp_out_ids();
  6060. for (int il = 0; il < n_layer; ++il) {
  6061. attn_norm_output = build_norm(inpL,
  6062. model.layers[il].attn_norm,
  6063. model.layers[il].attn_norm_b,
  6064. LLM_NORM, il);
  6065. cb(attn_norm_output, "attn_norm", il);
  6066. // self-attention
  6067. {
  6068. ggml_tensor * Qcur = nullptr;
  6069. ggml_tensor * Kcur = nullptr;
  6070. ggml_tensor * Vcur = nullptr;
  6071. if (model.layers[il].wqkv) {
  6072. cur = build_lora_mm(model.layers[il].wqkv, attn_norm_output);
  6073. cb(cur, "wqkv", il);
  6074. cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
  6075. cb(cur, "bqkv", il);
  6076. Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
  6077. Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
  6078. Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
  6079. } else {
  6080. Qcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wq, attn_norm_output), model.layers[il].bq);
  6081. Kcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wk, attn_norm_output), model.layers[il].bk);
  6082. Vcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wv, attn_norm_output), model.layers[il].bv);
  6083. }
  6084. cb(Qcur, "Qcur", il);
  6085. cb(Kcur, "Kcur", il);
  6086. cb(Vcur, "Vcur", il);
  6087. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  6088. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  6089. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  6090. Qcur = ggml_rope_ext(
  6091. ctx0, Qcur, inp_pos, nullptr,
  6092. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  6093. ext_factor, attn_factor, beta_fast, beta_slow
  6094. );
  6095. Kcur = ggml_rope_ext(
  6096. ctx0, Kcur, inp_pos, nullptr,
  6097. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  6098. ext_factor, attn_factor, beta_fast, beta_slow
  6099. );
  6100. cb(Qcur, "Qcur", il);
  6101. cb(Kcur, "Kcur", il);
  6102. cb(Vcur, "Vcur", il);
  6103. // with phi2, we scale the Q to avoid precision issues
  6104. // ref: https://github.com/ml-explore/mlx-examples/blob/08e862336ade809bc37d1035f94b359e7d1a5152/phi2/phi2.py#L64-L66
  6105. Qcur = ggml_scale(ctx0, Qcur, 1.0f/sqrtf(float(n_embd_head)));
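// since Q is pre-scaled above, build_attn is called with kq_scale = 1.0f below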
  6106. cur = build_attn(inp_attn, gf,
  6107. model.layers[il].wo, model.layers[il].bo,
  6108. Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
  6109. }
  6110. if (il == n_layer - 1 && inp_out_ids) {
  6111. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  6112. inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
  6113. attn_norm_output = ggml_get_rows(ctx0, attn_norm_output, inp_out_ids);
  6114. }
  6115. // FF
  6116. {
  6117. ffn_output = build_ffn(attn_norm_output,
  6118. model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
  6119. NULL, NULL, NULL,
  6120. model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
  6121. NULL,
  6122. LLM_FFN_GELU, LLM_FFN_SEQ, il);
  6123. cb(ffn_output, "ffn_out", il);
  6124. }
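// parallel residual: the attention output and the FFN output (both computed from the
// same attn_norm_output) are summed together with the layer input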
  6125. cur = ggml_add(ctx0, cur, ffn_output);
  6126. cur = ggml_add(ctx0, cur, inpL);
  6127. cur = build_cvec(cur, il);
  6128. cb(cur, "l_out", il);
  6129. // input for next layer
  6130. inpL = cur;
  6131. }
  6132. cur = build_norm(inpL,
  6133. model.output_norm,
  6134. model.output_norm_b,
  6135. LLM_NORM, -1);
  6136. cb(cur, "result_norm", -1);
  6137. res->t_embd = cur;
  6138. cur = build_lora_mm(model.output, cur);
  6139. cb(cur, "result_output_no_bias", -1);
  6140. cur = ggml_add(ctx0, cur, model.output_b);
  6141. cb(cur, "result_output", -1);
  6142. res->t_logits = cur;
  6143. ggml_build_forward_expand(gf, cur);
  6144. }
  6145. };
  6146. template<bool iswa>
  6147. struct llm_build_phi3 : public llm_graph_context {
  6148. llm_build_phi3(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
  6149. const int64_t n_embd_head = hparams.n_embd_head_v;
  6150. const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
  6151. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  6152. ggml_tensor * cur;
  6153. ggml_tensor * inpL;
  6154. inpL = build_inp_embd(model.tok_embd);
  6155. // inp_pos - contains the positions
  6156. ggml_tensor * inp_pos = build_inp_pos();
  6157. using inp_attn_type = std::conditional_t<iswa, llm_graph_input_attn_kv_unified_iswa, llm_graph_input_attn_kv_unified>;
  6158. inp_attn_type * inp_attn = nullptr;
  6159. if constexpr (iswa) {
  6160. inp_attn = build_attn_inp_kv_unified_iswa();
  6161. } else {
  6162. inp_attn = build_attn_inp_kv_unified();
  6163. }
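// the iswa template parameter selects, at compile time, between the sliding-window
// (iswa) and the regular unified KV-cache attention input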
  6164. ggml_tensor * inp_out_ids = build_inp_out_ids();
  6165. for (int il = 0; il < n_layer; ++il) {
  6166. auto * residual = inpL;
  6167. // self-attention
  6168. {
  6169. // rope freq factors for 128k context
  6170. ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
6171. ggml_tensor * attn_norm_output = build_norm(inpL,
  6172. model.layers[il].attn_norm,
  6173. model.layers[il].attn_norm_b,
  6174. LLM_NORM_RMS, il);
  6175. cb(attn_norm_output, "attn_norm", il);
  6176. ggml_tensor * Qcur = nullptr;
  6177. ggml_tensor * Kcur = nullptr;
  6178. ggml_tensor * Vcur = nullptr;
  6179. if (model.layers[il].wqkv) {
  6180. cur = build_lora_mm(model.layers[il].wqkv, attn_norm_output);
  6181. cb(cur, "wqkv", il);
  6182. Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0 * sizeof(float) * (n_embd)));
  6183. Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1 * sizeof(float) * (n_embd)));
  6184. Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1 * sizeof(float) * (n_embd + n_embd_gqa)));
  6185. } else {
  6186. Qcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wq, attn_norm_output), model.layers[il].bq);
  6187. Kcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wk, attn_norm_output), model.layers[il].bk);
  6188. Vcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wv, attn_norm_output), model.layers[il].bv);
  6189. }
  6190. cb(Qcur, "Qcur", il);
  6191. cb(Kcur, "Kcur", il);
  6192. cb(Vcur, "Vcur", il);
  6193. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  6194. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  6195. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  6196. Qcur = ggml_rope_ext(
  6197. ctx0, Qcur, inp_pos, rope_factors,
  6198. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  6199. ext_factor, attn_factor, beta_fast, beta_slow
  6200. );
  6201. Kcur = ggml_rope_ext(
  6202. ctx0, Kcur, inp_pos, rope_factors,
  6203. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  6204. ext_factor, attn_factor, beta_fast, beta_slow
  6205. );
  6206. cb(Qcur, "Qcur", il);
  6207. cb(Kcur, "Kcur", il);
  6208. cb(Vcur, "Vcur", il);
  6209. Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head)));
  6210. cb(Qcur, "Qcur", il);
  6211. cur = build_attn(inp_attn, gf,
  6212. model.layers[il].wo, model.layers[il].bo,
  6213. Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
  6214. }
  6215. if (il == n_layer - 1 && inp_out_ids) {
  6216. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  6217. residual = ggml_get_rows(ctx0, residual, inp_out_ids);
  6218. }
  6219. cur = ggml_add(ctx0, cur, residual);
  6220. residual = cur;
  6221. cur = build_norm(cur,
  6222. model.layers[il].ffn_norm, model.layers[il].ffn_norm_b,
  6223. LLM_NORM_RMS, il);
  6224. cb(cur, "ffn_norm", il);
  6225. // feed-forward network
  6226. if (model.layers[il].ffn_gate_inp == nullptr) {
  6227. cur = build_ffn(cur,
  6228. model.layers[il].ffn_up, NULL, NULL,
  6229. NULL, NULL, NULL,
  6230. model.layers[il].ffn_down, NULL, NULL,
  6231. NULL,
  6232. LLM_FFN_SWIGLU, LLM_FFN_SEQ, il);
  6233. cb(cur, "ffn_out", il);
  6234. } else {
  6235. // MoE branch
  6236. cur = build_moe_ffn(cur,
  6237. model.layers[il].ffn_gate_inp,
  6238. model.layers[il].ffn_up_exps,
  6239. model.layers[il].ffn_gate_exps,
  6240. model.layers[il].ffn_down_exps,
  6241. nullptr,
  6242. n_expert, n_expert_used,
  6243. LLM_FFN_SILU, true,
  6244. false, 0.0,
  6245. LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
  6246. il);
  6247. cb(cur, "ffn_moe_out", il);
  6248. }
  6249. cur = ggml_add(ctx0, residual, cur);
  6250. cur = build_cvec(cur, il);
  6251. cb(cur, "l_out", il);
  6252. // input for next layer
  6253. inpL = cur;
  6254. }
  6255. cur = build_norm(inpL,
  6256. model.output_norm,
  6257. model.output_norm_b,
  6258. LLM_NORM_RMS, -1);
  6259. cb(cur, "result_norm", -1);
  6260. res->t_embd = cur;
  6261. cur = build_lora_mm(model.output, cur);
  6262. if (model.output_b != nullptr) {
  6263. cb(cur, "result_output_no_bias", -1);
  6264. cur = ggml_add(ctx0, cur, model.output_b);
  6265. }
  6266. cb(cur, "result_output", -1);
  6267. res->t_logits = cur;
  6268. ggml_build_forward_expand(gf, cur);
  6269. }
  6270. };
  6271. struct llm_build_plamo : public llm_graph_context {
  6272. llm_build_plamo(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
  6273. const int64_t n_embd_head = hparams.n_embd_head_v;
  6274. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  6275. GGML_ASSERT(n_embd_head == hparams.n_rot);
  6276. ggml_tensor * cur;
  6277. ggml_tensor * inpL;
  6278. inpL = build_inp_embd(model.tok_embd);
  6279. // inp_pos - contains the positions
  6280. ggml_tensor * inp_pos = build_inp_pos();
  6281. auto * inp_attn = build_attn_inp_kv_unified();
  6282. ggml_tensor * inp_out_ids = build_inp_out_ids();
  6283. for (int il = 0; il < n_layer; ++il) {
  6284. // norm
  6285. cur = build_norm(inpL,
  6286. model.layers[il].attn_norm, NULL,
  6287. LLM_NORM_RMS, il);
  6288. cb(cur, "attn_norm", il);
  6289. ggml_tensor * sa_inp = cur;
  6290. // self-attention
  6291. {
  6292. // compute Q and K and RoPE them
  6293. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  6294. cb(Qcur, "Qcur", il);
  6295. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  6296. cb(Kcur, "Kcur", il);
  6297. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  6298. cb(Vcur, "Vcur", il);
  6299. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  6300. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  6301. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  6302. Qcur = ggml_rope_ext(
  6303. ctx0, Qcur, inp_pos, nullptr,
  6304. n_embd_head, rope_type, n_ctx_orig, freq_base, freq_scale,
  6305. ext_factor, attn_factor, beta_fast, beta_slow
  6306. );
  6307. Kcur = ggml_rope_ext(
  6308. ctx0, Kcur, inp_pos, nullptr,
  6309. n_embd_head, rope_type, n_ctx_orig, freq_base, freq_scale,
  6310. ext_factor, attn_factor, beta_fast, beta_slow
  6311. );
  6312. cb(Qcur, "Qcur", il);
  6313. cb(Kcur, "Kcur", il);
  6314. cb(Vcur, "Vcur", il);
  6315. cur = build_attn(inp_attn, gf,
  6316. model.layers[il].wo, NULL,
  6317. Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  6318. }
  6319. if (il == n_layer - 1 && inp_out_ids) {
  6320. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  6321. sa_inp = ggml_get_rows(ctx0, sa_inp, inp_out_ids);
  6322. inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
  6323. }
  6324. ggml_tensor * sa_out = cur;
  6325. cur = sa_inp;
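// PLaMo-style parallel block: the FFN is applied to the same normalized input as
// attention (sa_inp); the attention output (sa_out) and the layer input are added back afterwards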
  6326. // feed-forward network
  6327. {
  6328. cur = build_ffn(cur,
  6329. model.layers[il].ffn_up, NULL, NULL,
  6330. model.layers[il].ffn_gate, NULL, NULL,
  6331. model.layers[il].ffn_down, NULL, NULL,
  6332. NULL,
  6333. LLM_FFN_SILU, LLM_FFN_PAR, il);
  6334. cb(cur, "ffn_out", il);
  6335. }
  6336. cur = ggml_add(ctx0, cur, sa_out);
  6337. cur = ggml_add(ctx0, cur, inpL);
  6338. cur = build_cvec(cur, il);
  6339. cb(cur, "l_out", il);
  6340. // input for next layer
  6341. inpL = cur;
  6342. }
  6343. cur = inpL;
  6344. cur = build_norm(cur,
  6345. model.output_norm, NULL,
  6346. LLM_NORM_RMS, -1);
  6347. cb(cur, "result_norm", -1);
  6348. res->t_embd = cur;
  6349. // lm_head
  6350. cur = build_lora_mm(model.output, cur);
  6351. cb(cur, "result_output", -1);
  6352. res->t_logits = cur;
  6353. ggml_build_forward_expand(gf, cur);
  6354. }
  6355. };
  6356. struct llm_build_gpt2 : public llm_graph_context {
  6357. llm_build_gpt2(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
  6358. const int64_t n_embd_head = hparams.n_embd_head_v;
  6359. const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
  6360. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  6361. ggml_tensor * cur;
  6362. ggml_tensor * pos;
  6363. ggml_tensor * inpL;
  6364. inpL = build_inp_embd(model.tok_embd);
  6365. // inp_pos - contains the positions
  6366. ggml_tensor * inp_pos = build_inp_pos();
  6367. auto * inp_attn = build_attn_inp_kv_unified();
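// GPT-2 uses learned absolute position embeddings: one row of pos_embd per position
// is looked up and added to the token embeddings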
  6368. pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos);
  6369. cb(pos, "pos_embd", -1);
  6370. inpL = ggml_add(ctx0, inpL, pos);
  6371. cb(inpL, "inpL", -1);
  6372. ggml_tensor * inp_out_ids = build_inp_out_ids();
  6373. for (int il = 0; il < n_layer; ++il) {
  6374. cur = build_norm(inpL,
  6375. model.layers[il].attn_norm,
  6376. model.layers[il].attn_norm_b,
  6377. LLM_NORM, il);
  6378. cb(cur, "attn_norm", il);
  6379. // self-attention
  6380. {
  6381. cur = build_lora_mm(model.layers[il].wqkv, cur);
  6382. cb(cur, "wqkv", il);
  6383. cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
  6384. cb(cur, "bqkv", il);
  6385. ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
  6386. ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
  6387. ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
  6388. cb(Qcur, "Qcur", il);
  6389. cb(Kcur, "Kcur", il);
  6390. cb(Vcur, "Vcur", il);
  6391. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  6392. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  6393. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  6394. cur = build_attn(inp_attn, gf,
  6395. model.layers[il].wo, model.layers[il].bo,
  6396. Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  6397. }
  6398. if (il == n_layer - 1 && inp_out_ids) {
  6399. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  6400. inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
  6401. }
  6402. // add the input
  6403. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
  6404. cb(ffn_inp, "ffn_inp", il);
  6405. // FF
  6406. {
  6407. cur = build_norm(ffn_inp,
  6408. model.layers[il].ffn_norm,
  6409. model.layers[il].ffn_norm_b,
  6410. LLM_NORM, il);
  6411. cb(cur, "ffn_norm", il);
  6412. cur = build_ffn(cur,
  6413. model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
  6414. NULL, NULL, NULL,
  6415. model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
  6416. NULL,
  6417. LLM_FFN_GELU, LLM_FFN_SEQ, il);
  6418. cb(cur, "ffn_out", il);
  6419. }
  6420. cur = ggml_add(ctx0, cur, ffn_inp);
  6421. cur = build_cvec(cur, il);
  6422. cb(cur, "l_out", il);
  6423. // input for next layer
  6424. inpL = cur;
  6425. }
  6426. cur = build_norm(inpL,
  6427. model.output_norm,
  6428. model.output_norm_b,
  6429. LLM_NORM, -1);
  6430. cb(cur, "result_norm", -1);
  6431. res->t_embd = cur;
  6432. cur = build_lora_mm(model.output, cur);
  6433. cb(cur, "result_output", -1);
  6434. res->t_logits = cur;
  6435. ggml_build_forward_expand(gf, cur);
  6436. }
  6437. };
  6438. struct llm_build_codeshell : public llm_graph_context {
  6439. llm_build_codeshell(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
  6440. const int64_t n_embd_head = hparams.n_embd_head_v;
  6441. const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
  6442. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  6443. GGML_ASSERT(n_embd_head == hparams.n_rot);
  6444. ggml_tensor * cur;
  6445. ggml_tensor * inpL;
  6446. inpL = build_inp_embd(model.tok_embd);
  6447. // inp_pos - contains the positions
  6448. ggml_tensor * inp_pos = build_inp_pos();
  6449. auto * inp_attn = build_attn_inp_kv_unified();
  6450. ggml_tensor * inp_out_ids = build_inp_out_ids();
  6451. for (int il = 0; il < n_layer; ++il) {
  6452. cur = build_norm(inpL,
  6453. model.layers[il].attn_norm,
  6454. model.layers[il].attn_norm_b,
  6455. LLM_NORM, il);
  6456. cb(cur, "attn_norm", il);
  6457. // self-attention
  6458. {
  6459. cur = build_lora_mm(model.layers[il].wqkv, cur);
  6460. cb(cur, "wqkv", il);
  6461. cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
  6462. cb(cur, "bqkv", il);
  6463. ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
  6464. ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
  6465. ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
  6466. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  6467. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  6468. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  6469. Qcur = ggml_rope_ext(
  6470. ctx0, Qcur, inp_pos, nullptr,
  6471. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  6472. ext_factor, attn_factor, beta_fast, beta_slow
  6473. );
  6474. Kcur = ggml_rope_ext(
  6475. ctx0, Kcur, inp_pos, nullptr,
  6476. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  6477. ext_factor, attn_factor, beta_fast, beta_slow
  6478. );
  6479. cb(Qcur, "Qcur", il);
  6480. cb(Kcur, "Kcur", il);
  6481. cb(Vcur, "Vcur", il);
  6482. cur = build_attn(inp_attn, gf,
  6483. model.layers[il].wo, model.layers[il].bo,
  6484. Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  6485. }
  6486. if (il == n_layer - 1 && inp_out_ids) {
  6487. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  6488. inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
  6489. }
  6490. // add the input
  6491. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
  6492. cb(ffn_inp, "ffn_inp", il);
  6493. // FF
  6494. {
  6495. cur = build_norm(ffn_inp,
  6496. model.layers[il].ffn_norm,
  6497. model.layers[il].ffn_norm_b,
  6498. LLM_NORM, il);
  6499. cb(cur, "ffn_norm", il);
  6500. cur = build_ffn(cur,
  6501. model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
  6502. NULL, NULL, NULL,
  6503. model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
  6504. NULL,
  6505. LLM_FFN_GELU, LLM_FFN_SEQ, il);
  6506. cb(cur, "ffn_out", il);
  6507. }
  6508. cur = ggml_add(ctx0, cur, ffn_inp);
  6509. cur = build_cvec(cur, il);
  6510. cb(cur, "l_out", il);
  6511. // input for next layer
  6512. inpL = cur;
  6513. }
  6514. cur = build_norm(inpL,
  6515. model.output_norm,
  6516. model.output_norm_b,
  6517. LLM_NORM, -1);
  6518. cb(cur, "result_norm", -1);
  6519. res->t_embd = cur;
  6520. cur = build_lora_mm(model.output, cur);
  6521. cb(cur, "result_output", -1);
  6522. res->t_logits = cur;
  6523. ggml_build_forward_expand(gf, cur);
  6524. }
  6525. };
  6526. struct llm_build_orion : public llm_graph_context {
  6527. llm_build_orion(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
  6528. const int64_t n_embd_head = hparams.n_embd_head_v;
  6529. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  6530. GGML_ASSERT(n_embd_head == hparams.n_rot);
  6531. ggml_tensor * cur;
  6532. ggml_tensor * inpL;
  6533. inpL = build_inp_embd(model.tok_embd);
  6534. // inp_pos - contains the positions
  6535. ggml_tensor * inp_pos = build_inp_pos();
  6536. auto * inp_attn = build_attn_inp_kv_unified();
  6537. ggml_tensor * inp_out_ids = build_inp_out_ids();
  6538. for (int il = 0; il < n_layer; ++il) {
  6539. ggml_tensor * inpSA = inpL;
  6540. // norm
  6541. cur = build_norm(inpL,
  6542. model.layers[il].attn_norm, model.layers[il].attn_norm_b,
  6543. LLM_NORM, il);
  6544. cb(cur, "attn_norm", il);
  6545. // self-attention
  6546. {
  6547. // compute Q and K and RoPE them
  6548. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  6549. cb(Qcur, "Qcur", il);
  6550. // if (model.layers[il].bq) {
  6551. // Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
  6552. // cb(Qcur, "Qcur", il);
  6553. // }
  6554. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  6555. cb(Kcur, "Kcur", il);
  6556. // if (model.layers[il].bk) {
  6557. // Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
  6558. // cb(Kcur, "Kcur", il);
  6559. // }
  6560. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  6561. cb(Vcur, "Vcur", il);
  6562. // if (model.layers[il].bv) {
  6563. // Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
  6564. // cb(Vcur, "Vcur", il);
  6565. // }
  6566. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  6567. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  6568. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  6569. Qcur = ggml_rope_ext(
  6570. ctx0, Qcur, inp_pos, nullptr,
  6571. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  6572. ext_factor, attn_factor, beta_fast, beta_slow
  6573. );
  6574. Kcur = ggml_rope_ext(
  6575. ctx0, Kcur, inp_pos, nullptr,
  6576. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  6577. ext_factor, attn_factor, beta_fast, beta_slow
  6578. );
  6579. cb(Qcur, "Qcur", il);
  6580. cb(Kcur, "Kcur", il);
  6581. cb(Vcur, "Vcur", il);
  6582. cur = build_attn(inp_attn, gf,
  6583. model.layers[il].wo, NULL,
  6584. Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  6585. }
  6586. if (il == n_layer - 1 && inp_out_ids) {
  6587. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  6588. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  6589. }
  6590. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  6591. cb(ffn_inp, "ffn_inp", il);
  6592. // feed-forward network
  6593. cur = build_norm(ffn_inp,
  6594. model.layers[il].ffn_norm, model.layers[il].ffn_norm_b,
  6595. LLM_NORM, il);
  6596. cb(cur, "ffn_norm", il);
  6597. cur = build_ffn(cur,
  6598. model.layers[il].ffn_up, NULL, NULL,
  6599. model.layers[il].ffn_gate, NULL, NULL,
  6600. model.layers[il].ffn_down, NULL, NULL,
  6601. NULL,
  6602. LLM_FFN_SILU, LLM_FFN_PAR, il);
  6603. cb(cur, "ffn_out", il);
  6604. cur = ggml_add(ctx0, cur, ffn_inp);
  6605. cur = build_cvec(cur, il);
  6606. cb(cur, "l_out", il);
  6607. // input for next layer
  6608. inpL = cur;
  6609. }
  6610. cur = inpL;
  6611. cur = build_norm(cur,
  6612. model.output_norm, model.output_norm_b,
  6613. LLM_NORM, -1);
  6614. cb(cur, "result_norm", -1);
  6615. res->t_embd = cur;
  6616. // lm_head
  6617. cur = build_lora_mm(model.output, cur);
  6618. cb(cur, "result_output", -1);
  6619. res->t_logits = cur;
  6620. ggml_build_forward_expand(gf, cur);
  6621. }
  6622. };
  6623. struct llm_build_internlm2 : public llm_graph_context {
  6624. llm_build_internlm2(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
  6625. const int64_t n_embd_head = hparams.n_embd_head_v;
  6626. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  6627. GGML_ASSERT(n_embd_head == hparams.n_rot);
  6628. ggml_tensor * cur;
  6629. ggml_tensor * inpL;
  6630. inpL = build_inp_embd(model.tok_embd);
  6631. // inp_pos - contains the positions
  6632. ggml_tensor * inp_pos = build_inp_pos();
  6633. auto * inp_attn = build_attn_inp_kv_unified();
  6634. ggml_tensor * inp_out_ids = build_inp_out_ids();
  6635. for (int il = 0; il < n_layer; ++il) {
  6636. ggml_tensor * inpSA = inpL;
  6637. // norm
  6638. cur = build_norm(inpL,
  6639. model.layers[il].attn_norm, NULL,
  6640. LLM_NORM_RMS, il);
  6641. cb(cur, "attn_norm", il);
  6642. // self-attention
  6643. {
  6644. // compute Q and K and RoPE them
  6645. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  6646. cb(Qcur, "Qcur", il);
  6647. if (model.layers[il].bq) {
  6648. Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
  6649. cb(Qcur, "Qcur", il);
  6650. }
  6651. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  6652. cb(Kcur, "Kcur", il);
  6653. if (model.layers[il].bk) {
  6654. Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
  6655. cb(Kcur, "Kcur", il);
  6656. }
  6657. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  6658. cb(Vcur, "Vcur", il);
  6659. if (model.layers[il].bv) {
  6660. Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
  6661. cb(Vcur, "Vcur", il);
  6662. }
  6663. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  6664. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  6665. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  6666. Qcur = ggml_rope_ext(
  6667. ctx0, Qcur, inp_pos, nullptr,
  6668. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  6669. ext_factor, attn_factor, beta_fast, beta_slow
  6670. );
  6671. Kcur = ggml_rope_ext(
  6672. ctx0, Kcur, inp_pos, nullptr,
  6673. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  6674. ext_factor, attn_factor, beta_fast, beta_slow
  6675. );
  6676. cb(Qcur, "Qcur", il);
  6677. cb(Kcur, "Kcur", il);
  6678. cb(Vcur, "Vcur", il);
  6679. cur = build_attn(inp_attn, gf,
  6680. model.layers[il].wo, model.layers[il].bo,
  6681. Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  6682. }
  6683. if (il == n_layer - 1 && inp_out_ids) {
  6684. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  6685. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  6686. }
  6687. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  6688. cb(ffn_inp, "ffn_inp", il);
  6689. // feed-forward network
  6690. cur = build_norm(ffn_inp,
  6691. model.layers[il].ffn_norm, NULL,
  6692. LLM_NORM_RMS, il);
  6693. cb(cur, "ffn_norm", il);
  6694. cur = build_ffn(cur,
  6695. model.layers[il].ffn_up, NULL, NULL,
  6696. model.layers[il].ffn_gate, NULL, NULL,
  6697. model.layers[il].ffn_down, NULL, NULL,
  6698. NULL,
  6699. LLM_FFN_SILU, LLM_FFN_PAR, il);
  6700. cb(cur, "ffn_out", il);
  6701. cur = ggml_add(ctx0, cur, ffn_inp);
  6702. cur = build_cvec(cur, il);
  6703. cb(cur, "l_out", il);
  6704. // input for next layer
  6705. inpL = cur;
  6706. }
  6707. cur = inpL;
  6708. cur = build_norm(cur,
  6709. model.output_norm, NULL,
  6710. LLM_NORM_RMS, -1);
  6711. cb(cur, "result_norm", -1);
  6712. res->t_embd = cur;
  6713. // lm_head
  6714. cur = build_lora_mm(model.output, cur);
  6715. cb(cur, "result_output", -1);
  6716. res->t_logits = cur;
  6717. ggml_build_forward_expand(gf, cur);
  6718. }
  6719. };
  6720. struct llm_build_minicpm3 : public llm_graph_context {
  6721. llm_build_minicpm3(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
  6722. //TODO: if the model varies, these parameters need to be read from the model
  6723. const int64_t n_embd_base = 256;
  6724. const float scale_embd = 12.0f;
  6725. const float scale_depth = 1.4f;
  6726. const float kq_scale = 1.0f / sqrtf(float(hparams.n_embd_head_k));
  6727. const uint32_t n_embd_head_qk_rope = hparams.n_rot;
  6728. const uint32_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
  6729. const uint32_t kv_lora_rank = hparams.n_lora_kv;
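// MiniCPM3 uses a low-rank ("MLA"-style) attention, similar in spirit to DeepSeek-V2:
//  - Q is built through a low-rank bottleneck: wq_a -> RMS norm -> wq_b
//  - K/V are reconstructed from a compressed latent: wkv_a_mqa -> RMS norm -> wkv_b
//  - a single shared rotary key head (k_pe) carries the RoPE part and is repeated across heads below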
  6730. ggml_tensor * cur;
  6731. ggml_tensor * inpL;
  6732. inpL = build_inp_embd(model.tok_embd);
  6733. // scale the input embeddings
  6734. inpL = ggml_scale(ctx0, inpL, scale_embd);
  6735. cb(inpL, "inp_scaled", -1);
  6736. // inp_pos - contains the positions
  6737. ggml_tensor * inp_pos = build_inp_pos();
  6738. auto * inp_attn = build_attn_inp_kv_unified();
  6739. ggml_tensor * inp_out_ids = build_inp_out_ids();
  6740. for (int il = 0; il < n_layer; ++il) {
  6741. ggml_tensor * inpSA = inpL;
  6742. ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
  6743. // norm
  6744. cur = build_norm(inpL,
  6745. model.layers[il].attn_norm, NULL,
  6746. LLM_NORM_RMS, il);
  6747. cb(cur, "attn_norm", il);
  6748. // self_attention
  6749. {
  6750. ggml_tensor * q = NULL;
  6751. // {n_embd, q_lora_rank} * {n_embd, n_tokens} -> {q_lora_rank, n_tokens}
  6752. q = ggml_mul_mat(ctx0, model.layers[il].wq_a, cur);
  6753. cb(q, "q", il);
  6754. q = build_norm(q,
  6755. model.layers[il].attn_q_a_norm, NULL,
  6756. LLM_NORM_RMS, il);
  6757. cb(q, "q", il);
  6758. // {q_lora_rank, n_head * hparams.n_embd_head_k} * {q_lora_rank, n_tokens} -> {n_head * hparams.n_embd_head_k, n_tokens}
  6759. q = ggml_mul_mat(ctx0, model.layers[il].wq_b, q);
  6760. cb(q, "q", il);
  6761. // split into {n_head * n_embd_head_qk_nope, n_tokens}
  6762. ggml_tensor * q_nope = ggml_view_3d(ctx0, q, n_embd_head_qk_nope, n_head, n_tokens,
  6763. ggml_row_size(q->type, hparams.n_embd_head_k),
  6764. ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
  6765. 0);
  6766. cb(q_nope, "q_nope", il);
  6767. // and {n_head * n_embd_head_qk_rope, n_tokens}
  6768. ggml_tensor * q_pe = ggml_view_3d(ctx0, q, n_embd_head_qk_rope, n_head, n_tokens,
  6769. ggml_row_size(q->type, hparams.n_embd_head_k),
  6770. ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
  6771. ggml_row_size(q->type, n_embd_head_qk_nope));
  6772. cb(q_pe, "q_pe", il);
  6773. // {n_embd, kv_lora_rank + n_embd_head_qk_rope} * {n_embd, n_tokens} -> {kv_lora_rank + n_embd_head_qk_rope, n_tokens}
6774. ggml_tensor * kv_pe_compressed = ggml_mul_mat(ctx0, model.layers[il].wkv_a_mqa, cur);
6775. cb(kv_pe_compressed, "kv_pe_compressed", il);
6776. // split into {kv_lora_rank, n_tokens}
6777. ggml_tensor * kv_compressed = ggml_view_2d(ctx0, kv_pe_compressed, kv_lora_rank, n_tokens,
6778. kv_pe_compressed->nb[1],
6779. 0);
6780. cb(kv_compressed, "kv_compressed", il);
6781. // and {n_embd_head_qk_rope, n_tokens}
6782. ggml_tensor * k_pe = ggml_view_3d(ctx0, kv_pe_compressed, n_embd_head_qk_rope, 1, n_tokens,
6783. kv_pe_compressed->nb[1],
6784. kv_pe_compressed->nb[1],
6785. ggml_row_size(kv_pe_compressed->type, kv_lora_rank));
  6786. cb(k_pe, "k_pe", il);
  6787. // TODO: the CUDA backend used to not support non-cont. (RMS) norm, investigate removing ggml_cont
  6788. kv_compressed = ggml_cont(ctx0, kv_compressed);
  6789. kv_compressed = build_norm(kv_compressed,
  6790. model.layers[il].attn_kv_a_norm, NULL,
  6791. LLM_NORM_RMS, il);
  6792. cb(kv_compressed, "kv_compressed", il);
  6793. // {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)} * {kv_lora_rank, n_tokens} -> {n_head * (n_embd_head_qk_nope + n_embd_head_v), n_tokens}
  6794. ggml_tensor * kv = ggml_mul_mat(ctx0, model.layers[il].wkv_b, kv_compressed);
  6795. cb(kv, "kv", il);
  6796. // split into {n_head * n_embd_head_qk_nope, n_tokens}
  6797. ggml_tensor * k_nope = ggml_view_3d(ctx0, kv, n_embd_head_qk_nope, n_head, n_tokens,
  6798. ggml_row_size(kv->type, n_embd_head_qk_nope + hparams.n_embd_head_v),
  6799. ggml_row_size(kv->type, n_head * (n_embd_head_qk_nope + hparams.n_embd_head_v)),
  6800. 0);
  6801. cb(k_nope, "k_nope", il);
  6802. // and {n_head * n_embd_head_v, n_tokens}
  6803. ggml_tensor * v_states = ggml_view_3d(ctx0, kv, hparams.n_embd_head_v, n_head, n_tokens,
  6804. ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)),
  6805. ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)*n_head),
  6806. ggml_row_size(kv->type, (n_embd_head_qk_nope)));
  6807. cb(v_states, "v_states", il);
  6808. v_states = ggml_cont(ctx0, v_states);
  6809. cb(v_states, "v_states", il);
  6810. v_states = ggml_view_2d(ctx0, v_states, hparams.n_embd_head_v * n_head, n_tokens,
  6811. ggml_row_size(kv->type, hparams.n_embd_head_v * n_head),
  6812. 0);
  6813. cb(v_states, "v_states", il);
  6814. q_pe = ggml_cont(ctx0, q_pe); // TODO: the CUDA backend used to not support non-cont. RoPE, investigate removing this
  6815. q_pe = ggml_rope_ext(
  6816. ctx0, q_pe, inp_pos, rope_factors,
  6817. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  6818. ext_factor, attn_factor, beta_fast, beta_slow
  6819. );
  6820. cb(q_pe, "q_pe", il);
  6821. // shared RoPE key
  6822. k_pe = ggml_cont(ctx0, k_pe); // TODO: the CUDA backend used to not support non-cont. RoPE, investigate removing this
  6823. k_pe = ggml_rope_ext(
  6824. ctx0, k_pe, inp_pos, rope_factors,
  6825. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  6826. ext_factor, attn_factor, beta_fast, beta_slow
  6827. );
  6828. cb(k_pe, "k_pe", il);
  6829. ggml_tensor * q_states = ggml_concat(ctx0, q_nope, q_pe, 0);
  6830. cb(q_states, "q_states", il);
  6831. ggml_tensor * k_states = ggml_concat(ctx0, k_nope, ggml_repeat(ctx0, k_pe, q_pe), 0);
  6832. cb(k_states, "k_states", il);
  6833. cur = build_attn(inp_attn, gf,
  6834. model.layers[il].wo, NULL,
  6835. q_states, k_states, v_states, nullptr, nullptr, kq_scale, il);
  6836. }
  6837. if (il == n_layer - 1 && inp_out_ids) {
  6838. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  6839. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  6840. }
  6841. // scale_res - scale the hidden states for residual connection
  6842. const float scale_res = scale_depth/sqrtf(float(n_layer)); // TODO: is this correct?
  6843. cur = ggml_scale(ctx0, cur, scale_res);
  6844. cb(cur, "hidden_scaled", il);
  6845. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  6846. cb(ffn_inp, "ffn_inp", il);
  6847. // feed-forward network
  6848. {
  6849. cur = build_norm(ffn_inp,
  6850. model.layers[il].ffn_norm, NULL,
  6851. LLM_NORM_RMS, il);
  6852. cb(cur, "ffn_norm", il);
  6853. cur = build_ffn(cur,
  6854. model.layers[il].ffn_up, NULL, NULL,
  6855. model.layers[il].ffn_gate, NULL, NULL,
  6856. model.layers[il].ffn_down, NULL, NULL,
  6857. NULL,
  6858. LLM_FFN_SILU, LLM_FFN_PAR, il);
  6859. cb(cur, "ffn_out", il);
  6860. }
  6861. // scale the hidden states for residual connection
  6862. cur = ggml_scale(ctx0, cur, scale_res);
  6863. cb(cur, "hidden_scaled_ffn", il);
  6864. cur = ggml_add(ctx0, cur, ffn_inp);
  6865. cur = build_cvec(cur, il);
  6866. cb(cur, "l_out", il);
  6867. // input for next layer
  6868. inpL = cur;
  6869. }
  6870. cur = inpL;
  6871. cur = build_norm(cur,
  6872. model.output_norm, NULL,
  6873. LLM_NORM_RMS, -1);
  6874. cb(cur, "result_norm", -1);
  6875. res->t_embd = cur;
  6876. // lm_head scaling
  6877. const float scale_lmhead = float(n_embd_base)/float(n_embd);
  6878. cur = ggml_scale(ctx0, cur, scale_lmhead);
  6879. cb(cur, "lmhead_scaling", -1);
  6880. // lm_head
  6881. cur = build_lora_mm(model.output, cur);
  6882. cb(cur, "result_output", -1);
  6883. res->t_logits = cur;
  6884. ggml_build_forward_expand(gf, cur);
  6885. }
  6886. };
  6887. struct llm_build_gemma : public llm_graph_context {
  6888. llm_build_gemma(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
  6889. const int64_t n_embd_head = hparams.n_embd_head_v;
  6890. ggml_tensor * cur;
  6891. ggml_tensor * inpL;
  6892. inpL = build_inp_embd(model.tok_embd);
  6893. inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd));
  6894. cb(inpL, "inp_scaled", -1);
  6895. // inp_pos - contains the positions
  6896. ggml_tensor * inp_pos = build_inp_pos();
  6897. auto * inp_attn = build_attn_inp_kv_unified();
  6898. ggml_tensor * inp_out_ids = build_inp_out_ids();
  6899. for (int il = 0; il < n_layer; ++il) {
  6900. // norm
  6901. cur = build_norm(inpL,
  6902. model.layers[il].attn_norm, NULL,
  6903. LLM_NORM_RMS, il);
  6904. cb(cur, "attn_norm", il);
  6905. // self-attention
  6906. {
  6907. // compute Q and K and RoPE them
  6908. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  6909. cb(Qcur, "Qcur", il);
  6910. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  6911. cb(Kcur, "Kcur", il);
  6912. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  6913. cb(Vcur, "Vcur", il);
  6914. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  6915. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  6916. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  6917. Qcur = ggml_rope_ext(
  6918. ctx0, Qcur, inp_pos, nullptr,
  6919. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  6920. ext_factor, attn_factor, beta_fast, beta_slow);
  6921. Kcur = ggml_rope_ext(
  6922. ctx0, Kcur, inp_pos, nullptr,
  6923. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  6924. ext_factor, attn_factor, beta_fast, beta_slow);
  6925. cb(Qcur, "Qcur", il);
  6926. cb(Kcur, "Kcur", il);
  6927. cb(Vcur, "Vcur", il);
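// fold the usual 1/sqrt(n_embd_head) attention scale into Q up front; build_attn below is then called with kq_scale = 1.0f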
  6928. Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head)));
  6929. cb(Qcur, "Qcur_scaled", il);
  6930. cur = build_attn(inp_attn, gf,
  6931. model.layers[il].wo, NULL,
  6932. Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
  6933. }
  6934. if (il == n_layer - 1 && inp_out_ids) {
  6935. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  6936. inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
  6937. }
  6938. ggml_tensor * sa_out = ggml_add(ctx0, cur, inpL);
  6939. cb(sa_out, "sa_out", il);
  6940. cur = build_norm(sa_out,
  6941. model.layers[il].ffn_norm, NULL,
  6942. LLM_NORM_RMS, il);
  6943. cb(cur, "ffn_norm", il);
  6944. // feed-forward network
  6945. {
  6946. cur = build_ffn(cur,
  6947. model.layers[il].ffn_up, NULL, NULL,
  6948. model.layers[il].ffn_gate, NULL, NULL,
  6949. model.layers[il].ffn_down, NULL, NULL,
  6950. NULL,
  6951. LLM_FFN_GELU, LLM_FFN_PAR, il);
  6952. cb(cur, "ffn_out", il);
  6953. }
  6954. cur = ggml_add(ctx0, cur, sa_out);
  6955. cur = build_cvec(cur, il);
  6956. cb(cur, "l_out", il);
  6957. // input for next layer
  6958. inpL = cur;
  6959. }
  6960. cur = inpL;
  6961. cur = build_norm(cur,
  6962. model.output_norm, NULL,
  6963. LLM_NORM_RMS, -1);
  6964. cb(cur, "result_norm", -1);
  6965. res->t_embd = cur;
  6966. // lm_head
  6967. cur = build_lora_mm(model.output, cur);
  6968. cb(cur, "result_output", -1);
  6969. res->t_logits = cur;
  6970. ggml_build_forward_expand(gf, cur);
  6971. }
  6972. };
  6973. struct llm_build_gemma2_iswa : public llm_graph_context {
  6974. llm_build_gemma2_iswa(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
  6975. const int64_t n_embd_head = hparams.n_embd_head_k;
  6976. ggml_tensor * cur;
  6977. ggml_tensor * inpL;
  6978. inpL = build_inp_embd(model.tok_embd);
  6979. inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd));
  6980. cb(inpL, "inp_scaled", -1);
  6981. // inp_pos - contains the positions
  6982. ggml_tensor * inp_pos = build_inp_pos();
  6983. auto * inp_attn = build_attn_inp_kv_unified_iswa();
  6984. ggml_tensor * inp_out_ids = build_inp_out_ids();
  6985. for (int il = 0; il < n_layer; ++il) {
  6986. // norm
  6987. cur = build_norm(inpL,
  6988. model.layers[il].attn_norm, NULL,
  6989. LLM_NORM_RMS, il);
  6990. cb(cur, "attn_norm", il);
  6991. // self-attention
  6992. {
  6993. // compute Q and K and RoPE them
  6994. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  6995. cb(Qcur, "Qcur", il);
  6996. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  6997. cb(Kcur, "Kcur", il);
  6998. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  6999. cb(Vcur, "Vcur", il);
  7000. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  7001. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  7002. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  7003. Qcur = ggml_rope_ext(
  7004. ctx0, Qcur, inp_pos, nullptr,
  7005. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  7006. ext_factor, attn_factor, beta_fast, beta_slow);
  7007. Kcur = ggml_rope_ext(
  7008. ctx0, Kcur, inp_pos, nullptr,
  7009. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  7010. ext_factor, attn_factor, beta_fast, beta_slow);
  7011. cb(Qcur, "Qcur", il);
  7012. cb(Kcur, "Kcur", il);
  7013. cb(Vcur, "Vcur", il);
  7014. Qcur = ggml_scale(ctx0, Qcur, hparams.f_attention_scale);
  7015. cur = build_attn(inp_attn, gf,
  7016. model.layers[il].wo, NULL,
  7017. Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
  7018. }
  7019. if (il == n_layer - 1 && inp_out_ids) {
  7020. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  7021. inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
  7022. }
  7023. cur = build_norm(cur,
  7024. model.layers[il].attn_post_norm, NULL,
  7025. LLM_NORM_RMS, il);
  7026. cb(cur, "attn_post_norm", il);
  7027. ggml_tensor * sa_out = ggml_add(ctx0, cur, inpL);
  7028. cb(sa_out, "sa_out", il);
  7029. cur = build_norm(sa_out,
  7030. model.layers[il].ffn_norm, NULL,
  7031. LLM_NORM_RMS, il);
  7032. cb(cur, "ffn_norm", il);
  7033. // feed-forward network
  7034. {
  7035. cur = build_ffn(cur,
  7036. model.layers[il].ffn_up, NULL, NULL,
  7037. model.layers[il].ffn_gate, NULL, NULL,
  7038. model.layers[il].ffn_down, NULL, NULL,
  7039. NULL,
  7040. LLM_FFN_GELU, LLM_FFN_PAR, il);
  7041. cb(cur, "ffn_out", il);
  7042. }
  7043. cur = build_norm(cur,
  7044. model.layers[il].ffn_post_norm, NULL,
  7045. LLM_NORM_RMS, -1);
  7046. cb(cur, "ffn_post_norm", -1);
  7047. cur = ggml_add(ctx0, cur, sa_out);
  7048. cur = build_cvec(cur, il);
  7049. cb(cur, "l_out", il);
  7050. // input for next layer
  7051. inpL = cur;
  7052. }
  7053. cur = inpL;
  7054. cur = build_norm(cur,
  7055. model.output_norm, NULL,
  7056. LLM_NORM_RMS, -1);
  7057. cb(cur, "result_norm", -1);
  7058. res->t_embd = cur;
  7059. // lm_head
  7060. cur = build_lora_mm(model.output, cur);
  7061. // final logit soft-capping
  7062. cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_final_logit_softcapping);
  7063. cur = ggml_tanh(ctx0, cur);
  7064. cur = ggml_scale(ctx0, cur, hparams.f_final_logit_softcapping);
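// i.e. logits = cap * tanh(logits / cap) with cap = f_final_logit_softcapping, bounding the logits to (-cap, cap)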
  7065. cb(cur, "result_output", -1);
  7066. res->t_logits = cur;
  7067. ggml_build_forward_expand(gf, cur);
  7068. }
  7069. };
  7070. struct llm_build_gemma3_iswa : public llm_graph_context {
  7071. llm_build_gemma3_iswa(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
  7072. const int64_t n_embd_head = hparams.n_embd_head_k;
  7073. ggml_tensor * cur;
  7074. ggml_tensor * inpL;
  7075. inpL = build_inp_embd(model.tok_embd);
7076. // important: do not normalize weights for raw embeddings input (i.e. encoded image embeddings)
  7077. if (ubatch.token) {
  7078. inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd));
  7079. cb(inpL, "inp_scaled", -1);
  7080. }
  7081. // inp_pos - contains the positions
  7082. ggml_tensor * inp_pos = build_inp_pos();
  7083. // TODO: is causal == true correct? might need some changes
  7084. auto * inp_attn = build_attn_inp_kv_unified_iswa();
  7085. ggml_tensor * inp_out_ids = build_inp_out_ids();
  7086. for (int il = 0; il < n_layer; ++il) {
  7087. const float freq_base_l = model.get_rope_freq_base (cparams, il);
  7088. const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
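// RoPE parameters are looked up per layer: sliding-window (local) layers and global layers typically use a different frequency base/scale in Gemma 3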
  7089. // norm
  7090. cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
  7091. cb(cur, "attn_norm", il);
  7092. // self-attention
  7093. {
  7094. // compute Q and K and RoPE them
  7095. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  7096. cb(Qcur, "Qcur", il);
  7097. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  7098. cb(Kcur, "Kcur", il);
  7099. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  7100. cb(Vcur, "Vcur", il);
  7101. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  7102. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  7103. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  7104. Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
  7105. cb(Qcur, "Qcur_normed", il);
  7106. Qcur = ggml_rope_ext(
  7107. ctx0, Qcur, inp_pos, nullptr,
  7108. n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
  7109. ext_factor, attn_factor, beta_fast, beta_slow);
  7110. Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
  7111. cb(Kcur, "Kcur_normed", il);
  7112. Kcur = ggml_rope_ext(
  7113. ctx0, Kcur, inp_pos, nullptr,
  7114. n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
  7115. ext_factor, attn_factor, beta_fast, beta_slow);
  7116. cb(Qcur, "Qcur", il);
  7117. cb(Kcur, "Kcur", il);
  7118. cb(Vcur, "Vcur", il);
  7119. // ref: https://github.com/google/gemma_pytorch/blob/014acb7ac4563a5f77c76d7ff98f31b568c16508/gemma/model.py#L315
  7120. Qcur = ggml_scale(ctx0, Qcur, hparams.f_attention_scale);
  7121. cur = build_attn(inp_attn, gf,
  7122. model.layers[il].wo, NULL,
  7123. Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
  7124. }
  7125. if (il == n_layer - 1 && inp_out_ids) {
  7126. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  7127. inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
  7128. }
  7129. cur = build_norm(cur,
  7130. model.layers[il].attn_post_norm, NULL,
  7131. LLM_NORM_RMS, il);
  7132. cb(cur, "attn_post_norm", il);
  7133. ggml_tensor * sa_out = ggml_add(ctx0, cur, inpL);
  7134. cb(sa_out, "sa_out", il);
  7135. cur = build_norm(sa_out,
  7136. model.layers[il].ffn_norm, NULL,
  7137. LLM_NORM_RMS, il);
  7138. cb(cur, "ffn_norm", il);
  7139. // feed-forward network
  7140. {
  7141. cur = build_ffn(cur,
  7142. model.layers[il].ffn_up, NULL, NULL,
  7143. model.layers[il].ffn_gate, NULL, NULL,
  7144. model.layers[il].ffn_down, NULL, NULL,
  7145. NULL,
  7146. LLM_FFN_GELU, LLM_FFN_PAR, il);
  7147. cb(cur, "ffn_out", il);
  7148. }
  7149. cur = build_norm(cur,
  7150. model.layers[il].ffn_post_norm, NULL,
  7151. LLM_NORM_RMS, -1);
  7152. cb(cur, "ffn_post_norm", -1);
  7153. cur = ggml_add(ctx0, cur, sa_out);
  7154. cur = build_cvec(cur, il);
  7155. cb(cur, "l_out", il);
  7156. // input for next layer
  7157. inpL = cur;
  7158. }
  7159. cur = inpL;
  7160. cur = build_norm(cur,
  7161. model.output_norm, NULL,
  7162. LLM_NORM_RMS, -1);
  7163. cb(cur, "result_norm", -1);
  7164. res->t_embd = cur;
  7165. // lm_head
  7166. cur = build_lora_mm(model.output, cur);
  7167. cb(cur, "result_output", -1);
  7168. res->t_logits = cur;
  7169. ggml_build_forward_expand(gf, cur);
  7170. }
  7171. };
  7172. // TODO: move up next to build_starcoder
  7173. struct llm_build_starcoder2 : public llm_graph_context {
  7174. llm_build_starcoder2(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
  7175. const int64_t n_embd_head = hparams.n_embd_head_v;
  7176. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  7177. GGML_ASSERT(n_embd_head == hparams.n_rot);
  7178. ggml_tensor * cur;
  7179. ggml_tensor * inpL;
  7180. inpL = build_inp_embd(model.tok_embd);
  7181. // inp_pos - contains the positions
  7182. ggml_tensor * inp_pos = build_inp_pos();
  7183. auto * inp_attn = build_attn_inp_kv_unified();
  7184. ggml_tensor * inp_out_ids = build_inp_out_ids();
  7185. for (int il = 0; il < n_layer; ++il) {
  7186. ggml_tensor * inpSA = inpL;
  7187. // norm
  7188. cur = build_norm(inpL,
  7189. model.layers[il].attn_norm, model.layers[il].attn_norm_b,
  7190. LLM_NORM, il);
  7191. cb(cur, "attn_norm", il);
  7192. // self-attention
  7193. {
  7194. // compute Q and K and RoPE them
  7195. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  7196. cb(Qcur, "Qcur", il);
  7197. if (model.layers[il].bq) {
  7198. Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
  7199. cb(Qcur, "Qcur", il);
  7200. }
  7201. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  7202. cb(Kcur, "Kcur", il);
  7203. if (model.layers[il].bk) {
  7204. Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
  7205. cb(Kcur, "Kcur", il);
  7206. }
  7207. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  7208. cb(Vcur, "Vcur", il);
  7209. if (model.layers[il].bv) {
  7210. Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
  7211. cb(Vcur, "Vcur", il);
  7212. }
  7213. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  7214. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  7215. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  7216. Qcur = ggml_rope_ext(
  7217. ctx0, Qcur, inp_pos, nullptr,
  7218. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  7219. ext_factor, attn_factor, beta_fast, beta_slow
  7220. );
  7221. Kcur = ggml_rope_ext(
  7222. ctx0, Kcur, inp_pos, nullptr,
  7223. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  7224. ext_factor, attn_factor, beta_fast, beta_slow
  7225. );
  7226. cb(Qcur, "Qcur", il);
  7227. cb(Kcur, "Kcur", il);
  7228. cb(Vcur, "Vcur", il);
  7229. cur = build_attn(inp_attn, gf,
  7230. model.layers[il].wo, model.layers[il].bo,
  7231. Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  7232. }
  7233. if (il == n_layer - 1 && inp_out_ids) {
  7234. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  7235. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  7236. }
  7237. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  7238. cb(ffn_inp, "ffn_inp", il);
  7239. // feed-forward network
  7240. cur = build_norm(ffn_inp,
  7241. model.layers[il].ffn_norm, model.layers[il].ffn_norm_b,
  7242. LLM_NORM, il);
  7243. cb(cur, "ffn_norm", il);
  7244. cur = build_ffn(cur,
  7245. model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
  7246. NULL, NULL, NULL,
  7247. model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
  7248. NULL,
  7249. LLM_FFN_GELU, LLM_FFN_SEQ, il);
  7250. cb(cur, "ffn_out", il);
  7251. cur = ggml_add(ctx0, cur, ffn_inp);
  7252. cur = build_cvec(cur, il);
  7253. cb(cur, "l_out", il);
  7254. // input for next layer
  7255. inpL = cur;
  7256. }
  7257. cur = inpL;
  7258. cur = build_norm(cur,
  7259. model.output_norm, model.output_norm_b,
  7260. LLM_NORM, -1);
  7261. cb(cur, "result_norm", -1);
  7262. res->t_embd = cur;
  7263. // lm_head
  7264. cur = build_lora_mm(model.output, cur);
  7265. cb(cur, "result_output", -1);
  7266. res->t_logits = cur;
  7267. ggml_build_forward_expand(gf, cur);
  7268. }
  7269. };
  7270. struct llm_build_mamba : public llm_graph_context {
  7271. const llama_model & model;
  7272. llm_build_mamba(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params), model(model) {
  7273. ggml_tensor * cur;
  7274. ggml_tensor * inpL;
  7275. // {n_embd, n_tokens}
  7276. inpL = build_inp_embd(model.tok_embd);
  7277. auto * rs_inp = build_rs_inp();
  7278. ggml_tensor * inp_out_ids = build_inp_out_ids();
  7279. for (int il = 0; il < n_layer; ++il) {
  7280. // norm
  7281. cur = build_norm(inpL,
  7282. model.layers[il].attn_norm, NULL,
  7283. LLM_NORM_RMS, il);
  7284. cb(cur, "attn_norm", il);
  7285. cur = build_mamba_layer(rs_inp, gf, cur, ubatch, il);
  7286. if (il == n_layer - 1 && inp_out_ids) {
  7287. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  7288. inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
  7289. }
  7290. // residual
  7291. cur = ggml_add(ctx0, cur, inpL);
  7292. cur = build_cvec(cur, il);
  7293. cb(cur, "l_out", il);
  7294. // input for next layer
  7295. inpL = cur;
  7296. }
  7297. // final rmsnorm
  7298. cur = build_norm(inpL,
  7299. model.output_norm, NULL,
  7300. LLM_NORM_RMS, -1);
  7301. cb(cur, "result_norm", -1);
  7302. res->t_embd = cur;
  7303. // lm_head
  7304. cur = build_lora_mm(model.output, cur);
  7305. cb(cur, "result_output", -1);
  7306. res->t_logits = cur;
  7307. ggml_build_forward_expand(gf, cur);
  7308. }
  7309. // TODO: split
  7310. ggml_tensor * build_mamba_layer(
  7311. llm_graph_input_rs * inp,
  7312. ggml_cgraph * gf,
  7313. ggml_tensor * cur,
  7314. const llama_ubatch & ubatch,
  7315. int il) const {
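// Mamba block, in outline:
//  1. project the input to x and z ({n_embd} -> {2*d_inner})
//  2. run a short depthwise causal conv + SiLU over x, carrying (d_conv - 1) columns of conv state
//  3. apply the selective SSM scan to x, carrying a {d_state, d_inner} state per sequence
//  4. gate with SiLU(z) and project back to {n_embd}
// conv and SSM states are read from and written back to the recurrent cache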
  7316. const auto * mctx_cur = static_cast<const llama_memory_recurrent_context *>(mctx);
  7317. const auto kv_head = mctx_cur->get_head();
  7318. const int64_t d_conv = hparams.ssm_d_conv;
  7319. const int64_t d_inner = hparams.ssm_d_inner;
  7320. const int64_t d_state = hparams.ssm_d_state;
  7321. const int64_t dt_rank = hparams.ssm_dt_rank;
  7322. const int64_t n_seqs = ubatch.n_seqs;
7323. // Some variants of the Mamba arch (e.g. FalconMamba) apply layer norm on the B, C and Dt layers
  7324. const bool ssm_dt_b_c_rms = hparams.ssm_dt_b_c_rms;
  7325. // Use the same RMS norm as the final layer norm
  7326. const float norm_rms_eps = hparams.f_norm_rms_eps;
  7327. const int64_t n_seq_tokens = ubatch.n_seq_tokens;
  7328. GGML_ASSERT(n_seqs != 0);
  7329. GGML_ASSERT(ubatch.equal_seqs);
  7330. GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs);
  7331. ggml_tensor * conv_states_all = mctx_cur->get_r_l(il);
  7332. ggml_tensor * ssm_states_all = mctx_cur->get_s_l(il);
  7333. // (ab)using the KV cache to store the states
  7334. ggml_tensor * conv = build_rs(
  7335. inp, gf, conv_states_all,
  7336. hparams.n_embd_r(), n_seqs);
  7337. conv = ggml_reshape_3d(ctx0, conv, d_conv - 1, d_inner, n_seqs);
  7338. ggml_tensor * ssm = build_rs(
  7339. inp, gf, ssm_states_all,
  7340. hparams.n_embd_s(), n_seqs);
  7341. ssm = ggml_reshape_3d(ctx0, ssm, d_state, d_inner, n_seqs);
  7342. // {n_embd, n_tokens} => {n_embd, n_seq_tokens, n_seqs}
  7343. cur = ggml_reshape_3d(ctx0, cur, cur->ne[0], n_seq_tokens, n_seqs);
  7344. // {n_embd, 2*d_inner} @ {n_embd, n_seq_tokens, n_seqs} => {2*d_inner, n_seq_tokens, n_seqs}
  7345. ggml_tensor * xz = build_lora_mm(model.layers[il].ssm_in, cur);
  7346. // split the above in two
  7347. // => {d_inner, n_seq_tokens, n_seqs}
  7348. ggml_tensor * x = ggml_view_3d(ctx0, xz, d_inner, xz->ne[1], xz->ne[2], xz->nb[1], xz->nb[2], 0);
  7349. ggml_tensor * z = ggml_view_3d(ctx0, xz, d_inner, xz->ne[1], xz->ne[2], xz->nb[1], xz->nb[2], d_inner*ggml_element_size(xz));
  7350. // conv
  7351. {
  7352. // => {d_conv - 1 + n_seq_tokens, d_inner, n_seqs}
  7353. ggml_tensor * conv_x = ggml_concat(ctx0, conv, ggml_transpose(ctx0, x), 0);
  7354. // copy last (d_conv - 1) columns back into the state cache
  7355. ggml_tensor * last_conv = ggml_view_3d(ctx0, conv_x, d_conv - 1, d_inner, n_seqs, conv_x->nb[1], conv_x->nb[2], n_seq_tokens*(conv_x->nb[0]));
  7356. ggml_build_forward_expand(gf,
  7357. ggml_cpy(ctx0, last_conv,
  7358. ggml_view_1d(ctx0, conv_states_all,
  7359. (d_conv - 1)*(d_inner)*(n_seqs),
  7360. kv_head*(d_conv - 1)*(d_inner)*ggml_element_size(conv_states_all))));
  7361. // 1D convolution
  7362. // The equivalent is to make a self-overlapping view of conv_x
  7363. // over d_conv columns at each stride in the 3rd dimension,
  7364. // then element-wise multiply that with the conv1d weight,
  7365. // then sum the elements of each row,
  7366. // (the last two steps are a dot product over rows (also doable with mul_mat))
  7367. // then permute away the ne[0] dimension,
  7368. // and then you're left with the resulting x tensor.
  7369. // For simultaneous sequences, all sequences need to have the same length.
  7370. x = ggml_ssm_conv(ctx0, conv_x, model.layers[il].ssm_conv1d);
  7371. // bias
  7372. x = ggml_add(ctx0, x, model.layers[il].ssm_conv1d_b);
  7373. x = ggml_silu(ctx0, x);
  7374. }
  7375. // ssm
  7376. {
  7377. // {d_inner, dt_rank + 2*d_state} @ {d_inner, n_seq_tokens, n_seqs} => {dt_rank + 2*d_state, n_seq_tokens, n_seqs}
  7378. ggml_tensor * x_db = build_lora_mm(model.layers[il].ssm_x, x);
  7379. // split
  7380. ggml_tensor * dt = ggml_view_3d(ctx0, x_db, dt_rank, n_seq_tokens, n_seqs, x_db->nb[1], x_db->nb[2], 0);
  7381. ggml_tensor * B = ggml_view_3d(ctx0, x_db, d_state, n_seq_tokens, n_seqs, x_db->nb[1], x_db->nb[2], ggml_element_size(x_db)*dt_rank);
  7382. ggml_tensor * C = ggml_view_3d(ctx0, x_db, d_state, n_seq_tokens, n_seqs, x_db->nb[1], x_db->nb[2], ggml_element_size(x_db)*(dt_rank+d_state));
  7383. // Some Mamba variants (e.g. FalconMamba) apply RMS norm in B, C & Dt layers
  7384. if (ssm_dt_b_c_rms) {
  7385. dt = ggml_rms_norm(ctx0, dt, norm_rms_eps);
  7386. B = ggml_rms_norm(ctx0, B, norm_rms_eps);
  7387. C = ggml_rms_norm(ctx0, C, norm_rms_eps);
  7388. }
  7389. // {dt_rank, d_inner} @ {dt_rank, n_seq_tokens, n_seqs} => {d_inner, n_seq_tokens, n_seqs}
  7390. dt = build_lora_mm(model.layers[il].ssm_dt, dt);
  7391. dt = ggml_add(ctx0, dt, model.layers[il].ssm_dt_b);
  7392. // Custom operator to optimize the parallel associative scan
  7393. // as described in the Annex D of the Mamba paper.
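// roughly, per channel: h_t = exp(dt*A) * h_{t-1} + dt*B_t * x_t  and  y_t = dot(C_t, h_t)
// (the D*x skip connection and the SiLU(z) gate are applied just below)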
  7394. // => {d_inner, n_seq_tokens, n_seqs} and {d_state, d_inner, n_seqs}
  7395. ggml_tensor * y_ssm = ggml_ssm_scan(ctx0, ssm, x, dt, model.layers[il].ssm_a, B, C);
  7396. // store last states
  7397. ggml_build_forward_expand(gf,
  7398. ggml_cpy(ctx0,
  7399. ggml_view_1d(ctx0, y_ssm, d_state*d_inner*n_seqs, x->nb[3]),
  7400. ggml_view_1d(ctx0, ssm_states_all, d_state*d_inner*n_seqs, kv_head*d_state*d_inner*ggml_element_size(ssm_states_all))));
  7401. ggml_tensor * y = ggml_view_3d(ctx0, y_ssm, d_inner, n_seq_tokens, n_seqs, x->nb[1], x->nb[2], 0);
  7402. // TODO: skip computing output earlier for unused tokens
  7403. // {d_inner, n_seq_tokens, n_seqs} * {d_inner} => {d_inner, n_seq_tokens, n_seqs}
  7404. y = ggml_add(ctx0, y, ggml_mul(ctx0, x, model.layers[il].ssm_d));
  7405. y = ggml_mul(ctx0, y, ggml_silu(ctx0, ggml_cont(ctx0, z)));
  7406. // {d_inner, n_embd} @ {d_inner, n_seq_tokens, n_seqs} => {n_embd, n_seq_tokens, n_seqs}
  7407. cur = build_lora_mm(model.layers[il].ssm_out, y);
  7408. }
  7409. // {n_embd, n_seq_tokens, n_seqs} => {n_embd, n_tokens}
  7410. cur = ggml_reshape_2d(ctx0, cur, cur->ne[0], n_seq_tokens * n_seqs);
  7411. //cb(cur, "mamba_out", il);
  7412. return cur;
  7413. }
  7414. };
  7415. struct llm_build_command_r : public llm_graph_context {
  7416. llm_build_command_r(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
  7417. const int64_t n_embd_head = hparams.n_embd_head_v;
  7418. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  7419. const float f_logit_scale = hparams.f_logit_scale;
  7420. ggml_tensor * cur;
  7421. ggml_tensor * inpL;
  7422. inpL = build_inp_embd(model.tok_embd);
  7423. // inp_pos - contains the positions
  7424. ggml_tensor * inp_pos = build_inp_pos();
  7425. auto * inp_attn = build_attn_inp_kv_unified();
  7426. ggml_tensor * inp_out_ids = build_inp_out_ids();
  7427. for (int il = 0; il < n_layer; ++il) {
  7428. // norm
  7429. cur = build_norm(inpL,
  7430. model.layers[il].attn_norm, NULL,
  7431. LLM_NORM, il);
  7432. cb(cur, "attn_norm", il);
  7433. ggml_tensor * ffn_inp = cur;
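// note: attention and the FFN both consume the same normalized input (parallel residual block); their outputs are summed with the residual at the end of the layer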
  7434. // self-attention
  7435. {
  7436. // compute Q and K and RoPE them
  7437. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  7438. cb(Qcur, "Qcur", il);
  7439. if (model.layers[il].bq) {
  7440. Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
  7441. cb(Qcur, "Qcur", il);
  7442. }
  7443. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  7444. cb(Kcur, "Kcur", il);
  7445. if (model.layers[il].bk) {
  7446. Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
  7447. cb(Kcur, "Kcur", il);
  7448. }
  7449. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  7450. cb(Vcur, "Vcur", il);
  7451. if (model.layers[il].bv) {
  7452. Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
  7453. cb(Vcur, "Vcur", il);
  7454. }
  7455. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  7456. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  7457. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  7458. if (model.layers[il].attn_q_norm) {
  7459. Qcur = build_norm(Qcur,
  7460. model.layers[il].attn_q_norm,
  7461. NULL,
  7462. LLM_NORM, il);
  7463. cb(Qcur, "Qcur", il);
  7464. }
  7465. Qcur = ggml_rope_ext(
  7466. ctx0, Qcur, inp_pos, nullptr,
  7467. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  7468. ext_factor, attn_factor, beta_fast, beta_slow
  7469. );
  7470. if (model.layers[il].attn_k_norm) {
  7471. Kcur = build_norm(Kcur,
  7472. model.layers[il].attn_k_norm,
  7473. NULL,
  7474. LLM_NORM, il);
  7475. cb(Kcur, "Kcur", il);
  7476. }
  7477. Kcur = ggml_rope_ext(
  7478. ctx0, Kcur, inp_pos, nullptr,
  7479. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  7480. ext_factor, attn_factor, beta_fast, beta_slow
  7481. );
  7482. cb(Qcur, "Qcur", il);
  7483. cb(Kcur, "Kcur", il);
  7484. cb(Vcur, "Vcur", il);
  7485. cur = build_attn(inp_attn, gf,
  7486. model.layers[il].wo, model.layers[il].bo,
  7487. Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  7488. }
  7489. if (il == n_layer - 1 && inp_out_ids) {
  7490. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  7491. inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
  7492. ffn_inp = ggml_get_rows(ctx0, ffn_inp, inp_out_ids);
  7493. }
  7494. ggml_tensor * attn_out = cur;
  7495. // feed-forward network
  7496. {
  7497. cur = build_ffn(ffn_inp,
  7498. model.layers[il].ffn_up, NULL, NULL,
  7499. model.layers[il].ffn_gate, NULL, NULL,
  7500. model.layers[il].ffn_down, NULL, NULL,
  7501. NULL,
  7502. LLM_FFN_SILU, LLM_FFN_PAR, il);
  7503. cb(cur, "ffn_out", il);
  7504. }
  7505. // add together residual + FFN + self-attention
  7506. cur = ggml_add(ctx0, cur, inpL);
  7507. cur = ggml_add(ctx0, cur, attn_out);
  7508. cur = build_cvec(cur, il);
  7509. cb(cur, "l_out", il);
  7510. // input for next layer
  7511. inpL = cur;
  7512. }
  7513. cur = inpL;
  7514. cur = build_norm(cur,
  7515. model.output_norm, NULL,
  7516. LLM_NORM, -1);
  7517. cb(cur, "result_norm", -1);
  7518. res->t_embd = cur;
  7519. // lm_head
  7520. cur = build_lora_mm(model.output, cur);
  7521. if (f_logit_scale) {
  7522. cur = ggml_scale(ctx0, cur, f_logit_scale);
  7523. }
  7524. cb(cur, "result_output", -1);
  7525. res->t_logits = cur;
  7526. ggml_build_forward_expand(gf, cur);
  7527. }
  7528. };
  7529. struct llm_build_cohere2_iswa : public llm_graph_context {
  7530. llm_build_cohere2_iswa(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
  7531. const int64_t n_embd_head = hparams.n_embd_head_v;
  7532. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  7533. const float f_logit_scale = hparams.f_logit_scale;
  7534. ggml_tensor * cur;
  7535. ggml_tensor * inpL;
  7536. inpL = build_inp_embd(model.tok_embd);
  7537. // inp_pos - contains the positions
  7538. ggml_tensor * inp_pos = build_inp_pos();
  7539. auto * inp_attn = build_attn_inp_kv_unified_iswa();
  7540. ggml_tensor * inp_out_ids = build_inp_out_ids();
  7541. for (int il = 0; il < n_layer; ++il) {
  7542. const bool is_swa = hparams.is_swa(il);
  7543. // norm
  7544. cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM, il);
  7545. cb(cur, "attn_norm", il);
  7546. ggml_tensor * ffn_inp = cur;
  7547. // self-attention
  7548. {
  7549. // rope freq factors for 128k context
  7550. ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
  7551. // compute Q and K and RoPE them
  7552. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  7553. cb(Qcur, "Qcur", il);
  7554. if (model.layers[il].bq) {
  7555. Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
  7556. cb(Qcur, "Qcur", il);
  7557. }
  7558. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  7559. cb(Kcur, "Kcur", il);
  7560. if (model.layers[il].bk) {
  7561. Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
  7562. cb(Kcur, "Kcur", il);
  7563. }
  7564. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  7565. cb(Vcur, "Vcur", il);
  7566. if (model.layers[il].bv) {
  7567. Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
  7568. cb(Vcur, "Vcur", il);
  7569. }
  7570. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  7571. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  7572. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
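// RoPE is applied only on the sliding-window (local) layers; global-attention layers skip the rotation here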
  7573. if (is_swa) {
  7574. Qcur = ggml_rope_ext(
  7575. ctx0, Qcur, inp_pos, rope_factors,
  7576. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  7577. ext_factor, attn_factor, beta_fast, beta_slow
  7578. );
  7579. Kcur = ggml_rope_ext(
  7580. ctx0, Kcur, inp_pos, rope_factors,
  7581. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  7582. ext_factor, attn_factor, beta_fast, beta_slow
  7583. );
  7584. }
  7585. cb(Qcur, "Qcur", il);
  7586. cb(Kcur, "Kcur", il);
  7587. cb(Vcur, "Vcur", il);
  7588. cur = build_attn(inp_attn, gf,
  7589. model.layers[il].wo, model.layers[il].bo,
  7590. Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  7591. }
  7592. if (il == n_layer - 1 && inp_out_ids) {
  7593. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  7594. inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
  7595. ffn_inp = ggml_get_rows(ctx0, ffn_inp, inp_out_ids);
  7596. }
  7597. ggml_tensor * attn_out = cur;
  7598. // feed-forward network
  7599. {
  7600. cur = build_ffn(ffn_inp, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate,
  7601. NULL, NULL, model.layers[il].ffn_down, NULL, NULL, NULL, LLM_FFN_SILU, LLM_FFN_PAR,
  7602. il);
  7603. cb(cur, "ffn_out", il);
  7604. }
  7605. // add together residual + FFN + self-attention
  7606. cur = ggml_add(ctx0, cur, inpL);
  7607. cur = ggml_add(ctx0, cur, attn_out);
  7608. cur = build_cvec(cur, il);
  7609. cb(cur, "l_out", il);
  7610. // input for next layer
  7611. inpL = cur;
  7612. }
  7613. cur = inpL;
  7614. cur = build_norm(cur, model.output_norm, NULL, LLM_NORM, -1);
  7615. cb(cur, "result_norm", -1);
  7616. res->t_embd = cur;
  7617. // lm_head
  7618. cur = build_lora_mm(model.output, cur);
  7619. if (f_logit_scale) {
  7620. cur = ggml_scale(ctx0, cur, f_logit_scale);
  7621. }
  7622. cb(cur, "result_output", -1);
  7623. res->t_logits = cur;
  7624. ggml_build_forward_expand(gf, cur);
  7625. }
  7626. };
  7627. // ref: https://allenai.org/olmo
  7628. // based on the original build_llama() function, changes:
  7629. // * non-parametric layer norm
  7630. // * clamp qkv
  7631. // * removed bias
  7632. // * removed MoE
  7633. struct llm_build_olmo : public llm_graph_context {
  7634. llm_build_olmo(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
  7635. const int64_t n_embd_head = hparams.n_embd_head_v;
  7636. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  7637. GGML_ASSERT(n_embd_head == hparams.n_rot);
  7638. ggml_tensor * cur;
  7639. ggml_tensor * inpL;
  7640. inpL = build_inp_embd(model.tok_embd);
  7641. // inp_pos - contains the positions
  7642. ggml_tensor * inp_pos = build_inp_pos();
  7643. auto * inp_attn = build_attn_inp_kv_unified();
  7644. ggml_tensor * inp_out_ids = build_inp_out_ids();
  7645. for (int il = 0; il < n_layer; ++il) {
  7646. ggml_tensor * inpSA = inpL;
  7647. // norm
  7648. cur = build_norm(inpL,
  7649. NULL, NULL,
  7650. LLM_NORM, il);
  7651. cb(cur, "attn_norm", il);
  7652. // self-attention
  7653. {
  7654. // compute Q and K and RoPE them
  7655. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  7656. cb(Qcur, "Qcur", il);
  7657. if (hparams.f_clamp_kqv > 0.0f) {
  7658. Qcur = ggml_clamp(ctx0, Qcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
  7659. cb(Qcur, "Qcur", il);
  7660. }
  7661. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  7662. cb(Kcur, "Kcur", il);
  7663. if (hparams.f_clamp_kqv > 0.0f) {
  7664. Kcur = ggml_clamp(ctx0, Kcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
  7665. cb(Kcur, "Kcur", il);
  7666. }
  7667. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  7668. cb(Vcur, "Vcur", il);
  7669. if (hparams.f_clamp_kqv > 0.0f) {
  7670. Vcur = ggml_clamp(ctx0, Vcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
  7671. cb(Vcur, "Vcur", il);
  7672. }
  7673. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  7674. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  7675. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  7676. Qcur = ggml_rope_ext(
  7677. ctx0, Qcur, inp_pos, nullptr,
  7678. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  7679. ext_factor, attn_factor, beta_fast, beta_slow
  7680. );
  7681. Kcur = ggml_rope_ext(
  7682. ctx0, Kcur, inp_pos, nullptr,
  7683. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  7684. ext_factor, attn_factor, beta_fast, beta_slow
  7685. );
  7686. cb(Qcur, "Qcur", il);
  7687. cb(Kcur, "Kcur", il);
  7688. cb(Vcur, "Vcur", il);
  7689. cur = build_attn(inp_attn, gf,
  7690. model.layers[il].wo, nullptr,
  7691. Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  7692. }
  7693. if (il == n_layer - 1 && inp_out_ids) {
  7694. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  7695. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  7696. }
  7697. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  7698. cb(ffn_inp, "ffn_inp", il);
  7699. // feed-forward network
  7700. cur = build_norm(ffn_inp,
  7701. NULL, NULL,
  7702. LLM_NORM, il);
  7703. cb(cur, "ffn_norm", il);
  7704. cur = build_ffn(cur,
  7705. model.layers[il].ffn_up, NULL, NULL,
  7706. model.layers[il].ffn_gate, NULL, NULL,
  7707. model.layers[il].ffn_down, NULL, NULL,
  7708. NULL,
  7709. LLM_FFN_SILU, LLM_FFN_PAR, il);
  7710. cb(cur, "ffn_out", il);
  7711. cur = ggml_add(ctx0, cur, ffn_inp);
  7712. cb(cur, "ffn_out", il);
  7713. cur = build_cvec(cur, il);
  7714. cb(cur, "l_out", il);
  7715. // input for next layer
  7716. inpL = cur;
  7717. }
  7718. cur = inpL;
  7719. cur = build_norm(cur,
  7720. NULL, NULL,
  7721. LLM_NORM, -1);
  7722. cb(cur, "result_norm", -1);
  7723. res->t_embd = cur;
  7724. // lm_head
  7725. cur = build_lora_mm(model.output, cur);
  7726. cb(cur, "result_output", -1);
  7727. res->t_logits = cur;
  7728. ggml_build_forward_expand(gf, cur);
  7729. }
  7730. };
  7731. struct llm_build_olmo2 : public llm_graph_context {
  7732. llm_build_olmo2(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
  7733. const int64_t n_embd_head = hparams.n_embd_head_v;
  7734. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  7735. GGML_ASSERT(n_embd_head == hparams.n_rot);
  7736. ggml_tensor * cur;
  7737. ggml_tensor * inpL;
  7738. inpL = build_inp_embd(model.tok_embd);
  7739. // inp_pos - contains the positions
  7740. ggml_tensor * inp_pos = build_inp_pos();
  7741. auto * inp_attn = build_attn_inp_kv_unified();
  7742. ggml_tensor * inp_out_ids = build_inp_out_ids();
  7743. for (int il = 0; il < n_layer; ++il) {
  7744. ggml_tensor * inpSA = inpL;
  7745. cur = inpL;
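// OLMo-2 normalizes the sub-layer outputs (attn_post_norm / ffn_post_norm) instead of pre-normalizing the inputs, hence no attn_norm here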
  7746. // self_attention
  7747. {
  7748. // compute Q and K and RoPE them
  7749. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  7750. cb(Qcur, "Qcur", il);
  7751. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  7752. cb(Kcur, "Kcur", il);
  7753. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  7754. cb(Vcur, "Vcur", il);
  7755. Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL,
  7756. LLM_NORM_RMS, il);
  7757. cb(Qcur, "Qcur_normed", il);
  7758. Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL,
  7759. LLM_NORM_RMS, il);
  7760. cb(Kcur, "Kcur_normed", il);
  7761. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  7762. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  7763. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  7764. Qcur = ggml_rope_ext(
  7765. ctx0, Qcur, inp_pos, nullptr,
  7766. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  7767. ext_factor, attn_factor, beta_fast, beta_slow
  7768. );
  7769. Kcur = ggml_rope_ext(
  7770. ctx0, Kcur, inp_pos, nullptr,
  7771. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  7772. ext_factor, attn_factor, beta_fast, beta_slow
  7773. );
  7774. cb(Qcur, "Qcur", il);
  7775. cb(Kcur, "Kcur", il);
  7776. cb(Vcur, "Vcur", il);
  7777. cur = build_attn(inp_attn, gf,
  7778. model.layers[il].wo, NULL,
  7779. Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  7780. }
  7781. if (il == n_layer - 1 && inp_out_ids) {
  7782. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  7783. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  7784. }
  7785. cur = build_norm(cur,
  7786. model.layers[il].attn_post_norm, NULL,
  7787. LLM_NORM_RMS, il);
  7788. cb(cur, "attn_post_norm", il);
  7789. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  7790. cb(ffn_inp, "ffn_inp", il);
  7791. // feed-forward network
  7792. cur = build_ffn(ffn_inp,
  7793. model.layers[il].ffn_up, NULL, NULL,
  7794. model.layers[il].ffn_gate, NULL, NULL,
  7795. model.layers[il].ffn_down, NULL, NULL,
  7796. NULL,
  7797. LLM_FFN_SILU, LLM_FFN_PAR, il);
  7798. cb(cur, "ffn_out", il);
  7799. cur = build_norm(cur,
  7800. model.layers[il].ffn_post_norm, NULL,
  7801. LLM_NORM_RMS, -1);
  7802. cb(cur, "ffn_post_norm", -1);
  7803. cur = ggml_add(ctx0, cur, ffn_inp);
  7804. cb(cur, "ffn_out", il);
  7805. cur = build_cvec(cur, il);
  7806. cb(cur, "l_out", il);
  7807. // input for next layer
  7808. inpL = cur;
  7809. }
  7810. cur = inpL;
  7811. cur = build_norm(cur,
  7812. model.output_norm, NULL,
  7813. LLM_NORM_RMS, -1);
  7814. cb(cur, "result_norm", -1);
  7815. res->t_embd = cur;
  7816. // lm_head
  7817. cur = build_lora_mm(model.output, cur);
  7818. cb(cur, "result_output", -1);
  7819. res->t_logits = cur;
  7820. ggml_build_forward_expand(gf, cur);
  7821. }
  7822. };
  7823. // based on the build_qwen2moe() function, changes:
  7824. // * removed shared experts
  7825. // * removed bias
  7826. // * added q, k norm
  7827. struct llm_build_olmoe : public llm_graph_context {
  7828. llm_build_olmoe(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
  7829. const int64_t n_embd_head = hparams.n_embd_head_v;
  7830. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  7831. GGML_ASSERT(n_embd_head == hparams.n_rot);
  7832. ggml_tensor * cur;
  7833. ggml_tensor * inpL;
  7834. inpL = build_inp_embd(model.tok_embd);
  7835. // inp_pos - contains the positions
  7836. ggml_tensor * inp_pos = build_inp_pos();
  7837. auto * inp_attn = build_attn_inp_kv_unified();
  7838. ggml_tensor * inp_out_ids = build_inp_out_ids();
  7839. for (int il = 0; il < n_layer; ++il) {
  7840. ggml_tensor * inpSA = inpL;
  7841. // norm
  7842. cur = build_norm(inpL,
  7843. model.layers[il].attn_norm, NULL,
  7844. LLM_NORM_RMS, il);
  7845. cb(cur, "attn_norm", il);
  7846. // self_attention
  7847. {
  7848. // compute Q and K and RoPE them
  7849. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  7850. cb(Qcur, "Qcur", il);
  7851. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  7852. cb(Kcur, "Kcur", il);
  7853. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  7854. cb(Vcur, "Vcur", il);
  7855. Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL,
  7856. LLM_NORM_RMS, il);
  7857. cb(Qcur, "Qcur_normed", il);
  7858. Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL,
  7859. LLM_NORM_RMS, il);
  7860. cb(Kcur, "Kcur_normed", il);
  7861. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  7862. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  7863. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  7864. Qcur = ggml_rope_ext(
  7865. ctx0, Qcur, inp_pos, nullptr,
  7866. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  7867. ext_factor, attn_factor, beta_fast, beta_slow
  7868. );
  7869. Kcur = ggml_rope_ext(
  7870. ctx0, Kcur, inp_pos, nullptr,
  7871. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  7872. ext_factor, attn_factor, beta_fast, beta_slow
  7873. );
  7874. cb(Qcur, "Qcur", il);
  7875. cb(Kcur, "Kcur", il);
  7876. cb(Vcur, "Vcur", il);
  7877. cur = build_attn(inp_attn, gf,
  7878. model.layers[il].wo, NULL,
  7879. Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  7880. }
  7881. if (il == n_layer - 1 && inp_out_ids) {
  7882. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  7883. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  7884. }
  7885. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  7886. cb(ffn_inp, "ffn_inp", il);
  7887. // MoE branch
  7888. cur = build_norm(ffn_inp,
  7889. model.layers[il].ffn_norm, NULL,
  7890. LLM_NORM_RMS, il);
  7891. cb(cur, "ffn_norm", il);
  7892. cur = build_moe_ffn(cur,
  7893. model.layers[il].ffn_gate_inp,
  7894. model.layers[il].ffn_up_exps,
  7895. model.layers[il].ffn_gate_exps,
  7896. model.layers[il].ffn_down_exps,
  7897. nullptr,
  7898. n_expert, n_expert_used,
  7899. LLM_FFN_SILU, false,
  7900. false, 0.0,
  7901. LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
  7902. il);
  7903. cb(cur, "ffn_moe_out", il);
  7904. cur = ggml_add(ctx0, cur, ffn_inp);
  7905. cur = build_cvec(cur, il);
  7906. cb(cur, "l_out", il);
  7907. // input for next layer
  7908. inpL = cur;
  7909. }
  7910. cur = inpL;
  7911. cur = build_norm(cur,
  7912. model.output_norm, NULL,
  7913. LLM_NORM_RMS, -1);
  7914. cb(cur, "result_norm", -1);
  7915. res->t_embd = cur;
  7916. // lm_head
  7917. cur = build_lora_mm(model.output, cur);
  7918. cb(cur, "result_output", -1);
  7919. res->t_logits = cur;
  7920. ggml_build_forward_expand(gf, cur);
  7921. }
  7922. };
  7923. struct llm_build_openelm : public llm_graph_context {
  7924. llm_build_openelm(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
  7925. const int64_t n_embd_head = hparams.n_embd_head_v;
  7926. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  7927. ggml_tensor * cur;
  7928. ggml_tensor * inpL;
  7929. inpL = build_inp_embd(model.tok_embd);
  7930. // inp_pos - contains the positions
  7931. ggml_tensor * inp_pos = build_inp_pos();
  7932. auto * inp_attn = build_attn_inp_kv_unified();
  7933. ggml_tensor * inp_out_ids = build_inp_out_ids();
  7934. for (int il = 0; il < n_layer; ++il) {
  7935. const int64_t n_head = hparams.n_head(il);
  7936. const int64_t n_head_kv = hparams.n_head_kv(il);
  7937. const int64_t n_head_qkv = 2*n_head_kv + n_head;
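// OpenELM uses per-layer head counts; the fused QKV projection packs heads as [Q heads | K heads | V heads] along dim 1 and is sliced by head ranges below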
  7938. cur = inpL;
  7939. ggml_tensor * residual = cur;
  7940. // norm
  7941. cur = build_norm(inpL,
  7942. model.layers[il].attn_norm, NULL,
  7943. LLM_NORM_RMS, il);
  7944. cb(cur, "attn_norm", il);
  7945. // self-attention
  7946. {
  7947. cur = build_lora_mm(model.layers[il].wqkv, cur);
  7948. cb(cur, "wqkv", il);
  7949. cur = ggml_reshape_3d(ctx0, cur, n_embd_head_k, n_head_qkv, n_tokens);
  7950. ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, cur->nb[1], cur->nb[2], 0));
  7951. cb(Qcur, "Qcur", il);
  7952. ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, cur->nb[1], cur->nb[2], cur->nb[1]*n_head));
  7953. cb(Kcur, "Kcur", il);
  7954. ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, cur->nb[1], cur->nb[2], cur->nb[1]*(n_head+n_head_kv)));
  7955. cb(Vcur, "Vcur", il);
  7956. Qcur = build_norm(Qcur,
  7957. model.layers[il].attn_q_norm, NULL,
  7958. LLM_NORM_RMS, il);
  7959. cb(Qcur, "Qcur", il);
  7960. Kcur = build_norm(Kcur,
  7961. model.layers[il].attn_k_norm, NULL,
  7962. LLM_NORM_RMS, il);
  7963. cb(Kcur, "Kcur", il);
  7964. Qcur = ggml_rope_ext(
  7965. ctx0, Qcur, inp_pos, NULL,
  7966. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  7967. ext_factor, attn_factor, beta_fast, beta_slow
  7968. );
  7969. Kcur = ggml_rope_ext(
  7970. ctx0, Kcur, inp_pos, NULL,
  7971. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  7972. ext_factor, attn_factor, beta_fast, beta_slow
  7973. );
  7974. cb(Qcur, "Qcur", il);
  7975. cb(Kcur, "Kcur", il);
7976. cb(Vcur, "Vcur", il);
  7977. cur = build_attn(inp_attn, gf,
  7978. model.layers[il].wo, NULL,
  7979. Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  7980. }
  7981. if (il == n_layer - 1 && inp_out_ids) {
  7982. residual = ggml_get_rows(ctx0, residual, inp_out_ids);
  7983. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  7984. }
  7985. ggml_tensor * ffn_inp = ggml_add(ctx0, residual, cur);
  7986. cb(ffn_inp, "ffn_inp", il);
  7987. // feed-forward network
  7988. {
  7989. cur = build_norm(ffn_inp,
  7990. model.layers[il].ffn_norm, NULL,
  7991. LLM_NORM_RMS, il);
  7992. cb(cur, "ffn_norm", il);
  7993. cur = build_ffn(cur,
  7994. model.layers[il].ffn_up, NULL, NULL,
  7995. model.layers[il].ffn_gate, NULL, NULL,
  7996. model.layers[il].ffn_down, NULL, NULL,
  7997. NULL,
  7998. LLM_FFN_SILU, LLM_FFN_PAR, il);
  7999. cb(cur, "ffn_out", il);
  8000. }
  8001. cur = ggml_add(ctx0, cur, ffn_inp);
  8002. cur = build_cvec(cur, il);
  8003. cb(cur, "l_out", il);
  8004. inpL = cur;
  8005. }
  8006. cur = inpL;
  8007. // norm
  8008. cur = build_norm(cur,
  8009. model.output_norm, NULL,
  8010. LLM_NORM_RMS, -1);
  8011. cb(cur, "result_norm", -1);
  8012. res->t_embd = cur;
  8013. cur = build_lora_mm(model.output, cur);
  8014. cb(cur, "result_output", -1);
  8015. res->t_logits = cur;
  8016. ggml_build_forward_expand(gf, cur);
  8017. }
  8018. };
  8019. struct llm_build_gptneox : public llm_graph_context {
  8020. llm_build_gptneox(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
  8021. const int64_t n_embd_head = hparams.n_embd_head_v;
  8022. const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
  8023. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  8024. ggml_tensor * cur;
  8025. ggml_tensor * inpL;
  8026. inpL = build_inp_embd(model.tok_embd);
  8027. // inp_pos - contains the positions
  8028. ggml_tensor * inp_pos = build_inp_pos();
  8029. auto * inp_attn = build_attn_inp_kv_unified();
  8030. ggml_tensor * inp_out_ids = build_inp_out_ids();
  8031. for (int il = 0; il < n_layer; ++il) {
  8032. cur = build_norm(inpL,
  8033. model.layers[il].attn_norm,
  8034. model.layers[il].attn_norm_b,
  8035. LLM_NORM, il);
  8036. cb(cur, "attn_norm", il);
  8037. // self-attention
  8038. {
  8039. cur = build_lora_mm(model.layers[il].wqkv, cur);
  8040. cb(cur, "wqkv", il);
  8041. cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
  8042. cb(cur, "bqkv", il);
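// note: the fused wqkv output is [ Q (n_embd) | K (n_embd_gqa) | V (n_embd_gqa) ] per token;
// the 2d views below carve these out using byte offsets, hence the sizeof(float) factor
// (the activations in the graph are F32, so nb[0] == sizeof(float) here)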
  8043. ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
  8044. ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
  8045. ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
  8046. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  8047. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  8048. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  8049. Qcur = ggml_rope_ext(
  8050. ctx0, Qcur, inp_pos, nullptr,
  8051. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  8052. ext_factor, attn_factor, beta_fast, beta_slow
  8053. );
  8054. Kcur = ggml_rope_ext(
  8055. ctx0, Kcur, inp_pos, nullptr,
  8056. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  8057. ext_factor, attn_factor, beta_fast, beta_slow
  8058. );
  8059. cb(Qcur, "Qcur", il);
  8060. cb(Kcur, "Kcur", il);
  8061. cb(Vcur, "Vcur", il);
  8062. cur = build_attn(inp_attn, gf,
  8063. model.layers[il].wo, model.layers[il].bo,
  8064. Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  8065. }
  8066. if (il == n_layer - 1 && inp_out_ids) {
  8067. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  8068. inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
  8069. }
  8070. // ffn
  8071. if (hparams.use_par_res) {
  8072. // attention and ffn are computed in parallel
  8073. // x = x + attn(ln1(x)) + ffn(ln2(x))
  8074. ggml_tensor * attn_out = cur;
  8075. cur = build_norm(inpL,
  8076. model.layers[il].ffn_norm,
  8077. model.layers[il].ffn_norm_b,
  8078. LLM_NORM, il);
  8079. cb(cur, "ffn_norm", il);
  8080. cur = build_ffn(cur,
  8081. model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
  8082. NULL, NULL, NULL,
  8083. model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
  8084. NULL,
  8085. LLM_FFN_GELU, LLM_FFN_SEQ, il);
  8086. cb(cur, "ffn_out", il);
  8087. cur = ggml_add(ctx0, cur, inpL);
  8088. cb(cur, "ffn_out", il);
  8089. cur = ggml_add(ctx0, cur, attn_out);
  8090. cur = build_cvec(cur, il);
  8091. cb(cur, "l_out", il);
  8092. // input for next layer
  8093. inpL = cur;
  8094. } else {
  8095. // attention and ffn are computed sequentially
  8096. // x = x + attn(ln1(x))
  8097. // x = x + ffn(ln2(x))
  8098. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
  8099. cb(ffn_inp, "ffn_inp", il);
  8100. cur = build_norm(ffn_inp,
  8101. model.layers[il].ffn_norm,
  8102. model.layers[il].ffn_norm_b,
  8103. LLM_NORM, il);
  8104. cb(cur, "ffn_norm", il);
  8105. cur = build_ffn(cur,
  8106. model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
  8107. NULL, NULL, NULL,
  8108. model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
  8109. NULL,
  8110. LLM_FFN_GELU, LLM_FFN_SEQ, il);
  8111. cb(cur, "ffn_out", il);
  8112. cur = ggml_add(ctx0, cur, ffn_inp);
  8113. cur = build_cvec(cur, il);
  8114. cb(cur, "l_out", il);
  8115. // input for next layer
  8116. inpL = cur;
  8117. }
  8118. }
  8119. cur = build_norm(inpL,
  8120. model.output_norm,
  8121. model.output_norm_b,
  8122. LLM_NORM, -1);
  8123. cb(cur, "result_norm", -1);
  8124. res->t_embd = cur;
  8125. cur = build_lora_mm(model.output, cur);
  8126. cb(cur, "result_output", -1);
  8127. res->t_logits = cur;
  8128. ggml_build_forward_expand(gf, cur);
  8129. }
  8130. };
  8131. struct llm_build_arctic : public llm_graph_context {
  8132. llm_build_arctic(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
  8133. const int64_t n_embd_head = hparams.n_embd_head_v;
  8134. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  8135. GGML_ASSERT(n_embd_head == hparams.n_rot);
  8136. ggml_tensor * cur;
  8137. ggml_tensor * inpL;
  8138. inpL = build_inp_embd(model.tok_embd);
  8139. // inp_pos - contains the positions
  8140. ggml_tensor * inp_pos = build_inp_pos();
  8141. auto * inp_attn = build_attn_inp_kv_unified();
  8142. ggml_tensor * inp_out_ids = build_inp_out_ids();
  8143. for (int il = 0; il < n_layer; ++il) {
  8144. ggml_tensor * inpSA = inpL;
  8145. // norm
  8146. cur = build_norm(inpL,
  8147. model.layers[il].attn_norm, NULL,
  8148. LLM_NORM_RMS, il);
  8149. cb(cur, "attn_norm", il);
  8150. // self-attention
  8151. {
  8152. // compute Q and K and RoPE them
  8153. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  8154. cb(Qcur, "Qcur", il);
  8155. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  8156. cb(Kcur, "Kcur", il);
  8157. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  8158. cb(Vcur, "Vcur", il);
  8159. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  8160. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  8161. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  8162. Qcur = ggml_rope_ext(
  8163. ctx0, Qcur, inp_pos, nullptr,
  8164. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  8165. ext_factor, attn_factor, beta_fast, beta_slow
  8166. );
  8167. Kcur = ggml_rope_ext(
  8168. ctx0, Kcur, inp_pos, nullptr,
  8169. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  8170. ext_factor, attn_factor, beta_fast, beta_slow
  8171. );
  8172. cb(Qcur, "Qcur", il);
  8173. cb(Kcur, "Kcur", il);
  8174. cb(Vcur, "Vcur", il);
  8175. cur = build_attn(inp_attn, gf,
  8176. model.layers[il].wo, NULL,
  8177. Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  8178. }
  8179. if (il == n_layer - 1 && inp_out_ids) {
  8180. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  8181. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  8182. }
  8183. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  8184. cb(ffn_inp, "ffn_inp", il);
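// note: Arctic runs a dense FFN and a residual MoE side by side: the dense branch below acts on
// ffn_norm(ffn_inp) and is added back into ffn_out, while the MoE branch further down acts on a
// separate norm of the layer input (ffn_norm_exps over inpSA) and its output is added on top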
  8185. // feed-forward network
  8186. cur = build_norm(ffn_inp,
  8187. model.layers[il].ffn_norm, NULL,
  8188. LLM_NORM_RMS, il);
  8189. cb(cur, "ffn_norm", il);
  8190. cur = build_ffn(cur,
  8191. model.layers[il].ffn_up, NULL, NULL,
  8192. model.layers[il].ffn_gate, NULL, NULL,
  8193. model.layers[il].ffn_down, NULL, NULL,
  8194. NULL,
  8195. LLM_FFN_SILU, LLM_FFN_PAR, il);
  8196. cb(cur, "ffn_out", il);
  8197. ggml_tensor * ffn_out = ggml_add(ctx0, cur, ffn_inp);
  8198. cb(ffn_out, "ffn_out", il);
  8199. // MoE
  8200. cur = build_norm(inpSA,
  8201. model.layers[il].ffn_norm_exps, NULL,
  8202. LLM_NORM_RMS, il);
  8203. cb(cur, "ffn_norm_exps", il);
  8204. cur = build_moe_ffn(cur,
  8205. model.layers[il].ffn_gate_inp,
  8206. model.layers[il].ffn_up_exps,
  8207. model.layers[il].ffn_gate_exps,
  8208. model.layers[il].ffn_down_exps,
  8209. nullptr,
  8210. n_expert, n_expert_used,
  8211. LLM_FFN_SILU, true,
  8212. false, 0.0,
  8213. LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
  8214. il);
  8215. cb(cur, "ffn_moe_out", il);
  8216. cur = ggml_add(ctx0, cur, ffn_out);
  8217. cb(cur, "ffn_out", il);
  8218. cur = build_cvec(cur, il);
  8219. cb(cur, "l_out", il);
  8220. // input for next layer
  8221. inpL = cur;
  8222. }
  8223. cur = inpL;
  8224. cur = build_norm(cur,
  8225. model.output_norm, NULL,
  8226. LLM_NORM_RMS, -1);
  8227. cb(cur, "result_norm", -1);
  8228. res->t_embd = cur;
  8229. // lm_head
  8230. cur = build_lora_mm(model.output, cur);
  8231. cb(cur, "result_output", -1);
  8232. res->t_logits = cur;
  8233. ggml_build_forward_expand(gf, cur);
  8234. }
  8235. };
  8236. struct llm_build_deepseek : public llm_graph_context {
  8237. llm_build_deepseek(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
  8238. const int64_t n_embd_head = hparams.n_embd_head_v;
  8239. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  8240. GGML_ASSERT(n_embd_head == hparams.n_rot);
  8241. ggml_tensor * cur;
  8242. ggml_tensor * inpL;
  8243. inpL = build_inp_embd(model.tok_embd);
  8244. // inp_pos - contains the positions
  8245. ggml_tensor * inp_pos = build_inp_pos();
  8246. auto * inp_attn = build_attn_inp_kv_unified();
  8247. const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
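// note: some converted checkpoints ship an explicit attention scale; when f_attention_scale is 0
// the usual 1/sqrt(n_embd_head) is used instead (e.g. n_embd_head = 128 gives ~0.0884)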
  8248. ggml_tensor * inp_out_ids = build_inp_out_ids();
  8249. for (int il = 0; il < n_layer; ++il) {
  8250. ggml_tensor * inpSA = inpL;
  8251. // norm
  8252. cur = build_norm(inpL,
  8253. model.layers[il].attn_norm, NULL,
  8254. LLM_NORM_RMS, il);
  8255. cb(cur, "attn_norm", il);
  8256. // self-attention
  8257. {
  8258. // rope freq factors for llama3; may return nullptr for llama2 and other models
  8259. ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
  8260. // compute Q and K and RoPE them
  8261. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  8262. cb(Qcur, "Qcur", il);
  8263. if (model.layers[il].bq) {
  8264. Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
  8265. cb(Qcur, "Qcur", il);
  8266. }
  8267. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  8268. cb(Kcur, "Kcur", il);
  8269. if (model.layers[il].bk) {
  8270. Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
  8271. cb(Kcur, "Kcur", il);
  8272. }
  8273. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  8274. cb(Vcur, "Vcur", il);
  8275. if (model.layers[il].bv) {
  8276. Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
  8277. cb(Vcur, "Vcur", il);
  8278. }
  8279. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  8280. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  8281. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  8282. Qcur = ggml_rope_ext(
  8283. ctx0, Qcur, inp_pos, rope_factors,
  8284. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  8285. ext_factor, attn_factor, beta_fast, beta_slow
  8286. );
  8287. Kcur = ggml_rope_ext(
  8288. ctx0, Kcur, inp_pos, rope_factors,
  8289. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  8290. ext_factor, attn_factor, beta_fast, beta_slow
  8291. );
  8292. cb(Qcur, "Qcur", il);
  8293. cb(Kcur, "Kcur", il);
  8294. cb(Vcur, "Vcur", il);
  8295. cur = build_attn(inp_attn, gf,
  8296. model.layers[il].wo, model.layers[il].bo,
  8297. Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
  8298. }
  8299. if (il == n_layer - 1 && inp_out_ids) {
  8300. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  8301. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  8302. }
  8303. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  8304. cb(ffn_inp, "ffn_inp", il);
  8305. cur = build_norm(ffn_inp,
  8306. model.layers[il].ffn_norm, NULL,
  8307. LLM_NORM_RMS, il);
  8308. cb(cur, "ffn_norm", il);
  8309. if ((uint32_t) il < hparams.n_layer_dense_lead) {
  8310. cur = build_ffn(cur,
  8311. model.layers[il].ffn_up, NULL, NULL,
  8312. model.layers[il].ffn_gate, NULL, NULL,
  8313. model.layers[il].ffn_down, NULL, NULL,
  8314. NULL,
  8315. LLM_FFN_SILU, LLM_FFN_PAR, il);
  8316. cb(cur, "ffn_out", il);
  8317. } else {
  8318. // MoE branch
  8319. ggml_tensor * moe_out =
  8320. build_moe_ffn(cur,
  8321. model.layers[il].ffn_gate_inp,
  8322. model.layers[il].ffn_up_exps,
  8323. model.layers[il].ffn_gate_exps,
  8324. model.layers[il].ffn_down_exps,
  8325. nullptr,
  8326. n_expert, n_expert_used,
  8327. LLM_FFN_SILU, false,
  8328. false, hparams.expert_weights_scale,
  8329. LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
  8330. il);
  8331. cb(moe_out, "ffn_moe_out", il);
  8332. // FFN shared expert
  8333. {
  8334. ggml_tensor * ffn_shexp = build_ffn(cur,
  8335. model.layers[il].ffn_up_shexp, NULL, NULL,
  8336. model.layers[il].ffn_gate_shexp, NULL, NULL,
  8337. model.layers[il].ffn_down_shexp, NULL, NULL,
  8338. NULL,
  8339. LLM_FFN_SILU, LLM_FFN_PAR, il);
  8340. cb(ffn_shexp, "ffn_shexp", il);
  8341. cur = ggml_add(ctx0, moe_out, ffn_shexp);
  8342. cb(cur, "ffn_out", il);
  8343. }
  8344. }
  8345. cur = ggml_add(ctx0, cur, ffn_inp);
  8346. cur = build_cvec(cur, il);
  8347. cb(cur, "l_out", il);
  8348. // input for next layer
  8349. inpL = cur;
  8350. }
  8351. cur = inpL;
  8352. cur = build_norm(cur,
  8353. model.output_norm, NULL,
  8354. LLM_NORM_RMS, -1);
  8355. cb(cur, "result_norm", -1);
  8356. res->t_embd = cur;
  8357. // lm_head
  8358. cur = build_lora_mm(model.output, cur);
  8359. cb(cur, "result_output", -1);
  8360. res->t_logits = cur;
  8361. ggml_build_forward_expand(gf, cur);
  8362. }
  8363. };
  8364. struct llm_build_deepseek2 : public llm_graph_context {
  8365. llm_build_deepseek2(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
8366. const bool is_lite = (hparams.n_layer == 27);
  8367. const bool is_mla = (hparams.n_embd_head_k_mla != 0 && hparams.n_embd_head_v_mla != 0);
  8368. // note: these are the actual head sizes you get when treating as MHA or after "decompression" using wv_b for MLA
  8369. const int64_t n_embd_head_k = is_mla ? hparams.n_embd_head_k_mla : hparams.n_embd_head_k;
  8370. const int64_t n_embd_head_v = is_mla ? hparams.n_embd_head_v_mla : hparams.n_embd_head_v;
  8371. const int64_t n_embd_head_qk_rope = hparams.n_rot;
  8372. const int64_t n_embd_head_qk_nope = n_embd_head_k - n_embd_head_qk_rope;
  8373. const uint32_t kv_lora_rank = hparams.n_lora_kv;
  8374. // We have to pre-scale kq_scale and attn_factor to make the YaRN RoPE work correctly.
  8375. // See https://github.com/ggerganov/llama.cpp/discussions/7416 for detailed explanation.
  8376. const float mscale = attn_factor * (1.0f + hparams.rope_yarn_log_mul * logf(1.0f / freq_scale));
  8377. const float kq_scale = 1.0f*mscale*mscale/sqrtf(float(n_embd_head_k));
  8378. const float attn_factor = 1.0f / (1.0f + 0.1f * logf(1.0f / freq_scale));
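// illustrative numbers for the pre-scaling above (not part of the graph), assuming a hypothetical
// 4x YaRN extension (freq_scale = 0.25), rope_yarn_log_mul = 0.1 and the default attn_factor = 1.0:
//   mscale              = 1.0f * (1.0f + 0.1f * logf(4.0f))  ~= 1.1386
//   kq_scale            = mscale*mscale / sqrtf(192.0f)      ~= 0.0936  (taking n_embd_head_k = 192 as an example)
//   attn_factor (local) = 1.0f / (1.0f + 0.1f * logf(4.0f))  ~= 0.8782
// with the reduced local attn_factor the YaRN magnitude correction applied inside ggml_rope_ext
// comes out to ~1, so the full mscale^2 correction is applied exactly once, via kq_scale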
  8379. ggml_tensor * cur;
  8380. ggml_tensor * inpL;
  8381. // {n_embd, n_tokens}
  8382. inpL = build_inp_embd(model.tok_embd);
  8383. // inp_pos - contains the positions
  8384. ggml_tensor * inp_pos = build_inp_pos();
  8385. auto * inp_attn = build_attn_inp_kv_unified();
  8386. ggml_tensor * inp_out_ids = build_inp_out_ids();
  8387. for (int il = 0; il < n_layer; ++il) {
  8388. ggml_tensor * inpSA = inpL;
  8389. // norm
  8390. cur = build_norm(inpL,
  8391. model.layers[il].attn_norm, NULL,
  8392. LLM_NORM_RMS, il);
  8393. cb(cur, "attn_norm", il);
8394. // self-attention
  8395. {
  8396. ggml_tensor * q = NULL;
  8397. if (!is_lite) {
  8398. q = ggml_mul_mat(ctx0, model.layers[il].wq_a, cur);
  8399. cb(q, "q", il);
  8400. q = build_norm(q,
  8401. model.layers[il].attn_q_a_norm, nullptr,
  8402. LLM_NORM_RMS, il);
  8403. cb(q, "q", il);
  8404. q = ggml_mul_mat(ctx0, model.layers[il].wq_b, q);
  8405. cb(q, "q", il);
  8406. } else {
  8407. q = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
  8408. cb(q, "q", il);
  8409. }
  8410. // split into {n_embd_head_qk_nope, n_head, n_tokens}
  8411. ggml_tensor * q_nope = ggml_view_3d(ctx0, q,
  8412. n_embd_head_qk_nope, n_head, n_tokens,
  8413. ggml_row_size(q->type, n_embd_head_k),
  8414. ggml_row_size(q->type, n_embd_head_k) * n_head,
  8415. 0);
  8416. cb(q_nope, "q_nope", il);
  8417. // and {n_embd_head_qk_rope, n_head, n_tokens}
  8418. ggml_tensor * q_pe = ggml_view_3d(ctx0, q,
  8419. n_embd_head_qk_rope, n_head, n_tokens,
  8420. ggml_row_size(q->type, n_embd_head_k),
  8421. ggml_row_size(q->type, n_embd_head_k) * n_head,
  8422. ggml_row_size(q->type, n_embd_head_qk_nope));
  8423. cb(q_pe, "q_pe", il);
  8424. ggml_tensor * kv_cmpr_pe = ggml_mul_mat(ctx0, model.layers[il].wkv_a_mqa, cur);
  8425. cb(kv_cmpr_pe, "kv_cmpr_pe", il);
  8426. // split into {kv_lora_rank, n_tokens}
  8427. ggml_tensor * kv_cmpr = ggml_view_2d(ctx0, kv_cmpr_pe,
  8428. kv_lora_rank, n_tokens,
  8429. ggml_row_size(kv_cmpr_pe->type, kv_lora_rank + n_embd_head_qk_rope),
  8430. 0);
  8431. cb(kv_cmpr, "kv_cmpr", il);
  8432. // and {n_embd_head_qk_rope, 1, n_tokens}
  8433. ggml_tensor * k_pe = ggml_view_3d(ctx0, kv_cmpr_pe,
  8434. n_embd_head_qk_rope, 1, n_tokens,
  8435. ggml_row_size(kv_cmpr_pe->type, kv_lora_rank + n_embd_head_qk_rope),
  8436. ggml_row_size(kv_cmpr_pe->type, kv_lora_rank + n_embd_head_qk_rope),
  8437. ggml_row_size(kv_cmpr_pe->type, kv_lora_rank));
  8438. cb(k_pe, "k_pe", il);
  8439. q_pe = ggml_rope_ext(ctx0, q_pe, inp_pos, nullptr,
  8440. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  8441. ext_factor, attn_factor, beta_fast, beta_slow
  8442. );
  8443. cb(q_pe, "q_pe", il);
  8444. k_pe = ggml_rope_ext(ctx0, k_pe, inp_pos, nullptr,
  8445. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  8446. ext_factor, attn_factor, beta_fast, beta_slow
  8447. );
  8448. cb(k_pe, "k_pe", il);
  8449. kv_cmpr = build_norm(kv_cmpr,
  8450. model.layers[il].attn_kv_a_norm, nullptr,
  8451. LLM_NORM_RMS, il);
  8452. cb(kv_cmpr, "kv_cmpr", il);
  8453. if (is_mla) {
  8454. // {n_embd_head_qk_nope, n_tokens, n_head}
  8455. q_nope = ggml_permute(ctx0, q_nope, 0, 2, 1, 3);
  8456. cb(q_nope, "q_nope_perm", il);
  8457. // {n_embd_head_qk_nope, kv_lora_rank, n_head} x {n_embd_head_qk_nope, n_tokens, n_head}
  8458. ggml_tensor * q_nope_absorbed = ggml_mul_mat(ctx0, model.layers[il].wk_b, q_nope);
  8459. cb(q_nope_absorbed, "q_nope_absorbed", il);
  8460. // {kv_lora_rank, n_head, n_tokens}
  8461. q_nope_absorbed = ggml_permute(ctx0, q_nope_absorbed, 0, 2, 1, 3);
  8462. cb(q_nope_absorbed, "q_nope_absorbed_perm", il);
  8463. // {n_embd_head_qk_rope + kv_lora_rank, n_head, n_tokens}
  8464. // note: rope must go first for in-place context shifting in build_rope_shift()
  8465. ggml_tensor * Qcur = ggml_concat(ctx0, q_pe, q_nope_absorbed, 0);
  8466. cb(Qcur, "Qcur", il);
  8467. kv_cmpr = ggml_reshape_3d(ctx0, kv_cmpr, kv_lora_rank, 1, n_tokens);
  8468. cb(kv_cmpr, "kv_cmpr_reshape", il);
  8469. // {n_embd_head_qk_rope + kv_lora_rank, 1, n_tokens}
  8470. ggml_tensor * Kcur = ggml_concat(ctx0, k_pe, kv_cmpr, 0);
  8471. cb(Kcur, "Kcur", il);
  8472. // {kv_lora_rank, 1, n_tokens}
  8473. ggml_tensor * Vcur = kv_cmpr;
  8474. cb(Vcur, "Vcur", il);
8475. // note: MLA with the absorption optimization converts into MQA (ie: GQA with 1 group)
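// the K and V streams are the same per-token compressed latent kv_cmpr (rank kv_lora_rank) shared
// by every head, so the KV cache only holds a single head of size kv_lora_rank + n_embd_head_qk_rope
// for K and kv_lora_rank for V; wk_b folds the key up-projection into the query ("absorption"),
// and wv_b is handed to build_attn so the attention output can be expanded back to
// n_embd_head_v per head afterwards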
  8476. cur = build_attn(inp_attn, gf,
  8477. model.layers[il].wo, NULL,
  8478. Qcur, Kcur, Vcur, nullptr, model.layers[il].wv_b, kq_scale, il);
  8479. } else {
  8480. ggml_tensor * kv = ggml_mul_mat(ctx0, model.layers[il].wkv_b, kv_cmpr);
  8481. cb(kv, "kv", il);
  8482. // split into {n_embd_head_qk_nope, n_head, n_tokens}
  8483. ggml_tensor * k_nope = ggml_view_3d(ctx0, kv,
  8484. n_embd_head_qk_nope, n_head, n_tokens,
  8485. ggml_row_size(kv->type, n_embd_head_qk_nope + n_embd_head_v),
  8486. ggml_row_size(kv->type, n_embd_head_qk_nope + n_embd_head_v) * n_head,
  8487. 0);
  8488. cb(k_nope, "k_nope_view", il);
  8489. // and {n_embd_head_v, n_head, n_tokens}
  8490. ggml_tensor * Vcur = ggml_view_3d(ctx0, kv,
  8491. n_embd_head_v, n_head, n_tokens,
  8492. ggml_row_size(kv->type, n_embd_head_qk_nope + n_embd_head_v),
  8493. ggml_row_size(kv->type, n_embd_head_qk_nope + n_embd_head_v) * n_head,
  8494. ggml_row_size(kv->type, n_embd_head_qk_nope));
  8495. cb(Vcur, "Vcur_view", il);
  8496. Vcur = ggml_cont(ctx0, Vcur);
  8497. cb(Vcur, "Vcur_cont", il);
  8498. // note: rope must go first for in-place context shifting in build_rope_shift()
  8499. ggml_tensor * Qcur = ggml_concat(ctx0, q_pe, q_nope, 0);
  8500. cb(Qcur, "Qcur", il);
  8501. ggml_tensor * Kcur = ggml_concat(ctx0, ggml_repeat(ctx0, k_pe, q_pe), k_nope, 0);
  8502. cb(Kcur, "Kcur", il);
  8503. // note: MLA without the absorption optimization converts into MHA (ie: GQA with full n_head groups)
  8504. cur = build_attn(inp_attn, gf,
  8505. model.layers[il].wo, NULL,
  8506. Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
  8507. }
  8508. }
  8509. if (il == n_layer - 1 && inp_out_ids) {
  8510. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  8511. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  8512. }
  8513. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  8514. cb(ffn_inp, "ffn_inp", il);
  8515. cur = build_norm(ffn_inp,
  8516. model.layers[il].ffn_norm, NULL,
  8517. LLM_NORM_RMS, il);
  8518. cb(cur, "ffn_norm", il);
  8519. if ((uint32_t) il < hparams.n_layer_dense_lead) {
  8520. cur = build_ffn(cur,
  8521. model.layers[il].ffn_up, NULL, NULL,
  8522. model.layers[il].ffn_gate, NULL, NULL,
  8523. model.layers[il].ffn_down, NULL, NULL,
  8524. NULL,
  8525. LLM_FFN_SILU, LLM_FFN_PAR, il);
  8526. cb(cur, "ffn_out", il);
  8527. } else {
  8528. // MoE branch
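// note: compared to the dense-lead layers above, the router here can add a per-expert bias
// (ffn_exp_probs_b), optionally renormalizes the selected expert weights
// (hparams.expert_weights_norm) and always scales them by hparams.expert_weights_scale;
// the gating function (softmax or sigmoid) comes from the model hparams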
  8529. ggml_tensor * moe_out =
  8530. build_moe_ffn(cur,
  8531. model.layers[il].ffn_gate_inp,
  8532. model.layers[il].ffn_up_exps,
  8533. model.layers[il].ffn_gate_exps,
  8534. model.layers[il].ffn_down_exps,
  8535. model.layers[il].ffn_exp_probs_b,
  8536. n_expert, n_expert_used,
  8537. LLM_FFN_SILU, hparams.expert_weights_norm,
  8538. true, hparams.expert_weights_scale,
  8539. (llama_expert_gating_func_type) hparams.expert_gating_func,
  8540. il);
  8541. cb(moe_out, "ffn_moe_out", il);
  8542. // FFN shared expert
  8543. {
  8544. ggml_tensor * ffn_shexp = build_ffn(cur,
  8545. model.layers[il].ffn_up_shexp, NULL, NULL,
  8546. model.layers[il].ffn_gate_shexp, NULL, NULL,
  8547. model.layers[il].ffn_down_shexp, NULL, NULL,
  8548. NULL,
  8549. LLM_FFN_SILU, LLM_FFN_PAR, il);
  8550. cb(ffn_shexp, "ffn_shexp", il);
  8551. cur = ggml_add(ctx0, moe_out, ffn_shexp);
  8552. cb(cur, "ffn_out", il);
  8553. }
  8554. }
  8555. cur = ggml_add(ctx0, cur, ffn_inp);
  8556. cur = build_cvec(cur, il);
  8557. cb(cur, "l_out", il);
  8558. // input for next layer
  8559. inpL = cur;
  8560. }
  8561. cur = inpL;
  8562. cur = build_norm(cur,
  8563. model.output_norm, NULL,
  8564. LLM_NORM_RMS, -1);
  8565. cb(cur, "result_norm", -1);
  8566. res->t_embd = cur;
  8567. // lm_head
  8568. cur = ggml_mul_mat(ctx0, model.output, cur);
  8569. cb(cur, "result_output", -1);
  8570. res->t_logits = cur;
  8571. ggml_build_forward_expand(gf, cur);
  8572. }
  8573. };
  8574. struct llm_build_bitnet : public llm_graph_context {
  8575. llm_build_bitnet(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
  8576. const int64_t n_embd_head = hparams.n_embd_head_v;
  8577. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  8578. ggml_tensor * cur;
  8579. ggml_tensor * inpL;
  8580. inpL = build_inp_embd(model.tok_embd);
  8581. // inp_pos - contains the positions
  8582. ggml_tensor * inp_pos = build_inp_pos();
  8583. auto * inp_attn = build_attn_inp_kv_unified();
  8584. ggml_tensor * inp_out_ids = build_inp_out_ids();
  8585. for (int il = 0; il < n_layer; ++il) {
  8586. ggml_tensor * inpSA = inpL;
  8587. cur = build_norm(inpL,
  8588. model.layers[il].attn_norm, NULL,
  8589. LLM_NORM_RMS, il);
  8590. cb(cur, "attn_norm", il);
  8591. // self-attention
  8592. {
  8593. // compute Q and K and RoPE them
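// note: BitNet stores its (ternary / 1.58-bit) weights together with a separate per-tensor scale,
// so where present each projection below is followed by a ggml_mul with the matching *_scale tensor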
  8594. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  8595. if (model.layers[il].wq_scale) {
  8596. Qcur = ggml_mul(ctx0, Qcur, model.layers[il].wq_scale);
  8597. }
  8598. cb(Qcur, "Qcur", il);
  8599. if (model.layers[il].bq) {
  8600. Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
  8601. cb(Qcur, "Qcur", il);
  8602. }
  8603. // B1.K
  8604. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  8605. if (model.layers[il].wk_scale) {
  8606. Kcur = ggml_mul(ctx0, Kcur, model.layers[il].wk_scale);
  8607. }
  8608. cb(Kcur, "Kcur", il);
  8609. if (model.layers[il].bk) {
  8610. Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
  8611. cb(Kcur, "Kcur", il);
  8612. }
  8613. // B1.V
  8614. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  8615. if (model.layers[il].wv_scale) {
  8616. Vcur = ggml_mul(ctx0, Vcur, model.layers[il].wv_scale);
  8617. }
  8618. cb(Vcur, "Vcur", il);
  8619. if (model.layers[il].bv) {
  8620. Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
  8621. cb(Vcur, "Vcur", il);
  8622. }
  8623. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  8624. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  8625. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  8626. Qcur = ggml_rope_ext(
  8627. ctx0, Qcur, inp_pos, nullptr,
  8628. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  8629. ext_factor, attn_factor, beta_fast, beta_slow
  8630. );
  8631. Kcur = ggml_rope_ext(
  8632. ctx0, Kcur, inp_pos, nullptr,
  8633. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  8634. ext_factor, attn_factor, beta_fast, beta_slow
  8635. );
  8636. cb(Qcur, "Qcur", il);
  8637. cb(Kcur, "Kcur", il);
  8638. cb(Vcur, "Vcur", il);
  8639. cur = build_attn(inp_attn, gf,
  8640. NULL, NULL,
  8641. Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  8642. cur = build_norm(cur,
  8643. model.layers[il].attn_sub_norm, NULL,
  8644. LLM_NORM_RMS, il);
  8645. cb(cur, "attn_sub_norm", il);
  8646. cur = build_lora_mm(model.layers[il].wo, cur);
  8647. if (model.layers[il].wo_scale) {
  8648. cur = ggml_mul(ctx0, cur, model.layers[il].wo_scale);
  8649. }
  8650. if (model.layers[il].bo) {
  8651. cur = ggml_add(ctx0, cur, model.layers[il].bo);
  8652. }
  8653. cb(cur, "attn_o_out", il);
  8654. }
  8655. if (il == n_layer - 1 && inp_out_ids) {
  8656. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  8657. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  8658. }
  8659. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  8660. cb(ffn_inp, "ffn_inp", il);
8661. // feed-forward network
  8662. cur = build_norm(ffn_inp,
  8663. model.layers[il].ffn_norm, NULL,
  8664. LLM_NORM_RMS, il);
  8665. cb(cur, "ffn_norm", il);
  8666. cur = build_ffn(cur,
  8667. model.layers[il].ffn_up, NULL, model.layers[il].ffn_up_scale,
  8668. model.layers[il].ffn_gate, NULL, model.layers[il].ffn_gate_scale,
  8669. NULL, NULL, NULL,
  8670. NULL,
  8671. LLM_FFN_SILU, LLM_FFN_PAR, il);
  8672. cb(cur, "ffn_sub_out", il);
  8673. cur = build_norm(cur,
  8674. model.layers[il].ffn_sub_norm, NULL,
  8675. LLM_NORM_RMS, il);
  8676. cb(cur, "ffn_sub_norm", il);
  8677. cur = build_lora_mm(model.layers[il].ffn_down, cur);
  8678. if (model.layers[il].ffn_down_scale) {
  8679. cur = ggml_mul(ctx0, cur, model.layers[il].ffn_down_scale);
  8680. }
  8681. cb(cur, "ffn_down", il);
  8682. cur = ggml_add(ctx0, cur, ffn_inp);
  8683. cb(cur, "l_out", il);
  8684. // input for next layer
  8685. inpL = cur;
  8686. }
  8687. cur = inpL;
  8688. cur = build_norm(cur,
  8689. model.output_norm, NULL,
  8690. LLM_NORM_RMS, -1);
  8691. cb(cur, "result_norm", -1);
  8692. res->t_embd = cur;
  8693. // lm_head
  8694. // FIXME: do not use model.tok_embd directly, duplicate as model.output
  8695. cur = build_lora_mm(model.tok_embd, cur);
  8696. cb(cur, "result_output", -1);
  8697. res->t_logits = cur;
  8698. ggml_build_forward_expand(gf, cur);
  8699. }
  8700. };
  8701. struct llm_build_t5_enc : public llm_graph_context {
  8702. llm_build_t5_enc(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
  8703. const int64_t n_embd_head = hparams.n_embd_head_v;
  8704. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  8705. ggml_tensor * cur;
  8706. ggml_tensor * inpL;
  8707. inpL = build_inp_embd(model.tok_embd);
  8708. ggml_tensor * pos_bucket_enc = build_inp_pos_bucket_enc();
  8709. auto * inp_attn = build_attn_inp_no_cache();
  8710. ggml_tensor * inp_out_ids = build_inp_out_ids();
  8711. for (int il = 0; il < n_layer; ++il) {
  8712. ggml_tensor * inpSA = inpL;
  8713. // norm
  8714. cur = build_norm(inpL,
  8715. model.layers[il].attn_norm_enc, NULL,
  8716. LLM_NORM_RMS, il);
  8717. cb(cur, "attn_norm", il);
  8718. // self-attention
  8719. {
  8720. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq_enc, cur);
  8721. cb(Qcur, "Qcur", il);
  8722. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk_enc, cur);
  8723. cb(Kcur, "Kcur", il);
  8724. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv_enc, cur);
  8725. cb(Vcur, "Vcur", il);
  8726. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  8727. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  8728. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
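// note: T5 has no rotary/absolute position encoding; a learned relative-position bias (bucketed by
// distance) is added directly to the attention logits as kq_b, and the bias table is typically only
// stored on layer 0 and shared by the remaining layers, hence the fallback below
// (T5 also keeps the attention scale at 1.0f rather than 1/sqrt(d))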
  8729. ggml_tensor * attn_rel_b = model.layers[il].attn_rel_b_enc ? model.layers[il].attn_rel_b_enc : model.layers[0].attn_rel_b_enc;
  8730. ggml_tensor * kq_b = build_pos_bias(pos_bucket_enc, attn_rel_b);
  8731. cur = build_attn(inp_attn, gf,
  8732. model.layers[il].wo_enc, nullptr,
  8733. Qcur, Kcur, Vcur, kq_b, nullptr, 1.0f, il);
  8734. cb(cur, "kqv_out", il);
  8735. }
  8736. if (il == n_layer - 1 && inp_out_ids) {
  8737. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  8738. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  8739. }
  8740. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  8741. cb(ffn_inp, "ffn_inp", il);
  8742. // feed-forward network
  8743. {
  8744. cur = build_norm(ffn_inp,
  8745. model.layers[il].ffn_norm_enc, NULL,
  8746. LLM_NORM_RMS, il);
  8747. cb(cur, "ffn_norm", il);
  8748. // T5 uses relu, flan-T5 uses gelu-gated
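// i.e. if the checkpoint provides ffn_gate_enc this is a gated (GEGLU-style) MLP run in parallel,
// otherwise the classic T5 up -> relu -> down sequence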
  8749. cur = build_ffn(cur,
  8750. model.layers[il].ffn_up_enc, NULL, NULL,
  8751. model.layers[il].ffn_gate_enc, NULL, NULL,
  8752. model.layers[il].ffn_down_enc, NULL, NULL,
  8753. NULL,
  8754. model.layers[il].ffn_gate_enc ? LLM_FFN_GELU : LLM_FFN_RELU,
  8755. model.layers[il].ffn_gate_enc ? LLM_FFN_PAR : LLM_FFN_SEQ,
  8756. il);
  8757. cb(cur, "ffn_out", il);
  8758. }
  8759. cur = ggml_add(ctx0, cur, ffn_inp);
  8760. cb(cur, "ffn_out", il);
  8761. cur = build_cvec(cur, il);
  8762. cb(cur, "l_out", il);
  8763. // input for next layer
  8764. inpL = cur;
  8765. }
  8766. cur = inpL;
  8767. cb(cur, "result_embd", -1);
  8768. cur = build_norm(cur,
  8769. model.output_norm_enc, NULL,
  8770. LLM_NORM_RMS, -1);
  8771. cb(cur, "result_norm", -1);
  8772. res->t_embd = cur;
  8773. ggml_build_forward_expand(gf, cur);
  8774. }
  8775. };
  8776. struct llm_build_t5_dec : public llm_graph_context {
  8777. llm_build_t5_dec(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
  8778. const int64_t n_embd_head = hparams.n_embd_head_v;
  8779. //const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
  8780. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  8781. ggml_tensor * cur;
  8782. ggml_tensor * inpL;
  8783. inpL = build_inp_embd(model.tok_embd);
  8784. ggml_tensor * embd_enc = build_inp_cross_embd();
  8785. ggml_tensor * pos_bucket_dec = build_inp_pos_bucket_dec();
  8786. const int64_t n_outputs_enc = embd_enc->ne[1];
  8787. auto * inp_attn_self = build_attn_inp_kv_unified();
  8788. auto * inp_attn_cross = build_attn_inp_cross();
  8789. ggml_tensor * inp_out_ids = build_inp_out_ids();
  8790. for (int il = 0; il < n_layer; ++il) {
  8791. ggml_tensor * inpSA = inpL;
  8792. // norm
  8793. cur = build_norm(inpL,
  8794. model.layers[il].attn_norm, NULL,
  8795. LLM_NORM_RMS, il);
  8796. cb(cur, "attn_norm", il);
  8797. // self-attention
  8798. {
  8799. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  8800. cb(Qcur, "Qcur", il);
  8801. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  8802. cb(Kcur, "Kcur", il);
  8803. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  8804. cb(Vcur, "Vcur", il);
  8805. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  8806. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  8807. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  8808. ggml_tensor * attn_rel_b = model.layers[il].attn_rel_b ? model.layers[il].attn_rel_b : model.layers[0].attn_rel_b;
  8809. ggml_tensor * kq_b = build_pos_bias(pos_bucket_dec, attn_rel_b);
  8810. cur = build_attn(inp_attn_self, gf,
  8811. model.layers[il].wo, model.layers[il].bo,
  8812. Qcur, Kcur, Vcur, kq_b, nullptr, 1.0f, il);
  8813. cb(cur, "kqv_out", il);
  8814. }
  8815. cur = ggml_add(ctx0, cur, inpSA);
  8816. cb(cur, "cross_inp", il);
  8817. ggml_tensor * inpCA = cur;
  8818. // norm
  8819. cur = build_norm(cur,
  8820. model.layers[il].attn_norm_cross, NULL,
  8821. LLM_NORM_RMS, il);
  8822. cb(cur, "attn_norm_cross", il);
  8823. // cross-attention
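// queries come from the decoder stream, while keys/values are projected from the encoder output
// embd_enc (n_outputs_enc positions); no relative-position bias is added here and the scale stays 1.0f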
  8824. {
  8825. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq_cross, cur);
  8826. cb(Qcur, "Qcur", il);
  8827. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk_cross, embd_enc);
  8828. cb(Kcur, "Kcur", il);
  8829. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv_cross, embd_enc);
  8830. cb(Vcur, "Vcur", il);
  8831. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  8832. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_outputs_enc);
  8833. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_outputs_enc);
  8834. cur = build_attn(inp_attn_cross, gf,
  8835. model.layers[il].wo_cross, nullptr,
  8836. Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
  8837. cb(cur, "kqv_out", il);
  8838. //ggml_tensor * q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
  8839. //ggml_tensor * k = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 0, 2, 1, 3));
  8840. //ggml_tensor * kq = ggml_mul_mat(ctx0, k, q);
  8841. //cb(kq, "kq", il);
  8842. //kq = ggml_soft_max_ext(ctx0, kq, KQ_mask_cross, 1.0f, hparams.f_max_alibi_bias);
  8843. //cb(kq, "kq_soft_max_ext", il);
  8844. //ggml_tensor * v = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd_gqa, n_outputs_enc)));
  8845. //cb(v, "v", il);
  8846. //ggml_tensor * kqv = ggml_mul_mat(ctx0, ggml_reshape_3d(ctx0, v, n_outputs_enc, n_embd_head, n_head_kv), kq);
  8847. //cb(kqv, "kqv", il);
  8848. //ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3);
  8849. //cb(kqv_merged, "kqv_merged", il);
  8850. //cur = ggml_cont_2d(ctx0, kqv_merged, n_embd_gqa, n_tokens);
  8851. //cb(cur, "kqv_merged_cont", il);
  8852. //ggml_build_forward_expand(gf, cur);
  8853. //cur = build_lora_mm(model.layers[il].wo_cross, cur);
  8854. //cb(cur, "kqv_out", il);
  8855. }
  8856. if (il == n_layer - 1 && inp_out_ids) {
  8857. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  8858. inpCA = ggml_get_rows(ctx0, inpCA, inp_out_ids);
  8859. }
  8860. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpCA);
  8861. cb(ffn_inp, "ffn_inp", il);
  8862. // feed-forward network
  8863. {
  8864. cur = build_norm(ffn_inp,
  8865. model.layers[il].ffn_norm, NULL,
  8866. LLM_NORM_RMS, il);
  8867. cb(cur, "ffn_norm", il);
  8868. // T5 uses relu, flan-T5 uses gelu-gated
  8869. cur = build_ffn(cur,
  8870. model.layers[il].ffn_up, NULL, NULL,
  8871. model.layers[il].ffn_gate, NULL, NULL,
  8872. model.layers[il].ffn_down, NULL, NULL,
  8873. NULL,
8874. model.layers[il].ffn_gate ? LLM_FFN_GELU : LLM_FFN_RELU,
8875. model.layers[il].ffn_gate ? LLM_FFN_PAR : LLM_FFN_SEQ,
  8876. il);
  8877. cb(cur, "ffn_out", il);
  8878. }
  8879. cur = ggml_add(ctx0, cur, ffn_inp);
  8880. cb(cur, "ffn_out", il);
  8881. cur = build_cvec(cur, il);
  8882. cb(cur, "l_out", il);
  8883. // input for next layer
  8884. inpL = cur;
  8885. }
  8886. cur = inpL;
  8887. cb(cur, "result_embd", -1);
  8888. cur = build_norm(cur,
  8889. model.output_norm, NULL,
  8890. LLM_NORM_RMS, -1);
  8891. cb(cur, "result_norm", -1);
  8892. res->t_embd = cur;
  8893. // lm_head
  8894. cur = build_lora_mm(model.output, cur);
  8895. cb(cur, "result_output", -1);
  8896. res->t_logits = cur;
  8897. ggml_build_forward_expand(gf, cur);
  8898. }
  8899. };
  8900. struct llm_build_jais : public llm_graph_context {
  8901. llm_build_jais(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
  8902. const int64_t n_embd_head = hparams.n_embd_head_v;
  8903. const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
  8904. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  8905. ggml_tensor * cur;
  8906. ggml_tensor * inpL;
  8907. inpL = build_inp_embd(model.tok_embd);
  8908. auto * inp_attn = build_attn_inp_kv_unified();
  8909. ggml_tensor * inp_out_ids = build_inp_out_ids();
  8910. for (int il = 0; il < n_layer; ++il) {
  8911. cur = build_norm(inpL,
  8912. model.layers[il].attn_norm,
  8913. model.layers[il].attn_norm_b,
  8914. LLM_NORM, il);
  8915. cb(cur, "attn_norm", il);
  8916. // self-attention
  8917. {
  8918. cur = build_lora_mm(model.layers[il].wqkv, cur);
  8919. cb(cur, "wqkv", il);
  8920. cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
  8921. cb(cur, "bqkv", il);
  8922. ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*cur->nb[0]*(n_embd)));
  8923. ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*cur->nb[0]*(n_embd)));
  8924. ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*cur->nb[0]*(n_embd + n_embd_gqa)));
  8925. cb(Qcur, "Qcur", il);
  8926. cb(Kcur, "Kcur", il);
  8927. cb(Vcur, "Vcur", il);
  8928. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  8929. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  8930. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  8931. cur = build_attn(inp_attn, gf,
  8932. model.layers[il].wo, model.layers[il].bo,
  8933. Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/float(n_embd_head), il);
  8934. }
  8935. if (il == n_layer - 1 && inp_out_ids) {
  8936. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  8937. inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
  8938. }
  8939. // add the input
  8940. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
  8941. cb(ffn_inp, "ffn_inp", il);
  8942. // FF
  8943. {
  8944. cur = build_norm(ffn_inp,
  8945. model.layers[il].ffn_norm,
  8946. model.layers[il].ffn_norm_b,
  8947. LLM_NORM, il);
  8948. cb(cur, "ffn_norm", il);
  8949. cur = build_ffn(cur,
  8950. model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
  8951. model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
  8952. model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
  8953. NULL,
  8954. LLM_FFN_SILU, LLM_FFN_PAR, il);
  8955. cb(cur, "ffn_out", il);
  8956. }
  8957. inpL = ggml_add(ctx0, cur, ffn_inp);
  8958. cb(inpL, "l_out", il);
  8959. }
  8960. cur = build_norm(inpL,
  8961. model.output_norm,
  8962. model.output_norm_b,
  8963. LLM_NORM, -1);
  8964. cb(cur, "result_norm", -1);
  8965. res->t_embd = cur;
  8966. cur = build_lora_mm(model.output, cur);
  8967. cb(cur, "result_output", -1);
  8968. res->t_logits = cur;
  8969. ggml_build_forward_expand(gf, cur);
  8970. }
  8971. };
  8972. struct llm_build_chatglm : public llm_graph_context {
  8973. llm_build_chatglm(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
  8974. const int64_t n_embd_head = hparams.n_embd_head_v;
  8975. const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
  8976. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  8977. ggml_tensor * cur;
  8978. ggml_tensor * inpL;
  8979. inpL = build_inp_embd(model.tok_embd);
  8980. // inp_pos - contains the positions
  8981. ggml_tensor * inp_pos = build_inp_pos();
  8982. auto * inp_attn = build_attn_inp_kv_unified();
  8983. ggml_tensor * inp_out_ids = build_inp_out_ids();
  8984. for (int il = 0; il < n_layer; ++il) {
  8985. ggml_tensor * inpSA = inpL;
  8986. cur = build_norm(inpL,
  8987. model.layers[il].attn_norm,
  8988. NULL,
  8989. LLM_NORM_RMS, il);
  8990. cb(cur, "attn_norm", il);
  8991. // self-attention
  8992. {
  8993. ggml_tensor * Qcur = nullptr;
  8994. ggml_tensor * Kcur = nullptr;
  8995. ggml_tensor * Vcur = nullptr;
  8996. if (model.layers[il].wqkv == nullptr) {
  8997. Qcur = build_lora_mm(model.layers[il].wq, cur);
  8998. if (model.layers[il].bq) {
  8999. Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
  9000. }
  9001. Kcur = build_lora_mm(model.layers[il].wk, cur);
  9002. if (model.layers[il].bk) {
  9003. Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
  9004. }
  9005. Vcur = build_lora_mm(model.layers[il].wv, cur);
  9006. if (model.layers[il].bv) {
  9007. Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
  9008. }
  9009. } else {
  9010. cur = build_lora_mm(model.layers[il].wqkv, cur);
  9011. cb(cur, "wqkv", il);
  9012. if (model.layers[il].bqkv) {
  9013. cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
  9014. cb(cur, "bqkv", il);
  9015. }
  9016. Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
  9017. Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
  9018. Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
  9019. }
  9020. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  9021. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  9022. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  9023. //printf("freq_base: %f freq_scale: %f ext_factor: %f attn_factor: %f\n", freq_base, freq_scale, ext_factor, attn_factor);
  9024. Qcur = ggml_rope_ext(
  9025. ctx0, Qcur, inp_pos, nullptr,
  9026. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  9027. ext_factor, attn_factor, beta_fast, beta_slow
  9028. );
  9029. Kcur = ggml_rope_ext(
  9030. ctx0, Kcur, inp_pos, nullptr,
  9031. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  9032. ext_factor, attn_factor, beta_fast, beta_slow
  9033. );
  9034. cb(Qcur, "Qcur", il);
  9035. cb(Kcur, "Kcur", il);
  9036. cb(Vcur, "Vcur", il);
  9037. cur = build_attn(inp_attn, gf,
  9038. model.layers[il].wo, NULL,
  9039. Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  9040. }
  9041. if (il == n_layer - 1 && inp_out_ids) {
  9042. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  9043. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  9044. }
  9045. // Add the input
  9046. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  9047. cb(ffn_inp, "ffn_inp", il);
  9048. // FF
  9049. {
  9050. cur = build_norm(ffn_inp,
  9051. model.layers[il].ffn_norm,
  9052. NULL,
  9053. LLM_NORM_RMS, il);
  9054. cb(cur, "ffn_norm", il);
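// note: LLM_FFN_SWIGLU expects ffn_up to hold both halves of the gated MLP in one fused tensor
// (the layout used by ChatGLM checkpoints); build_ffn splits its output in two and computes
// silu(gate_half) * up_half, so there is no separate ffn_gate weight for this arch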
  9055. cur = build_ffn(cur,
  9056. model.layers[il].ffn_up, NULL, NULL,
  9057. NULL, NULL, NULL,
  9058. model.layers[il].ffn_down, NULL, NULL,
  9059. NULL,
  9060. LLM_FFN_SWIGLU, LLM_FFN_SEQ, il);
  9061. cb(cur, "ffn_out", il);
  9062. }
  9063. inpL = ggml_add(ctx0, cur, ffn_inp);
  9064. cb(inpL, "l_out", il);
  9065. }
  9066. cur = build_norm(inpL,
  9067. model.output_norm,
  9068. NULL,
  9069. LLM_NORM_RMS, -1);
  9070. cb(cur, "result_norm", -1);
  9071. res->t_embd = cur;
  9072. cur = build_lora_mm(model.output, cur);
  9073. cb(cur, "result_output", -1);
  9074. res->t_logits = cur;
  9075. ggml_build_forward_expand(gf, cur);
  9076. }
  9077. };
  9078. struct llm_build_glm4 : public llm_graph_context {
  9079. llm_build_glm4(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
  9080. const int64_t n_embd_head = hparams.n_embd_head_v;
  9081. const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
  9082. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  9083. ggml_tensor * cur;
  9084. ggml_tensor * inpL;
  9085. inpL = build_inp_embd(model.tok_embd);
  9086. // inp_pos - contains the positions
  9087. ggml_tensor * inp_pos = build_inp_pos();
  9088. auto * inp_attn = build_attn_inp_kv_unified();
  9089. ggml_tensor * inp_out_ids = build_inp_out_ids();
  9090. for (int il = 0; il < n_layer; ++il) {
  9091. ggml_tensor * inpSA = inpL;
  9092. // Pre-attention norm
  9093. cur = build_norm(inpL,
  9094. model.layers[il].attn_norm,
  9095. NULL,
  9096. LLM_NORM_RMS, il);
  9097. cb(cur, "attn_norm", il);
  9098. // self-attention
  9099. {
  9100. ggml_tensor * Qcur = nullptr;
  9101. ggml_tensor * Kcur = nullptr;
  9102. ggml_tensor * Vcur = nullptr;
  9103. if (model.layers[il].wqkv == nullptr) {
  9104. Qcur = build_lora_mm(model.layers[il].wq, cur);
  9105. if (model.layers[il].bq) {
  9106. Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
  9107. }
  9108. Kcur = build_lora_mm(model.layers[il].wk, cur);
  9109. if (model.layers[il].bk) {
  9110. Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
  9111. }
  9112. Vcur = build_lora_mm(model.layers[il].wv, cur);
  9113. if (model.layers[il].bv) {
  9114. Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
  9115. }
  9116. } else {
  9117. cur = build_lora_mm(model.layers[il].wqkv, cur);
  9118. cb(cur, "wqkv", il);
  9119. if (model.layers[il].bqkv) {
  9120. cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
  9121. cb(cur, "bqkv", il);
  9122. }
  9123. Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
  9124. Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
  9125. Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
  9126. }
  9127. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  9128. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  9129. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  9130. Qcur = ggml_rope_ext(
  9131. ctx0, Qcur, inp_pos, nullptr,
  9132. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  9133. ext_factor, attn_factor, beta_fast, beta_slow
  9134. );
  9135. Kcur = ggml_rope_ext(
  9136. ctx0, Kcur, inp_pos, nullptr,
  9137. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  9138. ext_factor, attn_factor, beta_fast, beta_slow
  9139. );
  9140. cb(Qcur, "Qcur", il);
  9141. cb(Kcur, "Kcur", il);
  9142. cb(Vcur, "Vcur", il);
  9143. cur = build_attn(inp_attn, gf,
  9144. model.layers[il].wo, NULL,
  9145. Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  9146. }
  9147. if (il == n_layer - 1 && inp_out_ids) {
  9148. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  9149. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  9150. }
  9151. // Post-attention norm (new!)
  9152. cur = build_norm(cur,
  9153. model.layers[il].attn_post_norm,
  9154. NULL,
  9155. LLM_NORM_RMS, il);
  9156. cb(cur, "post_attn_norm", il);
  9157. // Add the input (residual connection after post-attention norm)
  9158. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  9159. cb(ffn_inp, "ffn_inp", il);
  9160. // FF
  9161. {
  9162. // Pre-MLP norm
  9163. cur = build_norm(ffn_inp,
  9164. model.layers[il].ffn_norm,
  9165. NULL,
  9166. LLM_NORM_RMS, il);
  9167. cb(cur, "ffn_norm", il);
  9168. // MLP
  9169. cur = build_ffn(cur,
  9170. model.layers[il].ffn_up, NULL, NULL,
  9171. NULL, NULL, NULL,
  9172. model.layers[il].ffn_down, NULL, NULL,
  9173. NULL,
  9174. LLM_FFN_SWIGLU, LLM_FFN_SEQ, il);
  9175. cb(cur, "ffn_out", il);
  9176. // Post-MLP norm
  9177. cur = build_norm(cur,
  9178. model.layers[il].ffn_post_norm,
  9179. NULL,
  9180. LLM_NORM_RMS, il);
  9181. cb(cur, "post_mlp_norm", il);
  9182. }
  9183. // Add residual connection after post-MLP norm
  9184. inpL = ggml_add(ctx0, cur, ffn_inp);
  9185. cb(inpL, "l_out", il);
  9186. }
  9187. // Final norm
  9188. cur = build_norm(inpL,
  9189. model.output_norm,
  9190. NULL,
  9191. LLM_NORM_RMS, -1);
  9192. cb(cur, "result_norm", -1);
  9193. res->t_embd = cur;
  9194. // Output projection
  9195. cur = build_lora_mm(model.output, cur);
  9196. cb(cur, "result_output", -1);
  9197. res->t_logits = cur;
  9198. ggml_build_forward_expand(gf, cur);
  9199. }
  9200. };
  9201. struct llm_build_nemotron : public llm_graph_context {
  9202. llm_build_nemotron(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
  9203. const int64_t n_embd_head = hparams.n_embd_head_v;
  9204. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  9205. //GGML_ASSERT(n_embd_head == hparams.n_rot);
  9206. ggml_tensor * cur;
  9207. ggml_tensor * inpL;
  9208. inpL = build_inp_embd(model.tok_embd);
  9209. // inp_pos - contains the positions
  9210. ggml_tensor * inp_pos = build_inp_pos();
  9211. auto * inp_attn = build_attn_inp_kv_unified();
  9212. ggml_tensor * inp_out_ids = build_inp_out_ids();
  9213. for (int il = 0; il < n_layer; ++il) {
  9214. ggml_tensor * inpSA = inpL;
  9215. // norm
  9216. cur = build_norm(inpL,
  9217. model.layers[il].attn_norm,
  9218. model.layers[il].attn_norm_b,
  9219. LLM_NORM, il);
  9220. cb(cur, "attn_norm", il);
  9221. // self-attention
  9222. {
  9223. // compute Q and K and RoPE them
  9224. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  9225. cb(Qcur, "Qcur", il);
  9226. if (model.layers[il].bq) {
  9227. Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
  9228. cb(Qcur, "Qcur", il);
  9229. }
  9230. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  9231. cb(Kcur, "Kcur", il);
  9232. if (model.layers[il].bk) {
  9233. Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
  9234. cb(Kcur, "Kcur", il);
  9235. }
  9236. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  9237. cb(Vcur, "Vcur", il);
  9238. if (model.layers[il].bv) {
  9239. Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
  9240. cb(Vcur, "Vcur", il);
  9241. }
  9242. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  9243. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  9244. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  9245. Qcur = ggml_rope_ext(
  9246. ctx0, Qcur, inp_pos, nullptr,
  9247. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  9248. ext_factor, attn_factor, beta_fast, beta_slow
  9249. );
  9250. Kcur = ggml_rope_ext(
  9251. ctx0, Kcur, inp_pos, nullptr,
  9252. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  9253. ext_factor, attn_factor, beta_fast, beta_slow
  9254. );
  9255. cb(Qcur, "Qcur", il);
  9256. cb(Kcur, "Kcur", il);
  9257. cb(Vcur, "Vcur", il);
  9258. cur = build_attn(inp_attn, gf,
  9259. model.layers[il].wo, model.layers[il].bo,
  9260. Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  9261. }
  9262. if (il == n_layer - 1 && inp_out_ids) {
  9263. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  9264. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  9265. }
  9266. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  9267. cb(ffn_inp, "ffn_inp", il);
  9268. // feed-forward network
  9269. cur = build_norm(ffn_inp,
  9270. model.layers[il].ffn_norm,
  9271. model.layers[il].ffn_norm_b,
  9272. LLM_NORM, il);
  9273. cb(cur, "ffn_norm", il);
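// note: Nemotron uses a squared-ReLU MLP (relu(x)^2, no gate) and, throughout the model,
// LayerNorm with bias (LLM_NORM) instead of RMSNorm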
  9274. cur = build_ffn(cur,
  9275. model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
  9276. NULL, NULL, NULL,
  9277. model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
  9278. NULL,
  9279. LLM_FFN_RELU_SQR, LLM_FFN_SEQ, il);
  9280. cur = ggml_add(ctx0, cur, ffn_inp);
  9281. cb(cur, "ffn_out", il);
  9282. cur = build_cvec(cur, il);
  9283. cb(cur, "l_out", il);
  9284. // input for next layer
  9285. inpL = cur;
  9286. }
  9287. cur = inpL;
  9288. cur = build_norm(cur,
  9289. model.output_norm, model.output_norm_b,
  9290. LLM_NORM, -1);
  9291. cb(cur, "result_norm", -1);
  9292. res->t_embd = cur;
  9293. // lm_head
  9294. cur = build_lora_mm(model.output, cur);
  9295. cb(cur, "result_output", -1);
  9296. res->t_logits = cur;
  9297. ggml_build_forward_expand(gf, cur);
  9298. }
  9299. };
  9300. struct llm_build_exaone : public llm_graph_context {
  9301. llm_build_exaone(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
  9302. const int64_t n_embd_head = hparams.n_embd_head_v;
  9303. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  9304. GGML_ASSERT(n_embd_head == hparams.n_rot);
  9305. ggml_tensor * cur;
  9306. ggml_tensor * inpL;
  9307. inpL = build_inp_embd(model.tok_embd);
  9308. // inp_pos - contains the positions
  9309. ggml_tensor * inp_pos = build_inp_pos();
  9310. auto * inp_attn = build_attn_inp_kv_unified();
  9311. ggml_tensor * inp_out_ids = build_inp_out_ids();
  9312. for (int il = 0; il < n_layer; ++il) {
  9313. ggml_tensor * inpSA = inpL;
  9314. // norm
  9315. cur = build_norm(inpL,
  9316. model.layers[il].attn_norm, NULL,
  9317. LLM_NORM_RMS, il);
  9318. cb(cur, "attn_norm", il);
  9319. // self-attention
  9320. {
  9321. // rope freq factors for llama3; may return nullptr for llama2 and other models
  9322. ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
  9323. // compute Q and K and RoPE them
  9324. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  9325. cb(Qcur, "Qcur", il);
  9326. if (model.layers[il].bq) {
  9327. Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
  9328. cb(Qcur, "Qcur", il);
  9329. }
  9330. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  9331. cb(Kcur, "Kcur", il);
  9332. if (model.layers[il].bk) {
  9333. Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
  9334. cb(Kcur, "Kcur", il);
  9335. }
  9336. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  9337. cb(Vcur, "Vcur", il);
  9338. if (model.layers[il].bv) {
  9339. Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
  9340. cb(Vcur, "Vcur", il);
  9341. }
  9342. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  9343. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  9344. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  9345. Qcur = ggml_rope_ext(
  9346. ctx0, Qcur, inp_pos, rope_factors,
  9347. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  9348. ext_factor, attn_factor, beta_fast, beta_slow
  9349. );
  9350. Kcur = ggml_rope_ext(
  9351. ctx0, Kcur, inp_pos, rope_factors,
  9352. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  9353. ext_factor, attn_factor, beta_fast, beta_slow
  9354. );
  9355. cb(Qcur, "Qcur", il);
  9356. cb(Kcur, "Kcur", il);
  9357. cb(Vcur, "Vcur", il);
  9358. cur = build_attn(inp_attn, gf,
  9359. model.layers[il].wo, model.layers[il].bo,
  9360. Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  9361. }
  9362. if (il == n_layer - 1 && inp_out_ids) {
  9363. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  9364. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  9365. }
  9366. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  9367. cb(ffn_inp, "ffn_inp", il);
  9368. // feed-forward network
  9369. cur = build_norm(ffn_inp,
  9370. model.layers[il].ffn_norm, NULL,
  9371. LLM_NORM_RMS, il);
  9372. cb(cur, "ffn_norm", il);
  9373. cur = build_ffn(cur,
  9374. model.layers[il].ffn_up, NULL, NULL,
  9375. model.layers[il].ffn_gate, NULL, NULL,
  9376. model.layers[il].ffn_down, NULL, NULL,
  9377. NULL,
  9378. LLM_FFN_SILU, LLM_FFN_PAR, il);
  9379. cb(cur, "ffn_out", il);
  9380. cur = ggml_add(ctx0, cur, ffn_inp);
  9381. cb(cur, "ffn_out", il);
  9382. cur = build_cvec(cur, il);
  9383. cb(cur, "l_out", il);
  9384. // input for next layer
  9385. inpL = cur;
  9386. }
  9387. cur = inpL;
  9388. cur = build_norm(cur,
  9389. model.output_norm, NULL,
  9390. LLM_NORM_RMS, -1);
  9391. cb(cur, "result_norm", -1);
  9392. res->t_embd = cur;
  9393. // lm_head
  9394. cur = build_lora_mm(model.output, cur);
  9395. cb(cur, "result_output", -1);
  9396. res->t_logits = cur;
  9397. ggml_build_forward_expand(gf, cur);
  9398. }
  9399. };
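// Shared helpers for the RWKV v6 family (RWKV6 and the QRWKV6-style
// hybrid below): build_rwkv6_channel_mix() implements the squared-ReLU
// channel mix and build_rwkv6_time_mix() implements the WKV6 recurrence
// over the state held in the recurrent memory context.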
  9400. struct llm_build_rwkv6_base : public llm_graph_context {
  9401. const llama_model & model;
  9402. llm_build_rwkv6_base(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params), model(model) {
  9403. }
  9404. ggml_tensor * build_rwkv6_channel_mix(
  9405. const llama_layer * layer,
  9406. ggml_tensor * cur,
  9407. ggml_tensor * x_prev,
  9408. llm_arch arch) const {
  9409. ggml_tensor * sx = ggml_sub(ctx0, x_prev, cur);
  9410. switch (arch) {
  9411. case LLM_ARCH_RWKV6:
  9412. {
  9413. ggml_tensor * xk = ggml_add(ctx0, ggml_mul(ctx0, sx, layer->channel_mix_lerp_k), cur);
  9414. ggml_tensor * xr = ggml_add(ctx0, ggml_mul(ctx0, sx, layer->channel_mix_lerp_r), cur);
  9415. ggml_tensor * r = ggml_sigmoid(ctx0, build_lora_mm(layer->channel_mix_receptance, xr));
  9416. ggml_tensor * k = ggml_sqr(
  9417. ctx0,
  9418. ggml_relu(
  9419. ctx0,
  9420. build_lora_mm(layer->channel_mix_key, xk)
  9421. )
  9422. );
  9423. cur = ggml_mul(ctx0, r, build_lora_mm(layer->channel_mix_value, k));
  9424. } break;
  9425. default:
  9426. GGML_ABORT("fatal error");
  9427. }
  9428. return cur;
  9429. }
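// RWKV6 time mix: a low-rank token-shift interpolation (time_mix_w1/w2)
// produces the w/k/v/r/g inputs, the WKV6 recurrence (or gated linear
// attention for QRWKV models, where time_mix_first is absent) updates the
// per-sequence wkv state, and the output is group-normalized and gated.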
  9430. ggml_tensor * build_rwkv6_time_mix(
  9431. llm_graph_input_rs * inp,
  9432. ggml_cgraph * gf,
  9433. ggml_tensor * cur,
  9434. ggml_tensor * x_prev,
  9435. const llama_ubatch & ubatch,
  9436. int il) const {
  9437. const auto * mctx_cur = static_cast<const llama_memory_recurrent_context *>(mctx);
  9438. const auto n_tokens = ubatch.n_tokens;
  9439. const auto n_seqs = ubatch.n_seqs;
  9440. const auto n_seq_tokens = ubatch.n_seq_tokens;
  9441. const auto n_embd = hparams.n_embd;
  9442. const auto head_size = hparams.wkv_head_size;
  9443. const auto n_head = n_embd / head_size;
  9444. const auto n_head_kv = hparams.n_head_kv(il);
  9445. const auto kv_head = mctx_cur->get_head();
  9446. const auto & layer = model.layers[il];
  9447. bool is_qrwkv = layer.time_mix_first == nullptr;
  9448. ggml_tensor * sx = ggml_sub(ctx0, x_prev, cur);
  9449. sx = ggml_reshape_2d(ctx0, sx, n_embd, n_tokens);
  9450. cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens);
  9451. ggml_tensor * xxx = ggml_add(ctx0, ggml_mul(ctx0, sx, layer.time_mix_lerp_x), cur);
  9452. xxx = ggml_reshape_4d(
  9453. ctx0,
  9454. ggml_tanh(
  9455. ctx0,
  9456. ggml_mul_mat(ctx0, layer.time_mix_w1, xxx)
  9457. ),
  9458. layer.time_mix_w1->ne[1] / 5, 1, 5, n_tokens
  9459. );
  9460. xxx = ggml_cont(ctx0, ggml_permute(ctx0, xxx, 0, 1, 3, 2));
  9461. xxx = ggml_mul_mat(
  9462. ctx0,
  9463. ggml_reshape_4d(
  9464. ctx0,
  9465. layer.time_mix_w2,
  9466. layer.time_mix_w2->ne[0], layer.time_mix_w2->ne[1], 1, 5
  9467. ),
  9468. xxx
  9469. );
  9470. ggml_tensor *xw, *xk, *xv, *xr, *xg;
  9471. if (layer.time_mix_lerp_fused) {
// fused lerp weights give a small performance improvement over the unfused path below
  9473. sx = ggml_reshape_3d(ctx0, sx, n_embd, 1, n_tokens);
  9474. cur = ggml_reshape_3d(ctx0, cur, n_embd, 1, n_tokens);
  9475. xxx = ggml_add(ctx0, ggml_mul(ctx0, ggml_add(ctx0, xxx, layer.time_mix_lerp_fused), sx), cur);
  9476. xw = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], 0);
  9477. xk = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * sizeof(float));
  9478. xv = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 2 * sizeof(float));
  9479. xr = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 3 * sizeof(float));
  9480. xg = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 4 * sizeof(float));
  9481. } else {
// unfused per-tensor lerp weights, kept for backward compatibility with models converted without time_mix_lerp_fused
  9483. xw = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], 0);
  9484. xk = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * sizeof(float));
  9485. xv = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 2 * sizeof(float));
  9486. xr = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 3 * sizeof(float));
  9487. xg = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 4 * sizeof(float));
  9488. xw = ggml_add(ctx0, ggml_mul(ctx0, ggml_add(ctx0, xw, layer.time_mix_lerp_w), sx), cur);
  9489. xk = ggml_add(ctx0, ggml_mul(ctx0, ggml_add(ctx0, xk, layer.time_mix_lerp_k), sx), cur);
  9490. xv = ggml_add(ctx0, ggml_mul(ctx0, ggml_add(ctx0, xv, layer.time_mix_lerp_v), sx), cur);
  9491. xr = ggml_add(ctx0, ggml_mul(ctx0, ggml_add(ctx0, xr, layer.time_mix_lerp_r), sx), cur);
  9492. xg = ggml_add(ctx0, ggml_mul(ctx0, ggml_add(ctx0, xg, layer.time_mix_lerp_g), sx), cur);
  9493. }
  9494. ggml_tensor * r = build_lora_mm(layer.time_mix_receptance, xr);
  9495. ggml_tensor * k = build_lora_mm(layer.time_mix_key, xk);
  9496. ggml_tensor * v = build_lora_mm(layer.time_mix_value, xv);
  9497. if (layer.time_mix_receptance_b) {
  9498. r = ggml_add(ctx0, r, layer.time_mix_receptance_b);
  9499. }
  9500. if (layer.time_mix_key_b) {
  9501. k = ggml_add(ctx0, k, layer.time_mix_key_b);
  9502. }
  9503. if (layer.time_mix_value_b) {
  9504. v = ggml_add(ctx0, v, layer.time_mix_value_b);
  9505. }
  9506. ggml_tensor * g = build_lora_mm(layer.time_mix_gate, xg);
  9507. if (is_qrwkv) {
  9508. g = ggml_sigmoid(ctx0, g);
  9509. } else {
  9510. g = ggml_silu(ctx0, g);
  9511. }
  9512. if (n_head_kv != 0 && n_head_kv != n_head) {
  9513. GGML_ASSERT(n_head % n_head_kv == 0);
  9514. k = ggml_reshape_4d(ctx0, k, head_size, 1, n_head_kv, n_tokens);
  9515. v = ggml_reshape_4d(ctx0, v, head_size, 1, n_head_kv, n_tokens);
  9516. ggml_tensor * tmp = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, head_size, n_head / n_head_kv, n_head_kv, n_tokens);
  9517. k = ggml_repeat(ctx0, k, tmp);
  9518. v = ggml_repeat(ctx0, v, tmp);
  9519. }
  9520. k = ggml_reshape_3d(ctx0, k, head_size, n_head, n_tokens);
  9521. v = ggml_reshape_3d(ctx0, v, head_size, n_head, n_tokens);
  9522. r = ggml_reshape_3d(ctx0, r, head_size, n_head, n_tokens);
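// time decay: w = exp(-exp(time_mix_decay + tanh(xw @ w1) @ w2)),
// which keeps the per-channel decay strictly inside (0, 1)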
  9523. ggml_tensor * w = ggml_mul_mat(
  9524. ctx0,
  9525. layer.time_mix_decay_w2,
  9526. ggml_tanh(
  9527. ctx0,
  9528. ggml_mul_mat(ctx0, layer.time_mix_decay_w1, xw)
  9529. )
  9530. );
  9531. w = ggml_add(ctx0, w, layer.time_mix_decay);
  9532. w = ggml_exp(ctx0, ggml_neg(ctx0, ggml_exp(ctx0, w)));
  9533. w = ggml_reshape_3d(ctx0, w, head_size, n_head, n_tokens);
  9534. if (is_qrwkv) {
  9535. // k = k * (1 - w)
  9536. k = ggml_sub(ctx0, k, ggml_mul(ctx0, k, w));
  9537. }
  9538. ggml_tensor * wkv_state = build_rs(
  9539. inp, gf, mctx_cur->get_s_l(il),
  9540. hparams.n_embd_s(), n_seqs);
  9541. ggml_tensor * wkv_output;
  9542. if (is_qrwkv) {
  9543. wkv_output = ggml_gated_linear_attn(ctx0, k, v, r, w, wkv_state, pow(head_size, -0.5f));
  9544. } else {
  9545. wkv_output = ggml_rwkv_wkv6(ctx0, k, v, r, layer.time_mix_first, w, wkv_state);
  9546. }
  9547. cur = ggml_view_1d(ctx0, wkv_output, n_embd * n_tokens, 0);
  9548. wkv_state = ggml_view_1d(ctx0, wkv_output, n_embd * head_size * n_seqs, n_embd * n_tokens * sizeof(float));
  9549. ggml_build_forward_expand(
  9550. gf,
  9551. ggml_cpy(
  9552. ctx0,
  9553. wkv_state,
  9554. ggml_view_1d(
  9555. ctx0,
  9556. mctx_cur->get_s_l(il),
  9557. hparams.n_embd_s() * n_seqs,
  9558. hparams.n_embd_s() * kv_head * ggml_element_size(mctx_cur->get_s_l(il))
  9559. )
  9560. )
  9561. );
  9562. if (!is_qrwkv) {
  9563. // group norm with head_count groups
  9564. cur = ggml_reshape_3d(ctx0, cur, n_embd / n_head, n_head, n_tokens);
  9565. cur = ggml_norm(ctx0, cur, 64e-5f);
  9566. // Convert back to regular vectors.
  9567. cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens);
  9568. cur = ggml_add(ctx0, ggml_mul(ctx0, cur, layer.time_mix_ln), layer.time_mix_ln_b);
  9569. } else {
  9570. cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens);
  9571. }
  9572. cur = ggml_mul(ctx0, cur, g);
  9573. cur = build_lora_mm(layer.time_mix_output, cur);
  9574. return ggml_reshape_3d(ctx0, cur, n_embd, n_seq_tokens, n_seqs);
  9575. }
  9576. };
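// Full RWKV6 graph: each layer loads its two-slot token-shift state,
// runs time mix (attention-like) and channel mix (FFN-like) with residual
// additions, stores the updated token shift, and optionally rescales
// activations every rescale_every_n_layers layers.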
  9577. struct llm_build_rwkv6 : public llm_build_rwkv6_base {
  9578. llm_build_rwkv6(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_build_rwkv6_base(model, params) {
  9579. GGML_ASSERT(hparams.token_shift_count == 2);
  9580. ggml_tensor * cur;
  9581. ggml_tensor * inpL;
  9582. inpL = build_inp_embd(model.tok_embd);
  9583. inpL = build_norm(inpL, model.tok_norm, model.tok_norm_b, LLM_NORM, -1);
  9584. auto * rs_inp = build_rs_inp();
  9585. const auto n_embd = hparams.n_embd;
  9586. const auto n_seq_tokens = ubatch.n_seq_tokens;
  9587. const auto n_seqs = ubatch.n_seqs;
  9588. ggml_tensor * inp_out_ids = build_inp_out_ids();
  9589. for (int il = 0; il < n_layer; ++il) {
  9590. const llama_layer * layer = &model.layers[il];
  9591. inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs);
  9592. ggml_tensor * token_shift = build_rwkv_token_shift_load(rs_inp, gf, ubatch, il);
  9593. ggml_tensor * att_shift = ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], 0);
  9594. ggml_tensor * ffn_shift = ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], n_embd * ggml_element_size(token_shift));
  9595. ggml_tensor * att_norm = build_norm(inpL, layer->attn_norm, layer->attn_norm_b, LLM_NORM, il);
  9596. cb(att_norm, "attn_norm", il);
  9597. ggml_tensor * x_prev = ggml_concat(
  9598. ctx0,
  9599. att_shift,
  9600. ggml_view_3d(ctx0, att_norm, n_embd, n_seq_tokens - 1, n_seqs, att_norm->nb[1], att_norm->nb[2], 0),
  9601. 1
  9602. );
  9603. cur = build_rwkv6_time_mix(rs_inp, gf, att_norm, x_prev, ubatch, il);
  9604. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
  9605. cb(ffn_inp, "ffn_inp", il);
  9606. ggml_tensor * ffn_norm = build_norm(ffn_inp, layer->attn_norm_2, layer->attn_norm_2_b, LLM_NORM, il);
  9607. cb(ffn_norm, "ffn_norm", il);
  9608. x_prev = ggml_concat(
  9609. ctx0,
  9610. ffn_shift,
  9611. ggml_view_3d(ctx0, ffn_norm, n_embd, n_seq_tokens - 1, n_seqs, ffn_norm->nb[1], ffn_norm->nb[2], 0),
  9612. 1
  9613. );
  9614. token_shift = ggml_concat(ctx0,
  9615. ggml_view_3d(ctx0, att_norm, n_embd, 1, n_seqs, att_norm->nb[1], att_norm->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(att_norm)),
  9616. ggml_view_3d(ctx0, ffn_norm, n_embd, 1, n_seqs, ffn_norm->nb[1], ffn_norm->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(ffn_norm)),
  9617. 1
  9618. );
  9619. ggml_build_forward_expand(gf, build_rwkv_token_shift_store(token_shift, ubatch, il));
  9620. ffn_inp = ggml_reshape_2d(ctx0, ffn_inp, n_embd, n_tokens);
  9621. ffn_norm = ggml_reshape_2d(ctx0, ffn_norm, n_embd, n_tokens);
  9622. x_prev = ggml_reshape_2d(ctx0, x_prev, n_embd, n_tokens);
  9623. cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens);
  9624. if (il == n_layer - 1 && inp_out_ids) {
  9625. ffn_inp = ggml_get_rows(ctx0, ffn_inp, inp_out_ids);
  9626. ffn_norm = ggml_get_rows(ctx0, ffn_norm, inp_out_ids);
  9627. x_prev = ggml_get_rows(ctx0, x_prev, inp_out_ids);
  9628. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  9629. }
  9630. cur = build_rwkv6_channel_mix(layer, ffn_norm, x_prev, LLM_ARCH_RWKV6);
  9631. cur = ggml_add(ctx0, cur, ffn_inp);
  9632. if (hparams.rescale_every_n_layers != 0 && (il + 1) % hparams.rescale_every_n_layers == 0) {
  9633. cur = ggml_scale(ctx0, cur, 0.5F);
  9634. }
  9635. cur = build_cvec(cur, il);
  9636. cb(cur, "l_out", il);
  9637. // input for next layer
  9638. inpL = cur;
  9639. }
  9640. cur = inpL;
  9641. cur = build_norm(cur, model.output_norm, model.output_norm_b, LLM_NORM, -1);
  9642. cb(cur, "result_norm", -1);
  9643. res->t_embd = cur;
  9644. cur = build_lora_mm(model.output, cur);
  9645. cb(cur, "result_output", -1);
  9646. res->t_logits = cur;
  9647. ggml_build_forward_expand(gf, cur);
  9648. }
  9649. };
  9650. // ref: https://huggingface.co/recursal/QRWKV6-32B-Instruct-Preview-v0.1/blob/main/modeling_rwkv6qwen2.py
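// QRWKV6 hybrid: Qwen2-style blocks whose attention is replaced by the
// RWKV6 time mix (single token-shift slot, no channel mix) while the
// SwiGLU feed-forward network is kept.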
  9651. struct llm_build_rwkv6qwen2 : public llm_build_rwkv6_base {
  9652. llm_build_rwkv6qwen2(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_build_rwkv6_base(model, params) {
  9653. GGML_ASSERT(n_embd == hparams.n_embd_r());
  9654. ggml_tensor * cur;
  9655. ggml_tensor * inpL;
  9656. inpL = build_inp_embd(model.tok_embd);
  9657. auto * rs_inp = build_rs_inp();
  9658. const auto n_embd = hparams.n_embd;
  9659. const auto n_seq_tokens = ubatch.n_seq_tokens;
  9660. const auto n_seqs = ubatch.n_seqs;
  9661. ggml_tensor * inp_out_ids = build_inp_out_ids();
  9662. for (int il = 0; il < n_layer; ++il) {
  9663. const llama_layer * layer = &model.layers[il];
  9664. inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs);
  9665. ggml_tensor * token_shift = build_rwkv_token_shift_load(rs_inp, gf, ubatch, il);
  9666. ggml_tensor * att_norm = build_norm(inpL, layer->attn_norm, layer->attn_norm_b, LLM_NORM_RMS, il);
  9667. cb(att_norm, "attn_norm", il);
  9668. ggml_tensor * x_prev = ggml_concat(
  9669. ctx0,
  9670. token_shift,
  9671. ggml_view_3d(ctx0, att_norm, n_embd, n_seq_tokens - 1, n_seqs, att_norm->nb[1], att_norm->nb[2], 0),
  9672. 1
  9673. );
  9674. cur = build_rwkv6_time_mix(rs_inp, gf, att_norm, x_prev, ubatch, il);
  9675. token_shift = ggml_view_3d(ctx0, att_norm, n_embd, 1, n_seqs, att_norm->nb[1], att_norm->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(att_norm));
  9676. ggml_build_forward_expand(gf, build_rwkv_token_shift_store(token_shift, ubatch, il));
  9677. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
  9678. cb(ffn_inp, "ffn_inp", il);
  9679. cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens);
  9680. ffn_inp = ggml_reshape_2d(ctx0, ffn_inp, n_embd, n_tokens);
  9681. if (il == n_layer - 1 && inp_out_ids) {
  9682. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  9683. ffn_inp = ggml_get_rows(ctx0, ffn_inp, inp_out_ids);
  9684. }
  9685. // feed-forward network
  9686. cur = build_norm(ffn_inp,
  9687. model.layers[il].ffn_norm, NULL,
  9688. LLM_NORM_RMS, il);
  9689. cb(cur, "ffn_norm", il);
  9690. cur = build_ffn(cur,
  9691. model.layers[il].ffn_up, NULL, NULL,
  9692. model.layers[il].ffn_gate, NULL, NULL,
  9693. model.layers[il].ffn_down, NULL, NULL,
  9694. NULL,
  9695. LLM_FFN_SILU, LLM_FFN_PAR, il);
  9696. cb(cur, "ffn_out", il);
  9697. cur = ggml_add(ctx0, cur, ffn_inp);
  9698. cur = build_cvec(cur, il);
  9699. cb(cur, "l_out", il);
  9700. // input for next layer
  9701. inpL = cur;
  9702. }
  9703. cur = inpL;
  9704. cur = build_norm(cur, model.output_norm, model.output_norm_b, LLM_NORM_RMS, -1);
  9705. cb(cur, "result_norm", -1);
  9706. res->t_embd = cur;
  9707. cur = build_lora_mm(model.output, cur);
  9708. cb(cur, "result_output", -1);
  9709. res->t_logits = cur;
  9710. ggml_build_forward_expand(gf, cur);
  9711. }
  9712. };
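// Shared helpers for the RWKV v7 family: a channel mix without the
// receptance gate of v6, and the WKV7 time mix below.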
  9713. struct llm_build_rwkv7_base : public llm_graph_context {
  9714. const llama_model & model;
  9715. llm_build_rwkv7_base(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params), model(model) {
  9716. }
  9717. ggml_tensor * build_rwkv7_channel_mix(
  9718. const llama_layer * layer,
  9719. ggml_tensor * cur,
  9720. ggml_tensor * x_prev,
  9721. llm_arch arch) const {
  9722. ggml_tensor * sx = ggml_sub(ctx0, x_prev, cur);
  9723. switch (arch) {
  9724. case LLM_ARCH_RWKV7:
  9725. {
  9726. ggml_tensor * xk = ggml_add(ctx0, ggml_mul(ctx0, sx, layer->channel_mix_lerp_k), cur);
  9727. ggml_tensor * k = ggml_sqr(
  9728. ctx0,
  9729. ggml_relu(
  9730. ctx0,
  9731. build_lora_mm(layer->channel_mix_key, xk)
  9732. )
  9733. );
  9734. cur = build_lora_mm(layer->channel_mix_value, k);
  9735. } break;
  9736. default:
  9737. GGML_ABORT("fatal error");
  9738. }
  9739. return cur;
  9740. }
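// RWKV7 time mix: a fused token-shift lerp yields the r/w/k/v/a(/g)
// inputs, the first layer's value is mixed back in as a residual, and
// ggml_rwkv_wkv7 runs the recurrence using the L2-normalized key
// direction kk and the sigmoid gate a.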
  9741. ggml_tensor * build_rwkv7_time_mix(
  9742. llm_graph_input_rs * inp,
  9743. ggml_cgraph * gf,
  9744. ggml_tensor * cur,
  9745. ggml_tensor * x_prev,
  9746. ggml_tensor *& first_layer_value,
  9747. const llama_ubatch & ubatch,
  9748. int il) const {
  9749. const auto * mctx_cur = static_cast<const llama_memory_recurrent_context *>(mctx);
  9750. const auto n_tokens = ubatch.n_tokens;
  9751. const auto n_seqs = ubatch.n_seqs;
  9752. const auto n_embd = hparams.n_embd;
  9753. const auto head_size = hparams.wkv_head_size;
  9754. const auto head_count = n_embd / head_size;
  9755. const auto n_seq_tokens = ubatch.n_seq_tokens;
  9756. const auto kv_head = mctx_cur->get_head();
  9757. const auto & layer = model.layers[il];
  9758. bool has_gating = layer.time_mix_g1 && layer.time_mix_g2;
  9759. ggml_tensor * sx = ggml_sub(ctx0, x_prev, cur);
  9760. ggml_tensor * dummy = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_embd, n_seq_tokens, n_seqs, has_gating ? 6 : 5);
  9761. sx = ggml_repeat(ctx0, sx, dummy);
  9762. ggml_tensor * xxx = ggml_add(ctx0, ggml_mul(ctx0, sx, layer.time_mix_lerp_fused), cur);
  9763. ggml_tensor * xr = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], 0);
  9764. ggml_tensor * xw = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * sizeof(float));
  9765. ggml_tensor * xk = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 2 * sizeof(float));
  9766. ggml_tensor * xv = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 3 * sizeof(float));
  9767. ggml_tensor * xa = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 4 * sizeof(float));
  9768. ggml_tensor * xg = has_gating ? ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 5 * sizeof(float)) : nullptr;
  9769. ggml_tensor * r = build_lora_mm(layer.time_mix_receptance, xr);
  9770. ggml_tensor * w = ggml_add(
  9771. ctx0,
  9772. ggml_mul_mat(ctx0, layer.time_mix_w2, ggml_tanh(ctx0, ggml_mul_mat(ctx0, layer.time_mix_w1, xw))),
  9773. layer.time_mix_w0
  9774. );
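// decay: w = exp(-0.606531 * sigmoid(w)), with 0.606531 ~= exp(-0.5),
// bounding the per-channel decay to roughly (0.545, 1)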
  9775. w = ggml_exp(ctx0, ggml_scale(ctx0, ggml_sigmoid(ctx0, w), -0.606531));
  9776. ggml_tensor * k = build_lora_mm(layer.time_mix_key, xk);
  9777. ggml_tensor * v = build_lora_mm(layer.time_mix_value, xv);
  9778. if (first_layer_value == nullptr) {
  9779. first_layer_value = v;
  9780. } else {
  9781. // Add the first layer value as a residual connection.
  9782. v = ggml_add(ctx0, v,
  9783. ggml_mul(ctx0,
  9784. ggml_sub(ctx0, first_layer_value, v),
  9785. ggml_sigmoid(ctx0, ggml_add(ctx0,
  9786. ggml_mul_mat(ctx0, layer.time_mix_v2, ggml_mul_mat(ctx0, layer.time_mix_v1, xv)),
  9787. layer.time_mix_v0
  9788. )
  9789. )
  9790. )
  9791. );
  9792. }
  9793. ggml_tensor * g = nullptr;
  9794. if (layer.time_mix_g1 && layer.time_mix_g2) {
  9795. g = ggml_mul_mat(ctx0, layer.time_mix_g2, ggml_sigmoid(ctx0, ggml_mul_mat(ctx0, layer.time_mix_g1, xg)));
  9796. }
  9797. ggml_tensor * a = ggml_sigmoid(ctx0,
  9798. ggml_add(
  9799. ctx0,
  9800. ggml_mul_mat(ctx0, layer.time_mix_a2, ggml_mul_mat(ctx0, layer.time_mix_a1, xa)),
  9801. layer.time_mix_a0
  9802. )
  9803. );
  9804. ggml_tensor * kk = ggml_reshape_3d(ctx0, ggml_mul(ctx0, k, layer.time_mix_k_k), head_size, head_count, n_tokens);
  9805. kk = ggml_l2_norm(ctx0, kk, 1e-12);
  9806. ggml_tensor * ka = ggml_mul(ctx0, k, layer.time_mix_k_a);
  9807. k = ggml_add(ctx0, k, ggml_sub(ctx0, ggml_mul(ctx0, a, ka), ka));
  9808. r = ggml_reshape_3d(ctx0, r, head_size, head_count, n_tokens);
  9809. w = ggml_reshape_3d(ctx0, w, head_size, head_count, n_tokens);
  9810. k = ggml_reshape_3d(ctx0, k, head_size, head_count, n_tokens);
  9811. v = ggml_reshape_3d(ctx0, v, head_size, head_count, n_tokens);
  9812. a = ggml_reshape_3d(ctx0, a, head_size, head_count, n_tokens);
  9813. ggml_tensor * wkv_state = build_rs(
  9814. inp, gf, mctx_cur->get_s_l(il),
  9815. hparams.n_embd_s(), n_seqs);
  9816. ggml_tensor * wkv_output = ggml_rwkv_wkv7(ctx0, r, w, k, v, ggml_neg(ctx0, kk), ggml_mul(ctx0, kk, a), wkv_state);
  9817. cur = ggml_view_1d(ctx0, wkv_output, n_embd * n_tokens, 0);
  9818. wkv_state = ggml_view_1d(ctx0, wkv_output, n_embd * head_size * n_seqs, n_embd * n_tokens * sizeof(float));
  9819. ggml_build_forward_expand(
  9820. gf,
  9821. ggml_cpy(
  9822. ctx0,
  9823. wkv_state,
  9824. ggml_view_1d(
  9825. ctx0,
  9826. mctx_cur->get_s_l(il),
  9827. hparams.n_embd_s() * n_seqs,
  9828. hparams.n_embd_s() * kv_head * ggml_element_size(mctx_cur->get_s_l(il))
  9829. )
  9830. )
  9831. );
  9832. if (layer.time_mix_ln && layer.time_mix_ln_b) {
  9833. // group norm with head_count groups
  9834. cur = ggml_reshape_3d(ctx0, cur, n_embd / head_count, head_count, n_tokens);
  9835. cur = ggml_norm(ctx0, cur, 64e-5f);
  9836. // Convert back to regular vectors.
  9837. cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens);
  9838. cur = ggml_add(ctx0, ggml_mul(ctx0, cur, layer.time_mix_ln), layer.time_mix_ln_b);
  9839. } else {
  9840. cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens);
  9841. }
  9842. ggml_tensor * rk = ggml_sum_rows(ctx0,
  9843. ggml_mul(ctx0, ggml_mul(ctx0, k, r), ggml_reshape_2d(ctx0, layer.time_mix_r_k, head_size, head_count)));
  9844. cur = ggml_add(ctx0, cur, ggml_reshape_2d(ctx0, ggml_mul(ctx0, v, rk), n_embd, n_tokens));
  9845. if (has_gating) {
  9846. cur = ggml_mul(ctx0, cur, g);
  9847. }
  9848. cur = build_lora_mm(layer.time_mix_output, cur);
  9849. return ggml_reshape_3d(ctx0, cur, n_embd, n_seq_tokens, n_seqs);
  9850. }
  9851. };
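// Full RWKV7 graph: same layer layout as RWKV6 (two-slot token-shift
// load/store, time mix + channel mix with residuals), with the first
// layer's value tensor threaded through as v_first.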
  9852. struct llm_build_rwkv7 : public llm_build_rwkv7_base {
  9853. llm_build_rwkv7(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_build_rwkv7_base(model, params) {
  9854. GGML_ASSERT(hparams.token_shift_count == 2);
  9855. ggml_tensor * cur;
  9856. ggml_tensor * inpL;
  9857. ggml_tensor * v_first = nullptr;
  9858. inpL = build_inp_embd(model.tok_embd);
  9859. inpL = build_norm(inpL, model.tok_norm, model.tok_norm_b, LLM_NORM, -1);
  9860. auto * rs_inp = build_rs_inp();
  9861. const auto n_embd = hparams.n_embd;
  9862. const auto n_seq_tokens = ubatch.n_seq_tokens;
  9863. const auto n_seqs = ubatch.n_seqs;
  9864. ggml_tensor * inp_out_ids = build_inp_out_ids();
  9865. for (int il = 0; il < n_layer; ++il) {
  9866. const llama_layer * layer = &model.layers[il];
  9867. inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs);
  9868. ggml_tensor * token_shift = build_rwkv_token_shift_load(rs_inp, gf, ubatch, il);
  9869. ggml_tensor * att_shift = ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], 0);
  9870. ggml_tensor * ffn_shift = ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], n_embd * ggml_element_size(token_shift));
  9871. ggml_tensor * att_norm = build_norm(inpL, layer->attn_norm, layer->attn_norm_b, LLM_NORM, il);
  9872. cb(att_norm, "attn_norm", il);
  9873. ggml_tensor * x_prev = ggml_concat(
  9874. ctx0,
  9875. att_shift,
  9876. ggml_view_3d(ctx0, att_norm, n_embd, n_seq_tokens - 1, n_seqs, att_norm->nb[1], att_norm->nb[2], 0),
  9877. 1
  9878. );
  9879. cur = build_rwkv7_time_mix(rs_inp, gf, att_norm, x_prev, v_first, ubatch, il);
  9880. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
  9881. cb(ffn_inp, "ffn_inp", il);
  9882. ggml_tensor * ffn_norm = build_norm(ffn_inp, layer->attn_norm_2, layer->attn_norm_2_b, LLM_NORM, il);
  9883. cb(ffn_norm, "ffn_norm", il);
  9884. x_prev = ggml_concat(
  9885. ctx0,
  9886. ffn_shift,
  9887. ggml_view_3d(ctx0, ffn_norm, n_embd, n_seq_tokens - 1, n_seqs, ffn_norm->nb[1], ffn_norm->nb[2], 0),
  9888. 1
  9889. );
  9890. token_shift = ggml_concat(ctx0,
  9891. ggml_view_3d(ctx0, att_norm, n_embd, 1, n_seqs, att_norm->nb[1], att_norm->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(att_norm)),
  9892. ggml_view_3d(ctx0, ffn_norm, n_embd, 1, n_seqs, ffn_norm->nb[1], ffn_norm->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(ffn_norm)),
  9893. 1
  9894. );
  9895. ggml_build_forward_expand(gf, build_rwkv_token_shift_store(token_shift, ubatch, il));
  9896. ffn_inp = ggml_reshape_2d(ctx0, ffn_inp, n_embd, n_tokens);
  9897. ffn_norm = ggml_reshape_2d(ctx0, ffn_norm, n_embd, n_tokens);
  9898. x_prev = ggml_reshape_2d(ctx0, x_prev, n_embd, n_tokens);
  9899. if (il == n_layer - 1 && inp_out_ids) {
  9900. ffn_inp = ggml_get_rows(ctx0, ffn_inp, inp_out_ids);
  9901. ffn_norm = ggml_get_rows(ctx0, ffn_norm, inp_out_ids);
  9902. x_prev = ggml_get_rows(ctx0, x_prev, inp_out_ids);
  9903. }
  9904. cur = build_rwkv7_channel_mix(layer, ffn_norm, x_prev, LLM_ARCH_RWKV7);
  9905. cur = ggml_add(ctx0, cur, ffn_inp);
  9906. cur = build_cvec(cur, il);
  9907. cb(cur, "l_out", il);
  9908. // input for next layer
  9909. inpL = cur;
  9910. }
  9911. cur = inpL;
  9912. cur = build_norm(cur, model.output_norm, model.output_norm_b, LLM_NORM, -1);
  9913. cb(cur, "result_norm", -1);
  9914. res->t_embd = cur;
  9915. cur = build_lora_mm(model.output, cur);
  9916. cb(cur, "result_output", -1);
  9917. res->t_logits = cur;
  9918. ggml_build_forward_expand(gf, cur);
  9919. }
  9920. };
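// ARWKV7: the RWKV7 time mix grafted onto a transformer-style block
// (RMSNorm, a single token-shift slot, and a SwiGLU feed-forward in
// place of the RWKV channel mix), analogous to the rwkv6qwen2 hybrid.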
  9921. struct llm_build_arwkv7 : public llm_build_rwkv7_base {
  9922. llm_build_arwkv7(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_build_rwkv7_base(model, params) {
  9923. GGML_ASSERT(n_embd == hparams.n_embd_r());
  9924. ggml_tensor * cur;
  9925. ggml_tensor * inpL;
  9926. ggml_tensor * v_first = nullptr;
  9927. inpL = build_inp_embd(model.tok_embd);
  9928. auto * rs_inp = build_rs_inp();
  9929. const auto n_embd = hparams.n_embd;
  9930. const auto n_seq_tokens = ubatch.n_seq_tokens;
  9931. const auto n_seqs = ubatch.n_seqs;
  9932. ggml_tensor * inp_out_ids = build_inp_out_ids();
  9933. for (int il = 0; il < n_layer; ++il) {
  9934. const llama_layer * layer = &model.layers[il];
  9935. inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs);
  9936. ggml_tensor * token_shift = build_rwkv_token_shift_load(rs_inp, gf, ubatch, il);
  9937. ggml_tensor * att_norm = build_norm(inpL, layer->attn_norm, layer->attn_norm_b, LLM_NORM_RMS, il);
  9938. cb(att_norm, "attn_norm", il);
  9939. ggml_tensor * x_prev = ggml_concat(
  9940. ctx0,
  9941. token_shift,
  9942. ggml_view_3d(ctx0, att_norm, n_embd, n_seq_tokens - 1, n_seqs, att_norm->nb[1], att_norm->nb[2], 0),
  9943. 1
  9944. );
  9945. cur = build_rwkv7_time_mix(rs_inp, gf, att_norm, x_prev, v_first, ubatch, il);
  9946. token_shift = ggml_view_3d(ctx0, att_norm, n_embd, 1, n_seqs, att_norm->nb[1], att_norm->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(att_norm));
  9947. ggml_build_forward_expand(gf, build_rwkv_token_shift_store(token_shift, ubatch, il));
  9948. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
  9949. cb(ffn_inp, "ffn_inp", il);
  9950. cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens);
  9951. ffn_inp = ggml_reshape_2d(ctx0, ffn_inp, n_embd, n_tokens);
  9952. if (il == n_layer - 1 && inp_out_ids) {
  9953. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  9954. ffn_inp = ggml_get_rows(ctx0, ffn_inp, inp_out_ids);
  9955. }
  9956. // feed-forward network
  9957. cur = build_norm(ffn_inp,
  9958. model.layers[il].ffn_norm, NULL,
  9959. LLM_NORM_RMS, il);
  9960. cb(cur, "ffn_norm", il);
  9961. cur = build_ffn(cur,
  9962. model.layers[il].ffn_up, NULL, NULL,
  9963. model.layers[il].ffn_gate, NULL, NULL,
  9964. model.layers[il].ffn_down, NULL, NULL,
  9965. NULL,
  9966. LLM_FFN_SILU, LLM_FFN_PAR, il);
  9967. cb(cur, "ffn_out", il);
  9968. cur = ggml_add(ctx0, cur, ffn_inp);
  9969. cur = build_cvec(cur, il);
  9970. cb(cur, "l_out", il);
  9971. // input for next layer
  9972. inpL = cur;
  9973. }
  9974. cur = inpL;
  9975. cur = build_norm(cur, model.output_norm, model.output_norm_b, LLM_NORM_RMS, -1);
  9976. cb(cur, "result_norm", -1);
  9977. res->t_embd = cur;
  9978. cur = build_lora_mm(model.output, cur);
  9979. cb(cur, "result_output", -1);
  9980. res->t_logits = cur;
  9981. ggml_build_forward_expand(gf, cur);
  9982. }
  9983. };
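// Granite / Granite MoE: llama-style blocks with extra scaling factors
// from the hparams: f_attention_scale for QK scaling (when non-zero),
// f_residual_scale on both residual branches, and f_logit_scale applied
// to the final logits; MoE layers may add a shared expert, and RoPE can
// be disabled via the use_rope constructor flag.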
  9984. struct llm_build_granite : public llm_graph_context {
  9985. llm_build_granite(
  9986. const llama_model & model,
  9987. const llm_graph_params & params,
  9988. ggml_cgraph * gf,
  9989. const bool use_rope = true)
  9990. : llm_graph_context(params) {
  9991. const int64_t n_embd_head = hparams.n_embd_head_v;
  9992. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  9993. GGML_ASSERT(n_embd_head == hparams.n_rot);
  9994. ggml_tensor * cur;
  9995. ggml_tensor * inpL;
  9996. inpL = build_inp_embd(model.tok_embd);
  9997. // inp_pos - built only if rope enabled
  9998. ggml_tensor * inp_pos = nullptr;
  9999. if (use_rope) {
  10000. inp_pos = build_inp_pos();
  10001. }
  10002. auto * inp_attn = build_attn_inp_kv_unified();
  10003. const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
  10004. ggml_tensor * inp_out_ids = build_inp_out_ids();
  10005. for (int il = 0; il < n_layer; ++il) {
  10006. ggml_tensor * inpSA = inpL;
  10007. // norm
  10008. cur = build_norm(inpL,
  10009. model.layers[il].attn_norm, NULL,
  10010. LLM_NORM_RMS, il);
  10011. cb(cur, "attn_norm", il);
  10012. // self-attention
  10013. {
  10014. // compute Q and K and (optionally) RoPE them
  10015. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  10016. cb(Qcur, "Qcur", il);
  10017. if (model.layers[il].bq) {
  10018. Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
  10019. cb(Qcur, "Qcur", il);
  10020. }
  10021. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  10022. cb(Kcur, "Kcur", il);
  10023. if (model.layers[il].bk) {
  10024. Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
  10025. cb(Kcur, "Kcur", il);
  10026. }
  10027. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  10028. cb(Vcur, "Vcur", il);
  10029. if (model.layers[il].bv) {
  10030. Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
  10031. cb(Vcur, "Vcur", il);
  10032. }
  10033. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  10034. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  10035. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  10036. if (use_rope) {
  10037. ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
  10038. Qcur = ggml_rope_ext(
  10039. ctx0, Qcur, inp_pos, rope_factors,
  10040. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  10041. ext_factor, attn_factor, beta_fast, beta_slow
  10042. );
  10043. Kcur = ggml_rope_ext(
  10044. ctx0, Kcur, inp_pos, rope_factors,
  10045. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  10046. ext_factor, attn_factor, beta_fast, beta_slow
  10047. );
  10048. }
  10049. cb(Qcur, "Qcur", il);
  10050. cb(Kcur, "Kcur", il);
  10051. cb(Vcur, "Vcur", il);
  10052. cur = build_attn(inp_attn, gf,
  10053. model.layers[il].wo, model.layers[il].bo,
  10054. Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
  10055. cb(cur, "attn_out", il);
  10056. }
  10057. if (il == n_layer - 1 && inp_out_ids) {
  10058. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  10059. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  10060. }
  10061. // For Granite architectures - scale residual
  10062. cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
  10063. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  10064. cb(ffn_inp, "ffn_inp", il);
  10065. // feed-forward network (non-MoE)
  10066. if (model.layers[il].ffn_gate_inp == nullptr) {
  10067. cur = build_norm(ffn_inp,
  10068. model.layers[il].ffn_norm, NULL,
  10069. LLM_NORM_RMS, il);
  10070. cb(cur, "ffn_norm", il);
  10071. cur = build_ffn(cur,
  10072. model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
  10073. model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
  10074. model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
  10075. NULL,
  10076. LLM_FFN_SILU, LLM_FFN_PAR, il);
  10077. cb(cur, "ffn_out", il);
  10078. } else {
  10079. // MoE branch
  10080. cur = build_norm(ffn_inp,
  10081. model.layers[il].ffn_norm, NULL,
  10082. LLM_NORM_RMS, il);
  10083. cb(cur, "ffn_norm", il);
  10084. ggml_tensor * moe_out = build_moe_ffn(cur,
  10085. model.layers[il].ffn_gate_inp,
  10086. model.layers[il].ffn_up_exps,
  10087. model.layers[il].ffn_gate_exps,
  10088. model.layers[il].ffn_down_exps,
  10089. nullptr,
  10090. n_expert, n_expert_used,
  10091. LLM_FFN_SILU, true,
  10092. false, 0.0,
  10093. LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
  10094. il);
  10095. cb(moe_out, "ffn_moe_out", il);
  10096. // For Granite MoE Shared
  10097. if (hparams.n_ff_shexp > 0) {
  10098. ggml_tensor * ffn_shexp = build_ffn(cur,
  10099. model.layers[il].ffn_up_shexp, NULL, NULL,
  10100. model.layers[il].ffn_gate_shexp, NULL, NULL,
  10101. model.layers[il].ffn_down_shexp, NULL, NULL,
  10102. NULL,
  10103. LLM_FFN_SILU, LLM_FFN_PAR, il);
  10104. cb(ffn_shexp, "ffn_shexp", il);
  10105. cur = ggml_add(ctx0, moe_out, ffn_shexp);
  10106. cb(cur, "ffn_out", il);
  10107. } else {
  10108. cur = moe_out;
  10109. }
  10110. }
  10111. // For Granite architectures - scale residual
  10112. cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
  10113. cur = ggml_add(ctx0, cur, ffn_inp);
  10114. cb(cur, "ffn_out", il);
  10115. cur = build_cvec(cur, il);
  10116. cb(cur, "l_out", il);
  10117. // input for next layer
  10118. inpL = cur;
  10119. }
  10120. cur = inpL;
  10121. cur = build_norm(cur,
  10122. model.output_norm, NULL,
  10123. LLM_NORM_RMS, -1);
  10124. cb(cur, "result_norm", -1);
  10125. res->t_embd = cur;
  10126. // lm_head
  10127. cur = build_lora_mm(model.output, cur);
  10128. // For Granite architectures - scale logits
  10129. cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_logit_scale);
  10130. cb(cur, "result_output", -1);
  10131. res->t_logits = cur;
  10132. ggml_build_forward_expand(gf, cur);
  10133. }
  10134. };
  10135. // ref: https://github.com/facebookresearch/chameleon
  10136. // based on the original build_llama() function, changes:
  10137. // * qk-norm
  10138. // * swin-norm
  10139. // * removed bias
  10140. // * removed MoE
  10141. struct llm_build_chameleon : public llm_graph_context {
  10142. llm_build_chameleon(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
  10143. const int64_t n_embd_head = hparams.n_embd_head_v;
  10144. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  10145. GGML_ASSERT(n_embd_head == hparams.n_rot);
  10146. ggml_tensor * cur;
  10147. ggml_tensor * inpL;
  10148. inpL = build_inp_embd(model.tok_embd);
  10149. // inp_pos - contains the positions
  10150. ggml_tensor * inp_pos = build_inp_pos();
  10151. auto * inp_attn = build_attn_inp_kv_unified();
  10152. ggml_tensor * inp_out_ids = build_inp_out_ids();
  10153. for (int il = 0; il < n_layer; ++il) {
  10154. ggml_tensor * inpSA = inpL;
  10155. // norm
  10156. if (hparams.swin_norm) {
  10157. cur = inpL;
  10158. } else {
  10159. cur = build_norm(inpL,
  10160. model.layers[il].attn_norm, NULL,
  10161. LLM_NORM_RMS, il);
  10162. cb(cur, "attn_norm", il);
  10163. }
  10164. // self-attention
  10165. {
  10166. // compute Q and K and RoPE them
  10167. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  10168. cb(Qcur, "Qcur", il);
  10169. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  10170. cb(Kcur, "Kcur", il);
  10171. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  10172. cb(Vcur, "Vcur", il);
  10173. if (model.layers[il].attn_q_norm) {
  10174. Qcur = ggml_view_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens,
  10175. ggml_element_size(Qcur) * n_embd_head,
  10176. ggml_element_size(Qcur) * n_embd_head * n_head,
  10177. 0);
  10178. cb(Qcur, "Qcur", il);
  10179. Qcur = build_norm(Qcur,
  10180. model.layers[il].attn_q_norm,
  10181. model.layers[il].attn_q_norm_b,
  10182. LLM_NORM, il);
  10183. cb(Qcur, "Qcur", il);
  10184. }
  10185. if (model.layers[il].attn_k_norm) {
  10186. Kcur = ggml_view_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens,
  10187. ggml_element_size(Kcur) * n_embd_head,
  10188. ggml_element_size(Kcur) * n_embd_head * n_head_kv,
  10189. 0);
  10190. cb(Kcur, "Kcur", il);
  10191. Kcur = build_norm(Kcur,
  10192. model.layers[il].attn_k_norm,
  10193. model.layers[il].attn_k_norm_b,
  10194. LLM_NORM, il);
  10195. cb(Kcur, "Kcur", il);
  10196. }
  10197. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  10198. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  10199. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  10200. Qcur = ggml_rope_ext(
  10201. ctx0, Qcur, inp_pos, nullptr,
  10202. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  10203. ext_factor, attn_factor, beta_fast, beta_slow
  10204. );
  10205. Kcur = ggml_rope_ext(
  10206. ctx0, Kcur, inp_pos, nullptr,
  10207. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  10208. ext_factor, attn_factor, beta_fast, beta_slow
  10209. );
  10210. cb(Qcur, "Qcur", il);
  10211. cb(Kcur, "Kcur", il);
  10212. cb(Vcur, "Vcur", il);
  10213. cur = build_attn(inp_attn, gf,
  10214. model.layers[il].wo, nullptr,
  10215. Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  10216. }
  10217. if (il == n_layer - 1 && inp_out_ids) {
  10218. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  10219. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  10220. }
  10221. if (hparams.swin_norm) {
  10222. cur = build_norm(cur,
  10223. model.layers[il].attn_norm, NULL,
  10224. LLM_NORM_RMS, il);
  10225. }
  10226. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  10227. cb(ffn_inp, "ffn_inp", il);
  10228. // feed-forward network
  10229. if (!hparams.swin_norm) {
  10230. cur = build_norm(ffn_inp,
  10231. model.layers[il].ffn_norm, NULL,
  10232. LLM_NORM_RMS, il);
  10233. cb(cur, "ffn_norm", il);
  10234. }
  10235. cur = build_ffn(cur,
  10236. model.layers[il].ffn_up, NULL, NULL,
  10237. model.layers[il].ffn_gate, NULL, NULL,
  10238. model.layers[il].ffn_down, NULL, NULL,
  10239. NULL,
  10240. LLM_FFN_SILU, LLM_FFN_PAR, il);
  10241. cb(cur, "ffn_out", il);
  10242. if (hparams.swin_norm) {
  10243. cur = build_norm(cur,
  10244. model.layers[il].ffn_norm, NULL,
  10245. LLM_NORM_RMS, il);
  10246. cb(cur, "ffn_norm", il);
  10247. }
  10248. cur = ggml_add(ctx0, cur, ffn_inp);
  10249. cb(cur, "ffn_out", il);
  10250. cur = build_cvec(cur, il);
  10251. cb(cur, "l_out", il);
  10252. // input for next layer
  10253. inpL = cur;
  10254. }
  10255. cur = inpL;
  10256. cur = build_norm(cur,
  10257. model.output_norm, NULL,
  10258. LLM_NORM_RMS, -1);
  10259. cb(cur, "result_norm", -1);
  10260. res->t_embd = cur;
  10261. // lm_head
  10262. cur = build_lora_mm(model.output, cur);
  10263. cb(cur, "result_output_with_img_logits", -1);
  10264. // TODO: this suppresses the output of image tokens, which is required to enable text-only outputs.
  10265. // Needs to be removed once image outputs are supported.
  10266. int img_token_end_idx = 8196;
  10267. int img_token_start_idx = 4;
  10268. int num_img_tokens = img_token_end_idx - img_token_start_idx;
// create a 1d tensor of size num_img_tokens whose values are set to -FLT_MAX by the clamp below,
// which ensures that text token logits are always larger than image token logits
  10271. ggml_tensor * img_logits = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, num_img_tokens);
  10272. img_logits = ggml_clamp(ctx0, img_logits, -FLT_MAX, -FLT_MAX);
  10273. cb(img_logits, "img_logits", -1);
  10274. cur = ggml_set_1d(ctx0, cur, img_logits, ggml_element_size(cur) * img_token_start_idx);
  10275. cb(cur, "result_output", -1);
  10276. res->t_logits = cur;
  10277. ggml_build_forward_expand(gf, cur);
  10278. }
  10279. };
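// WavTokenizer decoder: a convolutional audio decoder rather than a
// token predictor. It applies a conv1d stem, a posnet stage of residual
// conv and single-head attention blocks, a ConvNeXt stack, and an output
// head that produces embeddings (res->t_embd) only, with no logits.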
  10280. struct llm_build_wavtokenizer_dec : public llm_graph_context {
  10281. llm_build_wavtokenizer_dec(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
  10282. ggml_tensor * cur;
  10283. ggml_tensor * inpL;
  10284. inpL = build_inp_embd(model.tok_embd);
  10285. cur = ggml_cont(ctx0, ggml_transpose(ctx0, inpL));
  10286. cur = ggml_conv_1d_ph(ctx0, model.conv1d, cur, 1, 1);
  10287. cur = ggml_add(ctx0, cur, model.conv1d_b);
  10288. // posnet
  10289. for (uint32_t il = 0; il < hparams.posnet.n_layer; ++il) {
  10290. const auto & layer = model.layers[il].posnet;
  10291. inpL = cur;
  10292. switch (il) {
  10293. case 0:
  10294. case 1:
  10295. case 3:
  10296. case 4:
  10297. {
  10298. cur = build_norm(cur,
  10299. layer.norm1,
  10300. layer.norm1_b,
  10301. LLM_NORM_GROUP, 0);
  10302. cur = ggml_mul(ctx0, ggml_sigmoid(ctx0, cur), cur);
  10303. cur = ggml_conv_1d_ph(ctx0, layer.conv1, cur, 1, 1);
  10304. cur = ggml_add(ctx0, cur, layer.conv1_b);
  10305. cur = build_norm(cur,
  10306. layer.norm2,
  10307. layer.norm2_b,
  10308. LLM_NORM_GROUP, 0);
  10309. cur = ggml_mul(ctx0, ggml_sigmoid(ctx0, cur), cur);
  10310. cur = ggml_conv_1d_ph(ctx0, layer.conv2, cur, 1, 1);
  10311. cur = ggml_add(ctx0, cur, layer.conv2_b);
  10312. cur = ggml_add(ctx0, cur, inpL);
  10313. } break;
  10314. case 2:
  10315. {
  10316. cur = build_norm(cur,
  10317. layer.attn_norm,
  10318. layer.attn_norm_b,
  10319. LLM_NORM_GROUP, 0);
  10320. ggml_tensor * q;
  10321. ggml_tensor * k;
  10322. ggml_tensor * v;
  10323. q = ggml_conv_1d_ph(ctx0, layer.attn_q, cur, 1, 1);
  10324. k = ggml_conv_1d_ph(ctx0, layer.attn_k, cur, 1, 1);
  10325. v = ggml_conv_1d_ph(ctx0, layer.attn_v, cur, 1, 1);
  10326. q = ggml_add(ctx0, q, layer.attn_q_b);
  10327. k = ggml_add(ctx0, k, layer.attn_k_b);
  10328. v = ggml_add(ctx0, v, layer.attn_v_b);
  10329. q = ggml_cont(ctx0, ggml_transpose(ctx0, q));
  10330. k = ggml_cont(ctx0, ggml_transpose(ctx0, k));
  10331. ggml_tensor * kq = ggml_mul_mat(ctx0, k, q);
  10332. kq = ggml_soft_max_ext(ctx0, kq, nullptr, 1.0f/sqrtf(float(hparams.posnet.n_embd)), 0.0f);
  10333. cur = ggml_mul_mat(ctx0, kq, v);
  10334. cur = ggml_conv_1d_ph(ctx0, layer.attn_o, cur, 1, 1);
  10335. cur = ggml_add(ctx0, cur, layer.attn_o_b);
  10336. cur = ggml_add(ctx0, cur, inpL);
  10337. } break;
  10338. case 5:
  10339. {
  10340. cur = build_norm(cur,
  10341. layer.norm,
  10342. layer.norm_b,
  10343. LLM_NORM_GROUP, 0);
  10344. } break;
  10345. default: GGML_ABORT("unknown posnet layer");
  10346. };
  10347. }
  10348. cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur));
  10349. cur = build_norm(cur,
  10350. model.tok_norm,
  10351. model.tok_norm_b,
  10352. LLM_NORM, -1);
  10353. cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur));
  10354. inpL = cur;
  10355. // convnext
  10356. for (uint32_t il = 0; il < hparams.convnext.n_layer; ++il) {
  10357. const auto & layer = model.layers[il].convnext;
  10358. cur = inpL;
  10359. cur = ggml_conv_1d_dw_ph(ctx0, layer.dw, cur, 1, 1);
  10360. cur = ggml_add(ctx0, cur, layer.dw_b);
  10361. cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur));
  10362. cur = build_norm(cur,
  10363. layer.norm,
  10364. layer.norm_b,
  10365. LLM_NORM, -1);
  10366. cur = build_ffn(cur,
  10367. layer.pw1, layer.pw1_b, NULL,
  10368. NULL, NULL, NULL,
  10369. layer.pw2, layer.pw2_b, NULL,
  10370. NULL,
  10371. LLM_FFN_GELU, LLM_FFN_SEQ, il);
  10372. cur = ggml_mul(ctx0, cur, layer.gamma);
  10373. cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur));
  10374. inpL = ggml_add(ctx0, cur, inpL);
  10375. }
  10376. cur = inpL;
  10377. cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur));
  10378. cur = build_norm(cur,
  10379. model.output_norm,
  10380. model.output_norm_b,
  10381. LLM_NORM, -1);
  10382. // lm_head
  10383. cur = build_lora_mm(model.output, cur);
  10384. cur = ggml_add(ctx0, cur, model.output_b);
  10385. cb(cur, "result_embd", -1);
  10386. res->t_embd = cur;
  10387. ggml_build_forward_expand(gf, cur);
  10388. }
  10389. };
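// PLM: multi-head latent attention in the style of the DeepSeek-V2
// builders (queries split into RoPE and no-RoPE parts, K/V reconstructed
// from a kv_lora_rank compression with a single shared RoPE key),
// followed by a squared-ReLU feed-forward network.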
  10390. struct llm_build_plm : public llm_graph_context {
  10391. llm_build_plm(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
  10392. const float kq_scale = 1.0f/sqrtf(float(hparams.n_embd_head_k));
  10393. const uint32_t n_embd_head_qk_rope = hparams.n_rot;
  10394. const uint32_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
  10395. const uint32_t kv_lora_rank = hparams.n_lora_kv;
  10396. ggml_tensor * cur;
  10397. ggml_tensor * inpL;
  10398. // {n_embd, n_tokens}
  10399. inpL = build_inp_embd(model.tok_embd);
  10400. // inp_pos - contains the positions
  10401. ggml_tensor * inp_pos = build_inp_pos();
  10402. auto * inp_attn = build_attn_inp_kv_unified();
  10403. ggml_tensor * inp_out_ids = build_inp_out_ids();
  10404. for (int il = 0; il < n_layer; ++il) {
  10405. ggml_tensor * inpSA = inpL;
  10406. // norm
  10407. cur = build_norm(inpL,
  10408. model.layers[il].attn_norm, NULL,
  10409. LLM_NORM_RMS, il);
  10410. cb(cur, "attn_norm", il);
// self-attention
  10412. {
  10413. ggml_tensor * q = NULL;
  10414. q = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
  10415. cb(q, "q", il);
  10416. // split into {n_head * n_embd_head_qk_nope, n_tokens}
  10417. ggml_tensor * q_nope = ggml_view_3d(ctx0, q, n_embd_head_qk_nope, n_head, n_tokens,
  10418. ggml_row_size(q->type, hparams.n_embd_head_k),
  10419. ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
  10420. 0);
  10421. cb(q_nope, "q_nope", il);
  10422. // and {n_head * n_embd_head_qk_rope, n_tokens}
  10423. ggml_tensor * q_pe = ggml_view_3d(ctx0, q, n_embd_head_qk_rope, n_head, n_tokens,
  10424. ggml_row_size(q->type, hparams.n_embd_head_k),
  10425. ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
  10426. ggml_row_size(q->type, n_embd_head_qk_nope));
  10427. cb(q_pe, "q_pe", il);
  10428. // {n_embd, kv_lora_rank + n_embd_head_qk_rope} * {n_embd, n_tokens} -> {kv_lora_rank + n_embd_head_qk_rope, n_tokens}
ggml_tensor * kv_pe_compressed = ggml_mul_mat(ctx0, model.layers[il].wkv_a_mqa, cur);
cb(kv_pe_compressed, "kv_pe_compressed", il);
// split into {kv_lora_rank, n_tokens}
ggml_tensor * kv_compressed = ggml_view_2d(ctx0, kv_pe_compressed, kv_lora_rank, n_tokens,
kv_pe_compressed->nb[1],
0);
cb(kv_compressed, "kv_compressed", il);
// and {n_embd_head_qk_rope, n_tokens}
ggml_tensor * k_pe = ggml_view_3d(ctx0, kv_pe_compressed, n_embd_head_qk_rope, 1, n_tokens,
kv_pe_compressed->nb[1],
kv_pe_compressed->nb[1],
ggml_row_size(kv_pe_compressed->type, kv_lora_rank));
  10441. cb(k_pe, "k_pe", il);
  10442. kv_compressed = build_norm(kv_compressed,
  10443. model.layers[il].attn_kv_a_norm, NULL,
  10444. LLM_NORM_RMS, il);
  10445. cb(kv_compressed, "kv_compressed", il);
  10446. // {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)} * {kv_lora_rank, n_tokens} -> {n_head * (n_embd_head_qk_nope + n_embd_head_v), n_tokens}
  10447. ggml_tensor * kv = ggml_mul_mat(ctx0, model.layers[il].wkv_b, kv_compressed);
  10448. cb(kv, "kv", il);
  10449. // split into {n_head * n_embd_head_qk_nope, n_tokens}
  10450. ggml_tensor * k_nope = ggml_view_3d(ctx0, kv, n_embd_head_qk_nope, n_head, n_tokens,
  10451. ggml_row_size(kv->type, n_embd_head_qk_nope + hparams.n_embd_head_v),
  10452. ggml_row_size(kv->type, n_head * (n_embd_head_qk_nope + hparams.n_embd_head_v)),
  10453. 0);
  10454. cb(k_nope, "k_nope", il);
  10455. // and {n_head * n_embd_head_v, n_tokens}
  10456. ggml_tensor * v_states = ggml_view_3d(ctx0, kv, hparams.n_embd_head_v, n_head, n_tokens,
  10457. ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)),
  10458. ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)*n_head),
  10459. ggml_row_size(kv->type, (n_embd_head_qk_nope)));
  10460. cb(v_states, "v_states", il);
  10461. v_states = ggml_cont(ctx0, v_states);
  10462. cb(v_states, "v_states", il);
  10463. v_states = ggml_view_2d(ctx0, v_states, hparams.n_embd_head_v * n_head, n_tokens,
  10464. ggml_row_size(kv->type, hparams.n_embd_head_v * n_head),
  10465. 0);
  10466. cb(v_states, "v_states", il);
  10467. q_pe = ggml_rope_ext(
  10468. ctx0, q_pe, inp_pos, nullptr,
  10469. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  10470. ext_factor, attn_factor, beta_fast, beta_slow
  10471. );
  10472. cb(q_pe, "q_pe", il);
  10473. // shared RoPE key
  10474. k_pe = ggml_rope_ext(
  10475. ctx0, k_pe, inp_pos, nullptr,
  10476. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  10477. ext_factor, attn_factor, beta_fast, beta_slow
  10478. );
  10479. cb(k_pe, "k_pe", il);
  10480. ggml_tensor * q_states = ggml_concat(ctx0, q_nope, q_pe, 0);
  10481. cb(q_states, "q_states", il);
  10482. ggml_tensor * k_states = ggml_concat(ctx0, k_nope, ggml_repeat(ctx0, k_pe, q_pe), 0);
  10483. cb(k_states, "k_states", il);
  10484. cur = build_attn(inp_attn, gf,
  10485. model.layers[il].wo, NULL,
  10486. q_states, k_states, v_states, nullptr, nullptr, kq_scale, il);
  10487. }
  10488. if (il == n_layer - 1 && inp_out_ids) {
  10489. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  10490. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  10491. }
  10492. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  10493. cb(ffn_inp, "ffn_inp", il);
  10494. cur = build_norm(ffn_inp,
  10495. model.layers[il].ffn_norm, NULL,
  10496. LLM_NORM_RMS, il);
  10497. cb(cur, "ffn_norm", il);
  10498. cur = build_ffn(cur,
  10499. model.layers[il].ffn_up, NULL, NULL,
  10500. NULL, NULL, NULL,
  10501. model.layers[il].ffn_down, NULL, NULL,
  10502. NULL,
  10503. LLM_FFN_RELU_SQR, LLM_FFN_SEQ, il);
  10504. cb(cur, "ffn_out", il);
  10505. cur = ggml_add(ctx0, cur, ffn_inp);
  10506. cur = build_cvec(cur, il);
  10507. cb(cur, "l_out", il);
  10508. // input for next layer
  10509. inpL = cur;
  10510. }
  10511. cur = inpL;
  10512. cur = build_norm(cur,
  10513. model.output_norm, NULL,
  10514. LLM_NORM_RMS, -1);
  10515. cb(cur, "result_norm", -1);
  10516. res->t_embd = cur;
  10517. cur = build_lora_mm(model.output, cur);
  10518. cb(cur, "result_output", -1);
  10519. res->t_logits = cur;
  10520. ggml_build_forward_expand(gf, cur);
  10521. }
  10522. };
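// BailingMoE: RoPE attention with head size n_rot, followed by a MoE
// feed-forward (softmax gating, expert weight normalization and scaling
// taken from the hparams) combined with an always-active shared expert.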
  10523. struct llm_build_bailingmoe : public llm_graph_context {
  10524. llm_build_bailingmoe(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
  10525. ggml_tensor * cur;
  10526. ggml_tensor * inpL;
  10527. inpL = build_inp_embd(model.tok_embd);
  10528. // inp_pos - contains the positions
  10529. ggml_tensor * inp_pos = build_inp_pos();
  10530. auto * inp_attn = build_attn_inp_kv_unified();
  10531. ggml_tensor * inp_out_ids = build_inp_out_ids();
  10532. for (int il = 0; il < n_layer; ++il) {
  10533. ggml_tensor * inpSA = inpL;
  10534. // norm
  10535. cur = build_norm(inpL,
  10536. model.layers[il].attn_norm, NULL,
  10537. LLM_NORM_RMS, il);
  10538. cb(cur, "attn_norm", il);
  10539. // self-attention
  10540. {
// RoPE frequency factors (e.g. llama3-style scaling); get_rope_factors() may return nullptr for models that do not use them
  10542. ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
  10543. // compute Q and K and RoPE them
  10544. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  10545. cb(Qcur, "Qcur", il);
  10546. if (model.layers[il].bq) {
  10547. Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
  10548. cb(Qcur, "Qcur", il);
  10549. }
  10550. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  10551. cb(Kcur, "Kcur", il);
  10552. if (model.layers[il].bk) {
  10553. Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
  10554. cb(Kcur, "Kcur", il);
  10555. }
  10556. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  10557. cb(Vcur, "Vcur", il);
  10558. if (model.layers[il].bv) {
  10559. Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
  10560. cb(Vcur, "Vcur", il);
  10561. }
  10562. Qcur = ggml_reshape_3d(ctx0, Qcur, n_rot, n_head, n_tokens);
  10563. Kcur = ggml_reshape_3d(ctx0, Kcur, n_rot, n_head_kv, n_tokens);
  10564. Vcur = ggml_reshape_3d(ctx0, Vcur, n_rot, n_head_kv, n_tokens);
  10565. Qcur = ggml_rope_ext(
  10566. ctx0, Qcur, inp_pos, rope_factors,
  10567. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  10568. ext_factor, attn_factor, beta_fast, beta_slow
  10569. );
  10570. Kcur = ggml_rope_ext(
  10571. ctx0, Kcur, inp_pos, rope_factors,
  10572. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  10573. ext_factor, attn_factor, beta_fast, beta_slow
  10574. );
  10575. cb(Qcur, "Qcur", il);
  10576. cb(Kcur, "Kcur", il);
  10577. cb(Vcur, "Vcur", il);
  10578. cur = build_attn(inp_attn, gf,
  10579. model.layers[il].wo, model.layers[il].bo,
  10580. Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_rot)), il);
  10581. }
  10582. if (il == n_layer - 1 && inp_out_ids) {
  10583. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  10584. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  10585. }
  10586. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  10587. cb(ffn_inp, "ffn_inp", il);
  10588. cur = build_norm(ffn_inp,
  10589. model.layers[il].ffn_norm, NULL,
  10590. LLM_NORM_RMS, il);
  10591. cb(cur, "ffn_norm", il);
  10592. ggml_tensor * moe_out =
  10593. build_moe_ffn(cur,
  10594. model.layers[il].ffn_gate_inp,
  10595. model.layers[il].ffn_up_exps,
  10596. model.layers[il].ffn_gate_exps,
  10597. model.layers[il].ffn_down_exps,
  10598. nullptr,
  10599. n_expert, n_expert_used,
  10600. LLM_FFN_SILU, hparams.expert_weights_norm,
  10601. false, hparams.expert_weights_scale,
  10602. LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
  10603. il);
  10604. cb(moe_out, "ffn_moe_out", il);
  10605. // FFN shared expert
  10606. {
  10607. ggml_tensor * ffn_shexp = build_ffn(cur,
  10608. model.layers[il].ffn_up_shexp, NULL, NULL,
  10609. model.layers[il].ffn_gate_shexp, NULL, NULL,
  10610. model.layers[il].ffn_down_shexp, NULL, NULL,
  10611. NULL,
  10612. LLM_FFN_SILU, LLM_FFN_PAR, il);
  10613. cb(ffn_shexp, "ffn_shexp", il);
  10614. cur = ggml_add(ctx0, moe_out, ffn_shexp);
  10615. cb(cur, "ffn_out", il);
  10616. }
  10617. cur = ggml_add(ctx0, cur, ffn_inp);
  10618. cur = build_cvec(cur, il);
  10619. cb(cur, "l_out", il);
  10620. // input for next layer
  10621. inpL = cur;
  10622. }
  10623. cur = inpL;
  10624. cur = build_norm(cur,
  10625. model.output_norm, NULL,
  10626. LLM_NORM_RMS, -1);
  10627. cb(cur, "result_norm", -1);
  10628. res->t_embd = cur;
  10629. // lm_head
  10630. cur = build_lora_mm(model.output, cur);
  10631. cb(cur, "result_output", -1);
  10632. res->t_logits = cur;
  10633. ggml_build_forward_expand(gf, cur);
  10634. }
  10635. };

struct llm_build_dots1 : public llm_graph_context {
    llm_build_dots1(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
        const int64_t n_embd_head = hparams.n_embd_head_v;

        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
        GGML_ASSERT(n_embd_head == hparams.n_rot);

        ggml_tensor * cur;
        ggml_tensor * inpL;

        inpL = build_inp_embd(model.tok_embd);

        // inp_pos - contains the positions
        ggml_tensor * inp_pos = build_inp_pos();

        auto * inp_attn = build_attn_inp_kv_unified();

        ggml_tensor * inp_out_ids = build_inp_out_ids();

        for (int il = 0; il < n_layer; ++il) {
            ggml_tensor * inpSA = inpL;

            // norm
            cur = build_norm(inpL,
                    model.layers[il].attn_norm, NULL,
                    LLM_NORM_RMS, il);
            cb(cur, "attn_norm", il);

            // self-attention
            {
                // compute Q and K and RoPE them
                ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
                cb(Qcur, "Qcur", il);

                ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
                cb(Kcur, "Kcur", il);

                ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
                cb(Vcur, "Vcur", il);

                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,    n_tokens);
                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
                Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);

                Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
                cb(Qcur, "Qcur_normed", il);

                Qcur = ggml_rope_ext(
                        ctx0, Qcur, inp_pos, nullptr,
                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                        ext_factor, attn_factor, beta_fast, beta_slow
                        );

                Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
                cb(Kcur, "Kcur_normed", il);

                Kcur = ggml_rope_ext(
                        ctx0, Kcur, inp_pos, nullptr,
                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                        ext_factor, attn_factor, beta_fast, beta_slow
                        );

                cb(Qcur, "Qcur", il);
                cb(Kcur, "Kcur", il);
                cb(Vcur, "Vcur", il);

                cur = build_attn(inp_attn, gf,
                        model.layers[il].wo, model.layers[il].bo,
                        Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
            }

            if (il == n_layer - 1 && inp_out_ids) {
                cur   = ggml_get_rows(ctx0, cur,   inp_out_ids);
                inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
            }

            ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
            cb(ffn_inp, "ffn_inp", il);

            // MoE branch
            cur = build_norm(ffn_inp,
                    model.layers[il].ffn_norm, NULL,
                    LLM_NORM_RMS, il);
            cb(cur, "ffn_norm", il);

            if ((uint32_t) il < hparams.n_layer_dense_lead) {
                cur = build_ffn(cur,
                        model.layers[il].ffn_up,   NULL, NULL,
                        model.layers[il].ffn_gate, NULL, NULL,
                        model.layers[il].ffn_down, NULL, NULL,
                        NULL,
                        LLM_FFN_SILU, LLM_FFN_PAR, il);
                cb(cur, "ffn_out", il);
            } else {
                ggml_tensor * moe_out =
                    build_moe_ffn(cur,
                            model.layers[il].ffn_gate_inp,
                            model.layers[il].ffn_up_exps,
                            model.layers[il].ffn_gate_exps,
                            model.layers[il].ffn_down_exps,
                            model.layers[il].ffn_exp_probs_b,
                            n_expert, n_expert_used,
                            LLM_FFN_SILU, hparams.expert_weights_norm,
                            true, hparams.expert_weights_scale,
                            (llama_expert_gating_func_type) hparams.expert_gating_func,
                            il);
                cb(moe_out, "ffn_moe_out", il);

                // FFN shared expert
                {
                    ggml_tensor * ffn_shexp = build_ffn(cur,
                            model.layers[il].ffn_up_shexp,   NULL, NULL,
                            model.layers[il].ffn_gate_shexp, NULL, NULL,
                            model.layers[il].ffn_down_shexp, NULL, NULL,
                            NULL,
                            LLM_FFN_SILU, LLM_FFN_PAR, il);
                    cb(ffn_shexp, "ffn_shexp", il);

                    cur = ggml_add(ctx0, moe_out, ffn_shexp);
                    cb(cur, "ffn_out", il);
                }
            }

            cur = ggml_add(ctx0, cur, ffn_inp);

            cur = build_cvec(cur, il);
            cb(cur, "l_out", il);

            // input for next layer
            inpL = cur;
        }

        cur = inpL;

        cur = build_norm(cur,
                model.output_norm, NULL,
                LLM_NORM_RMS, -1);
        cb(cur, "result_norm", -1);
        res->t_embd = cur;

        // lm_head
        cur = build_lora_mm(model.output, cur);
        cb(cur, "result_output", -1);
        res->t_logits = cur;

        ggml_build_forward_expand(gf, cur);
    }
};

struct llm_build_arcee : public llm_graph_context {
    llm_build_arcee(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
        const int64_t n_embd_head = hparams.n_embd_head_v;

        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
        GGML_ASSERT(n_embd_head == hparams.n_rot);

        ggml_tensor * cur;
        ggml_tensor * inpL;

        inpL = build_inp_embd(model.tok_embd);

        // inp_pos - contains the positions
        ggml_tensor * inp_pos = build_inp_pos();

        auto * inp_attn = build_attn_inp_kv_unified();

        const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;

        ggml_tensor * inp_out_ids = build_inp_out_ids();

        for (int il = 0; il < n_layer; ++il) {
            ggml_tensor * inpSA = inpL;

            // norm
            cur = build_norm(inpL,
                    model.layers[il].attn_norm, NULL,
                    LLM_NORM_RMS, il);
            cb(cur, "attn_norm", il);

            // self-attention
            {
                // rope freq factors for llama3; may return nullptr for llama2 and other models
                ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);

                // compute Q and K and RoPE them
                ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
                cb(Qcur, "Qcur", il);
                if (model.layers[il].bq) {
                    Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
                    cb(Qcur, "Qcur", il);
                }

                ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
                cb(Kcur, "Kcur", il);
                if (model.layers[il].bk) {
                    Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
                    cb(Kcur, "Kcur", il);
                }

                ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
                cb(Vcur, "Vcur", il);
                if (model.layers[il].bv) {
                    Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
                    cb(Vcur, "Vcur", il);
                }

                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,    n_tokens);
                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
                Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);

                Qcur = ggml_rope_ext(
                        ctx0, Qcur, inp_pos, rope_factors,
                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                        ext_factor, attn_factor, beta_fast, beta_slow
                        );

                Kcur = ggml_rope_ext(
                        ctx0, Kcur, inp_pos, rope_factors,
                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                        ext_factor, attn_factor, beta_fast, beta_slow
                        );

                cb(Qcur, "Qcur", il);
                cb(Kcur, "Kcur", il);
                cb(Vcur, "Vcur", il);

                cur = build_attn(inp_attn, gf,
                        model.layers[il].wo, model.layers[il].bo,
                        Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
                cb(cur, "attn_out", il);
            }

            if (il == n_layer - 1 && inp_out_ids) {
                cur   = ggml_get_rows(ctx0, cur,   inp_out_ids);
                inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
            }

            ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
            cb(ffn_inp, "ffn_inp", il);

            // feed-forward network
            // ARCEE uses relu^2 instead of silu
            cur = build_norm(ffn_inp,
                    model.layers[il].ffn_norm, NULL,
                    LLM_NORM_RMS, il);
            cb(cur, "ffn_norm", il);

            cur = build_ffn(cur,
                    model.layers[il].ffn_up,   NULL, NULL,
                    NULL,                      NULL, NULL,
                    model.layers[il].ffn_down, NULL, NULL,
                    NULL,
                    LLM_FFN_RELU_SQR, LLM_FFN_SEQ, il);
            cb(cur, "ffn_out", il);

            cur = ggml_add(ctx0, cur, ffn_inp);
            cb(cur, "ffn_out", il);

            cur = build_cvec(cur, il);
            cb(cur, "l_out", il);

            // input for next layer
            inpL = cur;
        }

        cur = inpL;

        cur = build_norm(cur,
                model.output_norm, NULL,
                LLM_NORM_RMS, -1);
        cb(cur, "result_norm", -1);
        res->t_embd = cur;

        // lm_head
        cur = build_lora_mm(model.output, cur);
        cb(cur, "result_output", -1);
        res->t_logits = cur;

        ggml_build_forward_expand(gf, cur);
    }
};

llama_memory_i * llama_model::create_memory(const llama_memory_params & params, llama_cparams & cparams) const {
    llama_memory_i * res;

    switch (arch) {
        // Models that need specific instantiation should be handled in the
        // switch statement
        case LLM_ARCH_BERT:
        case LLM_ARCH_JINA_BERT_V2:
        case LLM_ARCH_NOMIC_BERT:
        case LLM_ARCH_NOMIC_BERT_MOE:
        case LLM_ARCH_NEO_BERT:
        case LLM_ARCH_WAVTOKENIZER_DEC:
            {
                res = nullptr;
            } break;
        // Models that need standard caching should rely on recurrent/hybrid
        // checks
        default:
            {
                if (llm_arch_is_recurrent(arch)) {
                    res = new llama_memory_recurrent(
                            *this,
                            nullptr,
                            GGML_TYPE_F32,
                            GGML_TYPE_F32,
                            cparams.offload_kqv,
                            std::max((uint32_t) 1, cparams.n_seq_max),
                            cparams.n_seq_max);
                } else if (llm_arch_is_hybrid(arch)) {
                    const auto padding = llama_kv_cache_unified::get_padding(cparams);

                    cparams.n_ctx = GGML_PAD(cparams.n_ctx, padding);

                    res = new llama_memory_hybrid(
                            /* model             */ *this,
                            /* attn_type_k       */ params.type_k,
                            /* attn_type_v       */ params.type_v,
                            /* attn_v_trans      */ !cparams.flash_attn,
                            /* attn_kv_size      */ cparams.n_ctx,
                            /* attn_n_pad        */ padding,
                            /* attn_n_swa        */ hparams.n_swa,
                            /* attn_swa_type     */ hparams.swa_type,
                            /* recurrent_type_k  */ GGML_TYPE_F32,
                            /* recurrent_type_v  */ GGML_TYPE_F32,
                            /* recurrent_kv_size */ std::max((uint32_t) 1, cparams.n_seq_max),
                            /* n_seq_max         */ cparams.n_seq_max,
                            /* offload           */ cparams.offload_kqv);
                } else {
                    const auto padding = llama_kv_cache_unified::get_padding(cparams);

                    cparams.n_ctx = GGML_PAD(cparams.n_ctx, padding);

                    LLAMA_LOG_DEBUG("%s: n_ctx = %u (padded)\n", __func__, cparams.n_ctx);

                    if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
                        GGML_ASSERT(hparams.is_swa_any());

                        res = new llama_kv_cache_unified_iswa(
                                *this,
                                params.type_k,
                                params.type_v,
                                !cparams.flash_attn,
                                cparams.offload_kqv,
                                params.swa_full,
                                cparams.n_ctx,
                                cparams.n_seq_max,
                                cparams.n_ubatch,
                                padding);
                    } else {
                        GGML_ASSERT(!hparams.is_swa_any());

                        res = new llama_kv_cache_unified(
                                *this,
                                nullptr,
                                params.type_k,
                                params.type_v,
                                !cparams.flash_attn,
                                cparams.offload_kqv,
                                cparams.n_ctx,
                                cparams.n_seq_max,
                                padding,
                                hparams.n_swa,
                                hparams.swa_type);
                    }
                }
            }
    }

    return res;
}
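
// Worked example (illustrative comment, not upstream code): GGML_PAD(x, n) rounds x up to
// the next multiple of n, so the KV cache is always sized to a multiple of the padding
// reported by llama_kv_cache_unified::get_padding(). Assuming a hypothetical padding of 256
// and a requested context of 4000 tokens:
//
//     uint32_t requested = 4000;
//     uint32_t padded    = GGML_PAD(requested, 256); // -> 4096, the size the cache is created with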

llm_graph_result_ptr llama_model::build_graph(
        const llm_graph_params & params,
        ggml_cgraph * gf,
        llm_graph_type type) const {
    std::unique_ptr<llm_graph_context> llm;

    switch (arch) {
        case LLM_ARCH_LLAMA:
            {
                llm = std::make_unique<llm_build_llama>(*this, params, gf);
            } break;
        case LLM_ARCH_LLAMA4:
            {
                llm = std::make_unique<llm_build_llama_iswa>(*this, params, gf);
            } break;
        case LLM_ARCH_DECI:
            {
                llm = std::make_unique<llm_build_deci>(*this, params, gf);
            } break;
        case LLM_ARCH_BAICHUAN:
            {
                llm = std::make_unique<llm_build_baichuan>(*this, params, gf);
            } break;
        case LLM_ARCH_FALCON:
            {
                llm = std::make_unique<llm_build_falcon>(*this, params, gf);
            } break;
        case LLM_ARCH_GROK:
            {
                llm = std::make_unique<llm_build_grok>(*this, params, gf);
            } break;
        case LLM_ARCH_STARCODER:
            {
                llm = std::make_unique<llm_build_starcoder>(*this, params, gf);
            } break;
        case LLM_ARCH_REFACT:
            {
                llm = std::make_unique<llm_build_refact>(*this, params, gf);
            } break;
        case LLM_ARCH_BERT:
        case LLM_ARCH_JINA_BERT_V2:
        case LLM_ARCH_NOMIC_BERT:
        case LLM_ARCH_NOMIC_BERT_MOE:
            {
                llm = std::make_unique<llm_build_bert>(*this, params, gf);
            } break;
        case LLM_ARCH_NEO_BERT:
            {
                llm = std::make_unique<llm_build_neo_bert>(*this, params, gf);
            } break;
        case LLM_ARCH_BLOOM:
            {
                llm = std::make_unique<llm_build_bloom>(*this, params, gf);
            } break;
        case LLM_ARCH_MPT:
            {
                llm = std::make_unique<llm_build_mpt>(*this, params, gf);
            } break;
        case LLM_ARCH_STABLELM:
            {
                llm = std::make_unique<llm_build_stablelm>(*this, params, gf);
            } break;
        case LLM_ARCH_QWEN:
            {
                llm = std::make_unique<llm_build_qwen>(*this, params, gf);
            } break;
        case LLM_ARCH_QWEN2:
            {
                llm = std::make_unique<llm_build_qwen2>(*this, params, gf);
            } break;
        case LLM_ARCH_QWEN2VL:
            {
                llm = std::make_unique<llm_build_qwen2vl>(*this, params, gf);
            } break;
        case LLM_ARCH_QWEN2MOE:
            {
                llm = std::make_unique<llm_build_qwen2moe>(*this, params, gf);
            } break;
        case LLM_ARCH_QWEN3:
            {
                llm = std::make_unique<llm_build_qwen3>(*this, params, gf);
            } break;
        case LLM_ARCH_QWEN3MOE:
            {
                llm = std::make_unique<llm_build_qwen3moe>(*this, params, gf);
            } break;
        case LLM_ARCH_PHI2:
            {
                llm = std::make_unique<llm_build_phi2>(*this, params, gf);
            } break;
        case LLM_ARCH_PHI3:
        case LLM_ARCH_PHIMOE:
            {
                if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
                    llm = std::make_unique<llm_build_phi3<true>> (*this, params, gf);
                } else {
                    llm = std::make_unique<llm_build_phi3<false>>(*this, params, gf);
                }
            } break;
        case LLM_ARCH_PLAMO:
            {
                llm = std::make_unique<llm_build_plamo>(*this, params, gf);
            } break;
        case LLM_ARCH_GPT2:
            {
                llm = std::make_unique<llm_build_gpt2>(*this, params, gf);
            } break;
        case LLM_ARCH_CODESHELL:
            {
                llm = std::make_unique<llm_build_codeshell>(*this, params, gf);
            } break;
        case LLM_ARCH_ORION:
            {
                llm = std::make_unique<llm_build_orion>(*this, params, gf);
            } break;
        case LLM_ARCH_INTERNLM2:
            {
                llm = std::make_unique<llm_build_internlm2>(*this, params, gf);
            } break;
        case LLM_ARCH_MINICPM3:
            {
                llm = std::make_unique<llm_build_minicpm3>(*this, params, gf);
            } break;
        case LLM_ARCH_GEMMA:
            {
                llm = std::make_unique<llm_build_gemma>(*this, params, gf);
            } break;
        case LLM_ARCH_GEMMA2:
            {
                llm = std::make_unique<llm_build_gemma2_iswa>(*this, params, gf);
            } break;
        case LLM_ARCH_GEMMA3:
            {
                llm = std::make_unique<llm_build_gemma3_iswa>(*this, params, gf);
            } break;
        case LLM_ARCH_STARCODER2:
            {
                llm = std::make_unique<llm_build_starcoder2>(*this, params, gf);
            } break;
        case LLM_ARCH_MAMBA:
            {
                llm = std::make_unique<llm_build_mamba>(*this, params, gf);
            } break;
        case LLM_ARCH_XVERSE:
            {
                llm = std::make_unique<llm_build_xverse>(*this, params, gf);
            } break;
        case LLM_ARCH_COMMAND_R:
            {
                llm = std::make_unique<llm_build_command_r>(*this, params, gf);
            } break;
        case LLM_ARCH_COHERE2:
            {
                llm = std::make_unique<llm_build_cohere2_iswa>(*this, params, gf);
            } break;
        case LLM_ARCH_DBRX:
            {
                llm = std::make_unique<llm_build_dbrx>(*this, params, gf);
            } break;
        case LLM_ARCH_OLMO:
            {
                llm = std::make_unique<llm_build_olmo>(*this, params, gf);
            } break;
        case LLM_ARCH_OLMO2:
            {
                llm = std::make_unique<llm_build_olmo2>(*this, params, gf);
            } break;
        case LLM_ARCH_OLMOE:
            {
                llm = std::make_unique<llm_build_olmoe>(*this, params, gf);
            } break;
        case LLM_ARCH_OPENELM:
            {
                llm = std::make_unique<llm_build_openelm>(*this, params, gf);
            } break;
        case LLM_ARCH_GPTNEOX:
            {
                llm = std::make_unique<llm_build_gptneox>(*this, params, gf);
            } break;
        case LLM_ARCH_ARCTIC:
            {
                llm = std::make_unique<llm_build_arctic>(*this, params, gf);
            } break;
        case LLM_ARCH_DEEPSEEK:
            {
                llm = std::make_unique<llm_build_deepseek>(*this, params, gf);
            } break;
        case LLM_ARCH_DEEPSEEK2:
            {
                llm = std::make_unique<llm_build_deepseek2>(*this, params, gf);
            } break;
        case LLM_ARCH_CHATGLM:
            {
                llm = std::make_unique<llm_build_chatglm>(*this, params, gf);
            } break;
        case LLM_ARCH_GLM4:
            {
                llm = std::make_unique<llm_build_glm4>(*this, params, gf);
            } break;
        case LLM_ARCH_BITNET:
            {
                llm = std::make_unique<llm_build_bitnet>(*this, params, gf);
            } break;
        case LLM_ARCH_T5:
            {
                switch (type) {
                    case LLM_GRAPH_TYPE_ENCODER:
                        llm = std::make_unique<llm_build_t5_enc>(*this, params, gf);
                        break;
                    case LLM_GRAPH_TYPE_DEFAULT:
                    case LLM_GRAPH_TYPE_DECODER:
                        llm = std::make_unique<llm_build_t5_dec>(*this, params, gf);
                        break;
                    default:
                        GGML_ABORT("invalid graph type");
                }
            } break;
        case LLM_ARCH_T5ENCODER:
            {
                llm = std::make_unique<llm_build_t5_enc>(*this, params, gf);
            } break;
        case LLM_ARCH_JAIS:
            {
                llm = std::make_unique<llm_build_jais>(*this, params, gf);
            } break;
        case LLM_ARCH_NEMOTRON:
            {
                llm = std::make_unique<llm_build_nemotron>(*this, params, gf);
            } break;
        case LLM_ARCH_EXAONE:
            {
                llm = std::make_unique<llm_build_exaone>(*this, params, gf);
            } break;
        case LLM_ARCH_RWKV6:
            {
                llm = std::make_unique<llm_build_rwkv6>(*this, params, gf);
            } break;
        case LLM_ARCH_RWKV6QWEN2:
            {
                llm = std::make_unique<llm_build_rwkv6qwen2>(*this, params, gf);
            } break;
        case LLM_ARCH_RWKV7:
            {
                llm = std::make_unique<llm_build_rwkv7>(*this, params, gf);
            } break;
        case LLM_ARCH_ARWKV7:
            {
                llm = std::make_unique<llm_build_arwkv7>(*this, params, gf);
            } break;
        case LLM_ARCH_GRANITE:
        case LLM_ARCH_GRANITE_MOE:
        case LLM_ARCH_MINICPM:
            {
                llm = std::make_unique<llm_build_granite>(*this, params, gf);
            } break;
        case LLM_ARCH_CHAMELEON:
            {
                llm = std::make_unique<llm_build_chameleon>(*this, params, gf);
            } break;
        case LLM_ARCH_WAVTOKENIZER_DEC:
            {
                llm = std::make_unique<llm_build_wavtokenizer_dec>(*this, params, gf);
            } break;
        case LLM_ARCH_PLM:
            {
                llm = std::make_unique<llm_build_plm>(*this, params, gf);
            } break;
        case LLM_ARCH_BAILINGMOE:
            {
                llm = std::make_unique<llm_build_bailingmoe>(*this, params, gf);
            } break;
        case LLM_ARCH_DOTS1:
            {
                llm = std::make_unique<llm_build_dots1>(*this, params, gf);
            } break;
        case LLM_ARCH_ARCEE:
            {
                llm = std::make_unique<llm_build_arcee>(*this, params, gf);
            } break;
        default:
            GGML_ABORT("fatal error");
    }

    // add on pooling layer
    llm->build_pooling(gf, cls, cls_b, cls_out, cls_out_b);

    return std::move(llm->res);
}
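
// Illustrative sketch (not upstream code): support for a new architecture would extend the
// dispatch above with one more case, following the existing pattern. LLM_ARCH_MY_MODEL and
// llm_build_my_model are hypothetical names used only for this example:
//
//     case LLM_ARCH_MY_MODEL:
//         {
//             llm = std::make_unique<llm_build_my_model>(*this, params, gf);
//         } break;
//
// A matching entry in llama_model_rope_type() below keeps the RoPE dispatch exhaustive.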

//
// interface implementation
//

llama_model_params llama_model_default_params() {
    llama_model_params result = {
        /*.devices                     =*/ nullptr,
        /*.tensor_buft_overrides       =*/ nullptr,
        /*.n_gpu_layers                =*/ 0,
        /*.split_mode                  =*/ LLAMA_SPLIT_MODE_LAYER,
        /*.main_gpu                    =*/ 0,
        /*.tensor_split                =*/ nullptr,
        /*.progress_callback           =*/ nullptr,
        /*.progress_callback_user_data =*/ nullptr,
        /*.kv_overrides                =*/ nullptr,
        /*.vocab_only                  =*/ false,
        /*.use_mmap                    =*/ true,
        /*.use_mlock                   =*/ false,
        /*.check_tensors               =*/ false,
    };

#ifdef GGML_USE_METAL
    // note: we usually have plenty of VRAM, so by default offload all layers to the GPU
    result.n_gpu_layers = 999;
#endif

    return result;
}
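
// Usage sketch (illustrative comment, not upstream code): a caller typically starts from the
// defaults above and overrides a few fields before loading. The path and the chosen values are
// placeholders; llama_model_load_from_file() is assumed to be the loader entry point from llama.h.
//
//     llama_model_params mparams = llama_model_default_params();
//     mparams.n_gpu_layers = 32;     // offload 32 layers when a GPU backend is available
//     mparams.use_mmap     = false;  // read the whole file into memory instead of mmap-ing it
//     llama_model * model = llama_model_load_from_file("path/to/model.gguf", mparams);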

const llama_vocab * llama_model_get_vocab(const llama_model * model) {
    return &model->vocab;
}

void llama_free_model(llama_model * model) {
    llama_model_free(model);
}

void llama_model_free(llama_model * model) {
    delete model;
}

int32_t llama_model_n_ctx_train(const llama_model * model) {
    return model->hparams.n_ctx_train;
}

int32_t llama_model_n_embd(const llama_model * model) {
    return model->hparams.n_embd;
}

int32_t llama_model_n_layer(const llama_model * model) {
    return model->hparams.n_layer;
}

int32_t llama_model_n_head(const llama_model * model) {
    return model->hparams.n_head();
}

int32_t llama_model_n_head_kv(const llama_model * model) {
    return model->hparams.n_head_kv();
}

int32_t llama_model_n_swa(const llama_model * model) {
    return model->hparams.n_swa;
}

uint32_t llama_model_n_cls_out(const struct llama_model * model) {
    return model->hparams.n_cls_out;
}

const char * llama_model_cls_label(const struct llama_model * model, uint32_t i) {
    if (i < model->classifier_labels.size()) {
        return model->classifier_labels[i].c_str();
    }

    return nullptr;
}
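
// Usage sketch (illustrative comment, not upstream code): enumerating the classifier labels of
// a loaded model with the two accessors above; `model` is assumed to be a valid llama_model pointer.
//
//     const uint32_t n_cls = llama_model_n_cls_out(model);
//     for (uint32_t i = 0; i < n_cls; ++i) {
//         const char * label = llama_model_cls_label(model, i); // may be nullptr if the index has no label
//         printf("class %u: %s\n", i, label ? label : "(unnamed)");
//     }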

// deprecated
int32_t llama_n_ctx_train(const llama_model * model) {
    return llama_model_n_ctx_train(model);
}

// deprecated
int32_t llama_n_embd(const llama_model * model) {
    return llama_model_n_embd(model);
}

// deprecated
int32_t llama_n_layer(const llama_model * model) {
    return llama_model_n_layer(model);
}

// deprecated
int32_t llama_n_head(const llama_model * model) {
    return llama_model_n_head(model);
}

llama_rope_type llama_model_rope_type(const llama_model * model) {
    switch (model->arch) {
        // these models do not use RoPE
        case LLM_ARCH_GPT2:
        case LLM_ARCH_GPTJ:
        case LLM_ARCH_MPT:
        case LLM_ARCH_REFACT:
        case LLM_ARCH_BLOOM:
        case LLM_ARCH_MAMBA:
        case LLM_ARCH_JINA_BERT_V2:
        case LLM_ARCH_T5:
        case LLM_ARCH_T5ENCODER:
        case LLM_ARCH_JAIS:
        case LLM_ARCH_RWKV6:
        case LLM_ARCH_RWKV6QWEN2:
        case LLM_ARCH_RWKV7:
        case LLM_ARCH_ARWKV7:
        case LLM_ARCH_WAVTOKENIZER_DEC:
            return LLAMA_ROPE_TYPE_NONE;

        // use what we call a normal RoPE, operating on pairs of consecutive head values
        case LLM_ARCH_LLAMA:
        case LLM_ARCH_LLAMA4:
        case LLM_ARCH_DECI:
        case LLM_ARCH_BAICHUAN:
        case LLM_ARCH_STARCODER:
        case LLM_ARCH_INTERNLM2:
        case LLM_ARCH_MINICPM:
        case LLM_ARCH_XVERSE:
        case LLM_ARCH_COMMAND_R:
        case LLM_ARCH_COHERE2:
        case LLM_ARCH_OLMO:
        case LLM_ARCH_ARCTIC:
        case LLM_ARCH_DEEPSEEK:
        case LLM_ARCH_DEEPSEEK2:
        case LLM_ARCH_PLM:
        case LLM_ARCH_CHATGLM:
        case LLM_ARCH_GLM4:
        case LLM_ARCH_GRANITE:
        case LLM_ARCH_GRANITE_MOE:
        case LLM_ARCH_CHAMELEON:
        case LLM_ARCH_BAILINGMOE:
        case LLM_ARCH_NEO_BERT:
        case LLM_ARCH_ARCEE:
            return LLAMA_ROPE_TYPE_NORM;

        // the pairs of head values are offset by n_rot/2
        case LLM_ARCH_FALCON:
        case LLM_ARCH_GROK:
        case LLM_ARCH_DBRX:
        case LLM_ARCH_BERT:
        case LLM_ARCH_NOMIC_BERT:
        case LLM_ARCH_NOMIC_BERT_MOE:
        case LLM_ARCH_STABLELM:
        case LLM_ARCH_BITNET:
        case LLM_ARCH_QWEN:
        case LLM_ARCH_QWEN2:
        case LLM_ARCH_QWEN2MOE:
        case LLM_ARCH_QWEN3:
        case LLM_ARCH_QWEN3MOE:
        case LLM_ARCH_OLMO2:
        case LLM_ARCH_OLMOE:
        case LLM_ARCH_PHI2:
        case LLM_ARCH_PHI3:
        case LLM_ARCH_PHIMOE:
        case LLM_ARCH_PLAMO:
        case LLM_ARCH_GEMMA:
        case LLM_ARCH_GEMMA2:
        case LLM_ARCH_GEMMA3:
        case LLM_ARCH_STARCODER2:
        case LLM_ARCH_OPENELM:
        case LLM_ARCH_GPTNEOX:
        case LLM_ARCH_CODESHELL:
        case LLM_ARCH_ORION:
        case LLM_ARCH_NEMOTRON:
        case LLM_ARCH_EXAONE:
        case LLM_ARCH_MINICPM3:
        case LLM_ARCH_DOTS1:
            return LLAMA_ROPE_TYPE_NEOX;

        case LLM_ARCH_QWEN2VL:
            return LLAMA_ROPE_TYPE_MROPE;

        // all model arches should be listed explicitly here
        case LLM_ARCH_UNKNOWN:
            GGML_ABORT("unknown architecture");
    }

    return LLAMA_ROPE_TYPE_NONE;
}

float llama_model_rope_freq_scale_train(const llama_model * model) {
    return model->hparams.rope_freq_scale_train;
}

int32_t llama_model_meta_val_str(const llama_model * model, const char * key, char * buf, size_t buf_size) {
    const auto & it = model->gguf_kv.find(key);
    if (it == model->gguf_kv.end()) {
        if (buf_size > 0) {
            buf[0] = '\0';
        }
        return -1;
    }

    return snprintf(buf, buf_size, "%s", it->second.c_str());
}

int32_t llama_model_meta_count(const llama_model * model) {
    return (int) model->gguf_kv.size();
}

int32_t llama_model_meta_key_by_index(const llama_model * model, int i, char * buf, size_t buf_size) {
    if (i < 0 || i >= (int) model->gguf_kv.size()) {
        if (buf_size > 0) {
            buf[0] = '\0';
        }
        return -1;
    }

    auto it = model->gguf_kv.begin();
    std::advance(it, i);

    return snprintf(buf, buf_size, "%s", it->first.c_str());
}

int32_t llama_model_meta_val_str_by_index(const llama_model * model, int32_t i, char * buf, size_t buf_size) {
    if (i < 0 || i >= (int) model->gguf_kv.size()) {
        if (buf_size > 0) {
            buf[0] = '\0';
        }
        return -1;
    }

    auto it = model->gguf_kv.begin();
    std::advance(it, i);

    return snprintf(buf, buf_size, "%s", it->second.c_str());
}
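
// Usage sketch (illustrative comment, not upstream code): dumping all GGUF metadata of a loaded
// model with the accessors above. The buffer size of 256 is an arbitrary example choice; values
// longer than the buffer are truncated by snprintf.
//
//     char key[256];
//     char val[256];
//     const int32_t n_meta = llama_model_meta_count(model);
//     for (int32_t i = 0; i < n_meta; ++i) {
//         if (llama_model_meta_key_by_index    (model, i, key, sizeof(key)) >= 0 &&
//             llama_model_meta_val_str_by_index(model, i, val, sizeof(val)) >= 0) {
//             printf("%s = %s\n", key, val);
//         }
//     }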

int32_t llama_model_desc(const llama_model * model, char * buf, size_t buf_size) {
    return snprintf(buf, buf_size, "%s", model->desc().c_str());
}

uint64_t llama_model_size(const llama_model * model) {
    return model->size();
}

const char * llama_model_chat_template(const llama_model * model, const char * name) {
    const auto key = name ? LLM_KV(model->arch, name)(LLM_KV_TOKENIZER_CHAT_TEMPLATE)
                          : LLM_KV(model->arch)(LLM_KV_TOKENIZER_CHAT_TEMPLATE);
    const auto & it = model->gguf_kv.find(key);
    if (it == model->gguf_kv.end()) {
        // one-off fix for very popular models (so we are not flooded with issues)
        // do not extend this list unless absolutely necessary
        // Mistral-Small-2503 does not have a built-in chat template
        llama_vocab_pre_type pre_type = model->vocab.get_pre_type();
        if (!name && pre_type == LLAMA_VOCAB_PRE_TYPE_TEKKEN && model->layers.size() == 40) {
            return "mistral-v7-tekken";
        }

        return nullptr;
    }

    return it->second.c_str();
}
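
// Usage sketch (illustrative comment, not upstream code): fetching the model's default chat
// template; a null result means the GGUF carries no template (and no known fallback applies),
// so the caller has to supply its own.
//
//     const char * tmpl = llama_model_chat_template(model, /*name =*/ nullptr);
//     if (tmpl != nullptr) {
//         // pass tmpl to the chat-template formatter of your choice
//     }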

uint64_t llama_model_n_params(const llama_model * model) {
    return model->n_elements();
}

bool llama_model_has_encoder(const llama_model * model) {
    switch (model->arch) {
        case LLM_ARCH_T5:        return true;
        case LLM_ARCH_T5ENCODER: return true;
        default:                 return false;
    }
}

bool llama_model_has_decoder(const llama_model * model) {
    switch (model->arch) {
        case LLM_ARCH_T5ENCODER: return false;
        default:                 return true;
    }
}

llama_token llama_model_decoder_start_token(const llama_model * model) {
    return model->hparams.dec_start_token_id;
}

bool llama_model_is_recurrent(const llama_model * model) {
    return llm_arch_is_recurrent(model->arch);
}

const std::vector<std::pair<std::string, ggml_tensor *>> & llama_internal_get_tensor_map(const llama_model * model) {
    return model->tensors_by_name;
}