llama-model.cpp 549 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232123312341235123612371238123912401241124212431244124512461247124812491250125112521253125412551256125712581259126012611262126312641265126612671268126912701271127212731274127512761277127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388138913901391139213931394139513961397139813991400140114021403140414051406140714081409141014111412141314141415141614171418141914201421142214231424142514261427142814291430143114321433143414351436143714381439144014411442144314441445144614471448144914501451145214531454145514561457145814591460146114621463146414651466146714681469147014711472147314741475147614771478147914801481148214831484148514861487148814891490149114921493149414951496149714981499150015011502150315041505150615071508150915101511151215131514151515161517151815191520152115221523152415251526152715281529153015311532153315341535153615371538153915401541154215431544154515461547154815491550155115521553155415551556155715581559156015611562156315641565156615671568156915701571157215731574157515761577157815791580158115821583158415851586158715881589159015911592159315941595159615971598159916001601160216031604160516061607160816091610161116121613161416151616161716181619162016211622162316241625162616271628162916301631163216331634163516361637163816391640164116421643164416451646164716481649165016511652165316541655165616571658165916601661166216631664166516661667166816691670167116721673167416751676167716781679168016811682168316841685168616871688168916901691169216931694169516961697169816991700170117021703170417051706170717081709171017111712171317141715171617171718171917201721172217231724172517261727172817291730173117321733173417351736173717381739174017411742174317441745174617471748174917501751175217531754175517561757175817591760176117621763176417651766176717681769177017711772177317741775177617771778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227722782279228022812282228322842285228622872288228922902291229222932294229522962297229822992300230123022303230423052306230723082309231023112312231323142315231623172318231923202321232223232324232523262327232823292330233123322333233423352336233723382339234023412342234323442345234623472348234923502351235223532354235523562357235823592360236123622363236423652366236723682369237023712372237323742375237623772378237923802381238223832384238523862387238823892390239123922393239423952396239723982399240024012402240324042405240624072408240924102411241224132414241524162417241824192420242124222423242424252426242724282429243024312432243324342435243624372438243924402441244224432444244524462447244824492450245124522453245424552456245724582459246024612462246324642465246624672468246924702471247224732474247524762477247824792480248124822483248424852486248724882489249024912492249324942495249624972498249925002501250225032504250525062507250825092510251125122513251425152516251725182519252025212522252325242525252625272528252925302531253225332534253525362537253825392540254125422543254425452546254725482549255025512552255325542555255625572558255925602561256225632564256525662567256825692570257125722573257425752576257725782579258025812582258325842585258625872588258925902591259225932594259525962597259825992600260126022603260426052606260726082609261026112612261326142615261626172618261926202621262226232624262526262627262826292630263126322633263426352636263726382639264026412642264326442645264626472648264926502651265226532654265526562657265826592660266126622663266426652666266726682669267026712672267326742675267626772678267926802681268226832684268526862687268826892690269126922693269426952696269726982699270027012702270327042705270627072708270927102711271227132714271527162717271827192720272127222723272427252726272727282729273027312732273327342735273627372738273927402741274227432744274527462747274827492750275127522753275427552756275727582759276027612762276327642765276627672768276927702771277227732774277527762777277827792780278127822783278427852786278727882789279027912792279327942795279627972798279928002801280228032804280528062807280828092810281128122813281428152816281728182819282028212822282328242825282628272828282928302831283228332834283528362837283828392840284128422843284428452846284728482849285028512852285328542855285628572858285928602861286228632864286528662867286828692870287128722873287428752876287728782879288028812882288328842885288628872888288928902891289228932894289528962897289828992900290129022903290429052906290729082909291029112912291329142915291629172918291929202921292229232924292529262927292829292930293129322933293429352936293729382939294029412942294329442945294629472948294929502951295229532954295529562957295829592960296129622963296429652966296729682969297029712972297329742975297629772978297929802981298229832984298529862987298829892990299129922993299429952996299729982999300030013002300330043005300630073008300930103011301230133014301530163017301830193020302130223023302430253026302730283029303030313032303330343035303630373038303930403041304230433044304530463047304830493050305130523053305430553056305730583059306030613062306330643065306630673068306930703071307230733074307530763077307830793080308130823083308430853086308730883089309030913092309330943095309630973098309931003101310231033104310531063107310831093110311131123113311431153116311731183119312031213122312331243125312631273128312931303131313231333134313531363137313831393140314131423143314431453146314731483149315031513152315331543155315631573158315931603161316231633164316531663167316831693170317131723173317431753176317731783179318031813182318331843185318631873188318931903191319231933194319531963197319831993200320132023203320432053206320732083209321032113212321332143215321632173218321932203221322232233224322532263227322832293230323132323233323432353236323732383239324032413242324332443245324632473248324932503251325232533254325532563257325832593260326132623263326432653266326732683269327032713272327332743275327632773278327932803281328232833284328532863287328832893290329132923293329432953296329732983299330033013302330333043305330633073308330933103311331233133314331533163317331833193320332133223323332433253326332733283329333033313332333333343335333633373338333933403341334233433344334533463347334833493350335133523353335433553356335733583359336033613362336333643365336633673368336933703371337233733374337533763377337833793380338133823383338433853386338733883389339033913392339333943395339633973398339934003401340234033404340534063407340834093410341134123413341434153416341734183419342034213422342334243425342634273428342934303431343234333434343534363437343834393440344134423443344434453446344734483449345034513452345334543455345634573458345934603461346234633464346534663467346834693470347134723473347434753476347734783479348034813482348334843485348634873488348934903491349234933494349534963497349834993500350135023503350435053506350735083509351035113512351335143515351635173518351935203521352235233524352535263527352835293530353135323533353435353536353735383539354035413542354335443545354635473548354935503551355235533554355535563557355835593560356135623563356435653566356735683569357035713572357335743575357635773578357935803581358235833584358535863587358835893590359135923593359435953596359735983599360036013602360336043605360636073608360936103611361236133614361536163617361836193620362136223623362436253626362736283629363036313632363336343635363636373638363936403641364236433644364536463647364836493650365136523653365436553656365736583659366036613662366336643665366636673668366936703671367236733674367536763677367836793680368136823683368436853686368736883689369036913692369336943695369636973698369937003701370237033704370537063707370837093710371137123713371437153716371737183719372037213722372337243725372637273728372937303731373237333734373537363737373837393740374137423743374437453746374737483749375037513752375337543755375637573758375937603761376237633764376537663767376837693770377137723773377437753776377737783779378037813782378337843785378637873788378937903791379237933794379537963797379837993800380138023803380438053806380738083809381038113812381338143815381638173818381938203821382238233824382538263827382838293830383138323833383438353836383738383839384038413842384338443845384638473848384938503851385238533854385538563857385838593860386138623863386438653866386738683869387038713872387338743875387638773878387938803881388238833884388538863887388838893890389138923893389438953896389738983899390039013902390339043905390639073908390939103911391239133914391539163917391839193920392139223923392439253926392739283929393039313932393339343935393639373938393939403941394239433944394539463947394839493950395139523953395439553956395739583959396039613962396339643965396639673968396939703971397239733974397539763977397839793980398139823983398439853986398739883989399039913992399339943995399639973998399940004001400240034004400540064007400840094010401140124013401440154016401740184019402040214022402340244025402640274028402940304031403240334034403540364037403840394040404140424043404440454046404740484049405040514052405340544055405640574058405940604061406240634064406540664067406840694070407140724073407440754076407740784079408040814082408340844085408640874088408940904091409240934094409540964097409840994100410141024103410441054106410741084109411041114112411341144115411641174118411941204121412241234124412541264127412841294130413141324133413441354136413741384139414041414142414341444145414641474148414941504151415241534154415541564157415841594160416141624163416441654166416741684169417041714172417341744175417641774178417941804181418241834184418541864187418841894190419141924193419441954196419741984199420042014202420342044205420642074208420942104211421242134214421542164217421842194220422142224223422442254226422742284229423042314232423342344235423642374238423942404241424242434244424542464247424842494250425142524253425442554256425742584259426042614262426342644265426642674268426942704271427242734274427542764277427842794280428142824283428442854286428742884289429042914292429342944295429642974298429943004301430243034304430543064307430843094310431143124313431443154316431743184319432043214322432343244325432643274328432943304331433243334334433543364337433843394340434143424343434443454346434743484349435043514352435343544355435643574358435943604361436243634364436543664367436843694370437143724373437443754376437743784379438043814382438343844385438643874388438943904391439243934394439543964397439843994400440144024403440444054406440744084409441044114412441344144415441644174418441944204421442244234424442544264427442844294430443144324433443444354436443744384439444044414442444344444445444644474448444944504451445244534454445544564457445844594460446144624463446444654466446744684469447044714472447344744475447644774478447944804481448244834484448544864487448844894490449144924493449444954496449744984499450045014502450345044505450645074508450945104511451245134514451545164517451845194520452145224523452445254526452745284529453045314532453345344535453645374538453945404541454245434544454545464547454845494550455145524553455445554556455745584559456045614562456345644565456645674568456945704571457245734574457545764577457845794580458145824583458445854586458745884589459045914592459345944595459645974598459946004601460246034604460546064607460846094610461146124613461446154616461746184619462046214622462346244625462646274628462946304631463246334634463546364637463846394640464146424643464446454646464746484649465046514652465346544655465646574658465946604661466246634664466546664667466846694670467146724673467446754676467746784679468046814682468346844685468646874688468946904691469246934694469546964697469846994700470147024703470447054706470747084709471047114712471347144715471647174718471947204721472247234724472547264727472847294730473147324733473447354736473747384739474047414742474347444745474647474748474947504751475247534754475547564757475847594760476147624763476447654766476747684769477047714772477347744775477647774778477947804781478247834784478547864787478847894790479147924793479447954796479747984799480048014802480348044805480648074808480948104811481248134814481548164817481848194820482148224823482448254826482748284829483048314832483348344835483648374838483948404841484248434844484548464847484848494850485148524853485448554856485748584859486048614862486348644865486648674868486948704871487248734874487548764877487848794880488148824883488448854886488748884889489048914892489348944895489648974898489949004901490249034904490549064907490849094910491149124913491449154916491749184919492049214922492349244925492649274928492949304931493249334934493549364937493849394940494149424943494449454946494749484949495049514952495349544955495649574958495949604961496249634964496549664967496849694970497149724973497449754976497749784979498049814982498349844985498649874988498949904991499249934994499549964997499849995000500150025003500450055006500750085009501050115012501350145015501650175018501950205021502250235024502550265027502850295030503150325033503450355036503750385039504050415042504350445045504650475048504950505051505250535054505550565057505850595060506150625063506450655066506750685069507050715072507350745075507650775078507950805081508250835084508550865087508850895090509150925093509450955096509750985099510051015102510351045105510651075108510951105111511251135114511551165117511851195120512151225123512451255126512751285129513051315132513351345135513651375138513951405141514251435144514551465147514851495150515151525153515451555156515751585159516051615162516351645165516651675168516951705171517251735174517551765177517851795180518151825183518451855186518751885189519051915192519351945195519651975198519952005201520252035204520552065207520852095210521152125213521452155216521752185219522052215222522352245225522652275228522952305231523252335234523552365237523852395240524152425243524452455246524752485249525052515252525352545255525652575258525952605261526252635264526552665267526852695270527152725273527452755276527752785279528052815282528352845285528652875288528952905291529252935294529552965297529852995300530153025303530453055306530753085309531053115312531353145315531653175318531953205321532253235324532553265327532853295330533153325333533453355336533753385339534053415342534353445345534653475348534953505351535253535354535553565357535853595360536153625363536453655366536753685369537053715372537353745375537653775378537953805381538253835384538553865387538853895390539153925393539453955396539753985399540054015402540354045405540654075408540954105411541254135414541554165417541854195420542154225423542454255426542754285429543054315432543354345435543654375438543954405441544254435444544554465447544854495450545154525453545454555456545754585459546054615462546354645465546654675468546954705471547254735474547554765477547854795480548154825483548454855486548754885489549054915492549354945495549654975498549955005501550255035504550555065507550855095510551155125513551455155516551755185519552055215522552355245525552655275528552955305531553255335534553555365537553855395540554155425543554455455546554755485549555055515552555355545555555655575558555955605561556255635564556555665567556855695570557155725573557455755576557755785579558055815582558355845585558655875588558955905591559255935594559555965597559855995600560156025603560456055606560756085609561056115612561356145615561656175618561956205621562256235624562556265627562856295630563156325633563456355636563756385639564056415642564356445645564656475648564956505651565256535654565556565657565856595660566156625663566456655666566756685669567056715672567356745675567656775678567956805681568256835684568556865687568856895690569156925693569456955696569756985699570057015702570357045705570657075708570957105711571257135714571557165717571857195720572157225723572457255726572757285729573057315732573357345735573657375738573957405741574257435744574557465747574857495750575157525753575457555756575757585759576057615762576357645765576657675768576957705771577257735774577557765777577857795780578157825783578457855786578757885789579057915792579357945795579657975798579958005801580258035804580558065807580858095810581158125813581458155816581758185819582058215822582358245825582658275828582958305831583258335834583558365837583858395840584158425843584458455846584758485849585058515852585358545855585658575858585958605861586258635864586558665867586858695870587158725873587458755876587758785879588058815882588358845885588658875888588958905891589258935894589558965897589858995900590159025903590459055906590759085909591059115912591359145915591659175918591959205921592259235924592559265927592859295930593159325933593459355936593759385939594059415942594359445945594659475948594959505951595259535954595559565957595859595960596159625963596459655966596759685969597059715972597359745975597659775978597959805981598259835984598559865987598859895990599159925993599459955996599759985999600060016002600360046005600660076008600960106011601260136014601560166017601860196020602160226023602460256026602760286029603060316032603360346035603660376038603960406041604260436044604560466047604860496050605160526053605460556056605760586059606060616062606360646065606660676068606960706071607260736074607560766077607860796080608160826083608460856086608760886089609060916092609360946095609660976098609961006101610261036104610561066107610861096110611161126113611461156116611761186119612061216122612361246125612661276128612961306131613261336134613561366137613861396140614161426143614461456146614761486149615061516152615361546155615661576158615961606161616261636164616561666167616861696170617161726173617461756176617761786179618061816182618361846185618661876188618961906191619261936194619561966197619861996200620162026203620462056206620762086209621062116212621362146215621662176218621962206221622262236224622562266227622862296230623162326233623462356236623762386239624062416242624362446245624662476248624962506251625262536254625562566257625862596260626162626263626462656266626762686269627062716272627362746275627662776278627962806281628262836284628562866287628862896290629162926293629462956296629762986299630063016302630363046305630663076308630963106311631263136314631563166317631863196320632163226323632463256326632763286329633063316332633363346335633663376338633963406341634263436344634563466347634863496350635163526353635463556356635763586359636063616362636363646365636663676368636963706371637263736374637563766377637863796380638163826383638463856386638763886389639063916392639363946395639663976398639964006401640264036404640564066407640864096410641164126413641464156416641764186419642064216422642364246425642664276428642964306431643264336434643564366437643864396440644164426443644464456446644764486449645064516452645364546455645664576458645964606461646264636464646564666467646864696470647164726473647464756476647764786479648064816482648364846485648664876488648964906491649264936494649564966497649864996500650165026503650465056506650765086509651065116512651365146515651665176518651965206521652265236524652565266527652865296530653165326533653465356536653765386539654065416542654365446545654665476548654965506551655265536554655565566557655865596560656165626563656465656566656765686569657065716572657365746575657665776578657965806581658265836584658565866587658865896590659165926593659465956596659765986599660066016602660366046605660666076608660966106611661266136614661566166617661866196620662166226623662466256626662766286629663066316632663366346635663666376638663966406641664266436644664566466647664866496650665166526653665466556656665766586659666066616662666366646665666666676668666966706671667266736674667566766677667866796680668166826683668466856686668766886689669066916692669366946695669666976698669967006701670267036704670567066707670867096710671167126713671467156716671767186719672067216722672367246725672667276728672967306731673267336734673567366737673867396740674167426743674467456746674767486749675067516752675367546755675667576758675967606761676267636764676567666767676867696770677167726773677467756776677767786779678067816782678367846785678667876788678967906791679267936794679567966797679867996800680168026803680468056806680768086809681068116812681368146815681668176818681968206821682268236824682568266827682868296830683168326833683468356836683768386839684068416842684368446845684668476848684968506851685268536854685568566857685868596860686168626863686468656866686768686869687068716872687368746875687668776878687968806881688268836884688568866887688868896890689168926893689468956896689768986899690069016902690369046905690669076908690969106911691269136914691569166917691869196920692169226923692469256926692769286929693069316932693369346935693669376938693969406941694269436944694569466947694869496950695169526953695469556956695769586959696069616962696369646965696669676968696969706971697269736974697569766977697869796980698169826983698469856986698769886989699069916992699369946995699669976998699970007001700270037004700570067007700870097010701170127013701470157016701770187019702070217022702370247025702670277028702970307031703270337034703570367037703870397040704170427043704470457046704770487049705070517052705370547055705670577058705970607061706270637064706570667067706870697070707170727073707470757076707770787079708070817082708370847085708670877088708970907091709270937094709570967097709870997100710171027103710471057106710771087109711071117112711371147115711671177118711971207121712271237124712571267127712871297130713171327133713471357136713771387139714071417142714371447145714671477148714971507151715271537154715571567157715871597160716171627163716471657166716771687169717071717172717371747175717671777178717971807181718271837184718571867187718871897190719171927193719471957196719771987199720072017202720372047205720672077208720972107211721272137214721572167217721872197220722172227223722472257226722772287229723072317232723372347235723672377238723972407241724272437244724572467247724872497250725172527253725472557256725772587259726072617262726372647265726672677268726972707271727272737274727572767277727872797280728172827283728472857286728772887289729072917292729372947295729672977298729973007301730273037304730573067307730873097310731173127313731473157316731773187319732073217322732373247325732673277328732973307331733273337334733573367337733873397340734173427343734473457346734773487349735073517352735373547355735673577358735973607361736273637364736573667367736873697370737173727373737473757376737773787379738073817382738373847385738673877388738973907391739273937394739573967397739873997400740174027403740474057406740774087409741074117412741374147415741674177418741974207421742274237424742574267427742874297430743174327433743474357436743774387439744074417442744374447445744674477448744974507451745274537454745574567457745874597460746174627463746474657466746774687469747074717472747374747475747674777478747974807481748274837484748574867487748874897490749174927493749474957496749774987499750075017502750375047505750675077508750975107511751275137514751575167517751875197520752175227523752475257526752775287529753075317532753375347535753675377538753975407541754275437544754575467547754875497550755175527553755475557556755775587559756075617562756375647565756675677568756975707571757275737574757575767577757875797580758175827583758475857586758775887589759075917592759375947595759675977598759976007601760276037604760576067607760876097610761176127613761476157616761776187619762076217622762376247625762676277628762976307631763276337634763576367637763876397640764176427643764476457646764776487649765076517652765376547655765676577658765976607661766276637664766576667667766876697670767176727673767476757676767776787679768076817682768376847685768676877688768976907691769276937694769576967697769876997700770177027703770477057706770777087709771077117712771377147715771677177718771977207721772277237724772577267727772877297730773177327733773477357736773777387739774077417742774377447745774677477748774977507751775277537754775577567757775877597760776177627763776477657766776777687769777077717772777377747775777677777778777977807781778277837784778577867787778877897790779177927793779477957796779777987799780078017802780378047805780678077808780978107811781278137814781578167817781878197820782178227823782478257826782778287829783078317832783378347835783678377838783978407841784278437844784578467847784878497850785178527853785478557856785778587859786078617862786378647865786678677868786978707871787278737874787578767877787878797880788178827883788478857886788778887889789078917892789378947895789678977898789979007901790279037904790579067907790879097910791179127913791479157916791779187919792079217922792379247925792679277928792979307931793279337934793579367937793879397940794179427943794479457946794779487949795079517952795379547955795679577958795979607961796279637964796579667967796879697970797179727973797479757976797779787979798079817982798379847985798679877988798979907991799279937994799579967997799879998000800180028003800480058006800780088009801080118012801380148015801680178018801980208021802280238024802580268027802880298030803180328033803480358036803780388039804080418042804380448045804680478048804980508051805280538054805580568057805880598060806180628063806480658066806780688069807080718072807380748075807680778078807980808081808280838084808580868087808880898090809180928093809480958096809780988099810081018102810381048105810681078108810981108111811281138114811581168117811881198120812181228123812481258126812781288129813081318132813381348135813681378138813981408141814281438144814581468147814881498150815181528153815481558156815781588159816081618162816381648165816681678168816981708171817281738174817581768177817881798180818181828183818481858186818781888189819081918192819381948195819681978198819982008201820282038204820582068207820882098210821182128213821482158216821782188219822082218222822382248225822682278228822982308231823282338234823582368237823882398240824182428243824482458246824782488249825082518252825382548255825682578258825982608261826282638264826582668267826882698270827182728273827482758276827782788279828082818282828382848285828682878288828982908291829282938294829582968297829882998300830183028303830483058306830783088309831083118312831383148315831683178318831983208321832283238324832583268327832883298330833183328333833483358336833783388339834083418342834383448345834683478348834983508351835283538354835583568357835883598360836183628363836483658366836783688369837083718372837383748375837683778378837983808381838283838384838583868387838883898390839183928393839483958396839783988399840084018402840384048405840684078408840984108411841284138414841584168417841884198420842184228423842484258426842784288429843084318432843384348435843684378438843984408441844284438444844584468447844884498450845184528453845484558456845784588459846084618462846384648465846684678468846984708471847284738474847584768477847884798480848184828483848484858486848784888489849084918492849384948495849684978498849985008501850285038504850585068507850885098510851185128513851485158516851785188519852085218522852385248525852685278528852985308531853285338534853585368537853885398540854185428543854485458546854785488549855085518552855385548555855685578558855985608561856285638564856585668567856885698570857185728573857485758576857785788579858085818582858385848585858685878588858985908591859285938594859585968597859885998600860186028603860486058606860786088609861086118612861386148615861686178618861986208621862286238624862586268627862886298630863186328633863486358636863786388639864086418642864386448645864686478648864986508651865286538654865586568657865886598660866186628663866486658666866786688669867086718672867386748675867686778678867986808681868286838684868586868687868886898690869186928693869486958696869786988699870087018702870387048705870687078708870987108711871287138714871587168717871887198720872187228723872487258726872787288729873087318732873387348735873687378738873987408741874287438744874587468747874887498750875187528753875487558756875787588759876087618762876387648765876687678768876987708771877287738774877587768777877887798780878187828783878487858786878787888789879087918792879387948795879687978798879988008801880288038804880588068807880888098810881188128813881488158816881788188819882088218822882388248825882688278828882988308831883288338834883588368837883888398840884188428843884488458846884788488849885088518852885388548855885688578858885988608861886288638864886588668867886888698870887188728873887488758876887788788879888088818882888388848885888688878888888988908891889288938894889588968897889888998900890189028903890489058906890789088909891089118912891389148915891689178918891989208921892289238924892589268927892889298930893189328933893489358936893789388939894089418942894389448945894689478948894989508951895289538954895589568957895889598960896189628963896489658966896789688969897089718972897389748975897689778978897989808981898289838984898589868987898889898990899189928993899489958996899789988999900090019002900390049005900690079008900990109011901290139014901590169017901890199020902190229023902490259026902790289029903090319032903390349035903690379038903990409041904290439044904590469047904890499050905190529053905490559056905790589059906090619062906390649065906690679068906990709071907290739074907590769077907890799080908190829083908490859086908790889089909090919092909390949095909690979098909991009101910291039104910591069107910891099110911191129113911491159116911791189119912091219122912391249125912691279128912991309131913291339134913591369137913891399140914191429143914491459146914791489149915091519152915391549155915691579158915991609161916291639164916591669167916891699170917191729173917491759176917791789179918091819182918391849185918691879188918991909191919291939194919591969197919891999200920192029203920492059206920792089209921092119212921392149215921692179218921992209221922292239224922592269227922892299230923192329233923492359236923792389239924092419242924392449245924692479248924992509251925292539254925592569257925892599260926192629263926492659266926792689269927092719272927392749275927692779278927992809281928292839284928592869287928892899290929192929293929492959296929792989299930093019302930393049305930693079308930993109311931293139314931593169317931893199320932193229323932493259326932793289329933093319332933393349335933693379338933993409341934293439344934593469347934893499350935193529353935493559356935793589359936093619362936393649365936693679368936993709371937293739374937593769377937893799380938193829383938493859386938793889389939093919392939393949395939693979398939994009401940294039404940594069407940894099410941194129413941494159416941794189419942094219422942394249425942694279428942994309431943294339434943594369437943894399440944194429443944494459446944794489449945094519452945394549455945694579458945994609461946294639464946594669467946894699470947194729473947494759476947794789479948094819482948394849485948694879488948994909491949294939494949594969497949894999500950195029503950495059506950795089509951095119512951395149515951695179518951995209521952295239524952595269527952895299530953195329533953495359536953795389539954095419542954395449545954695479548954995509551955295539554955595569557955895599560956195629563956495659566956795689569957095719572957395749575957695779578957995809581958295839584958595869587958895899590959195929593959495959596959795989599960096019602960396049605960696079608960996109611961296139614961596169617961896199620962196229623962496259626962796289629963096319632963396349635963696379638963996409641964296439644964596469647964896499650965196529653965496559656965796589659966096619662966396649665966696679668966996709671967296739674967596769677967896799680968196829683968496859686968796889689969096919692969396949695969696979698969997009701970297039704970597069707970897099710971197129713971497159716971797189719972097219722972397249725972697279728972997309731973297339734973597369737973897399740974197429743974497459746974797489749975097519752975397549755975697579758975997609761976297639764976597669767976897699770977197729773977497759776977797789779978097819782978397849785978697879788978997909791979297939794979597969797979897999800980198029803980498059806980798089809981098119812981398149815981698179818981998209821982298239824982598269827982898299830983198329833983498359836983798389839984098419842984398449845984698479848984998509851985298539854985598569857985898599860986198629863986498659866986798689869987098719872987398749875987698779878987998809881988298839884988598869887988898899890989198929893989498959896989798989899990099019902990399049905990699079908990999109911991299139914991599169917991899199920992199229923992499259926992799289929993099319932993399349935993699379938993999409941994299439944994599469947994899499950995199529953995499559956995799589959996099619962996399649965996699679968996999709971997299739974997599769977997899799980998199829983998499859986998799889989999099919992999399949995999699979998999910000100011000210003100041000510006100071000810009100101001110012100131001410015100161001710018100191002010021100221002310024100251002610027100281002910030100311003210033100341003510036100371003810039100401004110042100431004410045100461004710048100491005010051100521005310054100551005610057100581005910060100611006210063100641006510066100671006810069100701007110072100731007410075100761007710078100791008010081100821008310084100851008610087100881008910090100911009210093100941009510096100971009810099101001010110102101031010410105101061010710108101091011010111101121011310114101151011610117101181011910120101211012210123101241012510126101271012810129101301013110132101331013410135101361013710138101391014010141101421014310144101451014610147101481014910150101511015210153101541015510156101571015810159101601016110162101631016410165101661016710168101691017010171101721017310174101751017610177101781017910180101811018210183101841018510186101871018810189101901019110192101931019410195101961019710198101991020010201102021020310204102051020610207102081020910210102111021210213102141021510216102171021810219102201022110222102231022410225102261022710228102291023010231102321023310234102351023610237102381023910240102411024210243102441024510246102471024810249102501025110252102531025410255102561025710258102591026010261102621026310264102651026610267102681026910270102711027210273102741027510276102771027810279102801028110282102831028410285102861028710288102891029010291102921029310294102951029610297102981029910300103011030210303103041030510306103071030810309103101031110312103131031410315103161031710318103191032010321103221032310324103251032610327103281032910330103311033210333103341033510336103371033810339103401034110342103431034410345103461034710348103491035010351103521035310354103551035610357103581035910360103611036210363103641036510366103671036810369103701037110372103731037410375103761037710378103791038010381103821038310384103851038610387103881038910390103911039210393103941039510396103971039810399104001040110402104031040410405104061040710408104091041010411104121041310414104151041610417104181041910420104211042210423104241042510426104271042810429104301043110432104331043410435104361043710438104391044010441104421044310444104451044610447104481044910450104511045210453104541045510456104571045810459104601046110462104631046410465104661046710468104691047010471104721047310474104751047610477104781047910480104811048210483104841048510486104871048810489104901049110492104931049410495104961049710498104991050010501105021050310504105051050610507105081050910510105111051210513105141051510516105171051810519105201052110522105231052410525105261052710528105291053010531105321053310534105351053610537105381053910540105411054210543105441054510546105471054810549105501055110552105531055410555105561055710558105591056010561105621056310564105651056610567105681056910570105711057210573105741057510576105771057810579105801058110582105831058410585105861058710588105891059010591105921059310594105951059610597105981059910600106011060210603106041060510606106071060810609106101061110612106131061410615106161061710618106191062010621106221062310624106251062610627106281062910630106311063210633106341063510636106371063810639106401064110642106431064410645106461064710648106491065010651106521065310654106551065610657106581065910660106611066210663106641066510666106671066810669106701067110672106731067410675106761067710678106791068010681106821068310684106851068610687106881068910690106911069210693106941069510696106971069810699107001070110702107031070410705107061070710708107091071010711107121071310714107151071610717107181071910720107211072210723107241072510726107271072810729107301073110732107331073410735107361073710738107391074010741107421074310744107451074610747107481074910750107511075210753107541075510756107571075810759107601076110762107631076410765107661076710768107691077010771107721077310774107751077610777107781077910780107811078210783107841078510786107871078810789107901079110792107931079410795107961079710798107991080010801108021080310804108051080610807108081080910810108111081210813108141081510816108171081810819108201082110822108231082410825108261082710828108291083010831108321083310834108351083610837108381083910840108411084210843108441084510846108471084810849108501085110852108531085410855108561085710858108591086010861108621086310864108651086610867108681086910870108711087210873108741087510876108771087810879108801088110882108831088410885108861088710888108891089010891108921089310894108951089610897108981089910900109011090210903109041090510906109071090810909109101091110912109131091410915109161091710918109191092010921109221092310924109251092610927109281092910930109311093210933109341093510936109371093810939109401094110942109431094410945109461094710948109491095010951109521095310954109551095610957109581095910960109611096210963109641096510966109671096810969109701097110972109731097410975109761097710978109791098010981109821098310984109851098610987109881098910990109911099210993109941099510996109971099810999110001100111002110031100411005110061100711008110091101011011110121101311014110151101611017110181101911020110211102211023110241102511026110271102811029110301103111032110331103411035110361103711038110391104011041110421104311044110451104611047110481104911050110511105211053110541105511056110571105811059110601106111062110631106411065110661106711068110691107011071110721107311074110751107611077110781107911080110811108211083110841108511086110871108811089110901109111092110931109411095110961109711098110991110011101111021110311104111051110611107111081110911110111111111211113111141111511116111171111811119111201112111122111231112411125111261112711128111291113011131111321113311134111351113611137111381113911140111411114211143111441114511146111471114811149111501115111152111531115411155111561115711158111591116011161111621116311164111651116611167111681116911170111711117211173111741117511176111771117811179111801118111182111831118411185111861118711188111891119011191111921119311194111951119611197111981119911200112011120211203112041120511206112071120811209112101121111212112131121411215112161121711218112191122011221112221122311224112251122611227112281122911230112311123211233112341123511236112371123811239112401124111242112431124411245112461124711248112491125011251112521125311254112551125611257112581125911260112611126211263112641126511266112671126811269112701127111272112731127411275112761127711278112791128011281112821128311284112851128611287112881128911290112911129211293112941129511296112971129811299113001130111302113031130411305113061130711308113091131011311113121131311314113151131611317113181131911320113211132211323113241132511326113271132811329113301133111332113331133411335113361133711338113391134011341113421134311344113451134611347113481134911350113511135211353113541135511356113571135811359113601136111362113631136411365113661136711368113691137011371113721137311374113751137611377113781137911380113811138211383113841138511386113871138811389113901139111392113931139411395113961139711398113991140011401114021140311404114051140611407114081140911410114111141211413114141141511416114171141811419114201142111422114231142411425114261142711428114291143011431114321143311434114351143611437114381143911440114411144211443114441144511446114471144811449114501145111452114531145411455114561145711458114591146011461114621146311464114651146611467114681146911470114711147211473114741147511476114771147811479114801148111482114831148411485114861148711488114891149011491114921149311494114951149611497114981149911500115011150211503115041150511506115071150811509115101151111512115131151411515115161151711518115191152011521115221152311524115251152611527115281152911530115311153211533115341153511536115371153811539115401154111542115431154411545115461154711548115491155011551115521155311554115551155611557115581155911560115611156211563115641156511566115671156811569115701157111572115731157411575115761157711578115791158011581115821158311584115851158611587115881158911590115911159211593115941159511596115971159811599116001160111602116031160411605116061160711608116091161011611116121161311614116151161611617116181161911620116211162211623116241162511626116271162811629116301163111632116331163411635116361163711638116391164011641116421164311644116451164611647116481164911650116511165211653116541165511656116571165811659116601166111662116631166411665116661166711668116691167011671116721167311674116751167611677116781167911680116811168211683116841168511686116871168811689116901169111692116931169411695116961169711698116991170011701117021170311704117051170611707117081170911710117111171211713117141171511716117171171811719117201172111722117231172411725117261172711728117291173011731117321173311734117351173611737117381173911740117411174211743117441174511746117471174811749117501175111752117531175411755117561175711758117591176011761117621176311764117651176611767117681176911770117711177211773117741177511776117771177811779117801178111782117831178411785117861178711788117891179011791117921179311794117951179611797117981179911800118011180211803118041180511806118071180811809118101181111812118131181411815118161181711818118191182011821118221182311824118251182611827118281182911830118311183211833118341183511836118371183811839118401184111842118431184411845118461184711848118491185011851118521185311854118551185611857118581185911860118611186211863118641186511866118671186811869118701187111872118731187411875118761187711878118791188011881118821188311884118851188611887118881188911890118911189211893118941189511896118971189811899119001190111902119031190411905119061190711908119091191011911119121191311914119151191611917119181191911920119211192211923119241192511926119271192811929119301193111932119331193411935119361193711938119391194011941119421194311944119451194611947119481194911950119511195211953119541195511956119571195811959119601196111962119631196411965119661196711968119691197011971119721197311974119751197611977119781197911980119811198211983119841198511986119871198811989119901199111992119931199411995119961199711998119991200012001120021200312004120051200612007120081200912010120111201212013120141201512016120171201812019120201202112022120231202412025120261202712028120291203012031120321203312034120351203612037120381203912040120411204212043120441204512046120471204812049120501205112052120531205412055120561205712058120591206012061120621206312064120651206612067120681206912070120711207212073120741207512076120771207812079120801208112082120831208412085120861208712088120891209012091120921209312094120951209612097120981209912100121011210212103121041210512106121071210812109121101211112112121131211412115121161211712118121191212012121121221212312124121251212612127121281212912130121311213212133121341213512136121371213812139121401214112142121431214412145121461214712148121491215012151121521215312154121551215612157121581215912160121611216212163121641216512166121671216812169121701217112172121731217412175121761217712178121791218012181121821218312184121851218612187121881218912190121911219212193121941219512196121971219812199122001220112202122031220412205122061220712208122091221012211122121221312214122151221612217122181221912220122211222212223122241222512226122271222812229122301223112232122331223412235122361223712238122391224012241122421224312244122451224612247122481224912250122511225212253122541225512256122571225812259122601226112262122631226412265122661226712268122691227012271122721227312274122751227612277122781227912280122811228212283122841228512286122871228812289122901229112292122931229412295122961229712298122991230012301123021230312304123051230612307123081230912310123111231212313123141231512316123171231812319123201232112322123231232412325123261232712328123291233012331123321233312334123351233612337123381233912340123411234212343123441234512346123471234812349123501235112352123531235412355123561235712358123591236012361123621236312364123651236612367123681236912370123711237212373123741237512376123771237812379123801238112382123831238412385123861238712388123891239012391123921239312394123951239612397123981239912400124011240212403124041240512406124071240812409124101241112412124131241412415124161241712418124191242012421124221242312424124251242612427124281242912430124311243212433124341243512436124371243812439124401244112442124431244412445124461244712448124491245012451124521245312454124551245612457124581245912460124611246212463124641246512466124671246812469124701247112472124731247412475124761247712478124791248012481124821248312484124851248612487124881248912490124911249212493124941249512496124971249812499125001250112502125031250412505125061250712508125091251012511125121251312514125151251612517125181251912520125211252212523125241252512526125271252812529125301253112532125331253412535125361253712538125391254012541125421254312544125451254612547125481254912550125511255212553125541255512556125571255812559125601256112562125631256412565125661256712568125691257012571125721257312574125751257612577125781257912580125811258212583125841258512586125871258812589125901259112592125931259412595125961259712598125991260012601126021260312604126051260612607126081260912610126111261212613126141261512616126171261812619126201262112622126231262412625126261262712628126291263012631126321263312634126351263612637126381263912640126411264212643126441264512646126471264812649126501265112652126531265412655126561265712658126591266012661126621266312664126651266612667126681266912670126711267212673126741267512676126771267812679126801268112682126831268412685126861268712688126891269012691126921269312694126951269612697126981269912700127011270212703127041270512706127071270812709127101271112712127131271412715127161271712718127191272012721127221272312724127251272612727127281272912730
  1. #include "llama-model.h"
  2. #include "llama-impl.h"
  3. #include "llama-mmap.h"
  4. #include "llama-batch.h"
  5. #include "llama-cparams.h"
  6. #include "llama-model-loader.h"
  7. #include "llama-kv-cache.h"
  8. #include "ggml-cpp.h"
  9. #include <algorithm>
  10. #include <cassert>
  11. #include <cmath>
  12. #include <cfloat>
  13. #include <cstring>
  14. #include <cmath>
  15. #include <functional>
  16. #include <map>
  17. #include <regex>
  18. #include <sstream>
  19. #include <stdexcept>
  20. const char * llm_type_name(llm_type type) {
  21. switch (type) {
  22. case LLM_TYPE_14M: return "14M";
  23. case LLM_TYPE_17M: return "17M";
  24. case LLM_TYPE_22M: return "22M";
  25. case LLM_TYPE_33M: return "33M";
  26. case LLM_TYPE_60M: return "60M";
  27. case LLM_TYPE_70M: return "70M";
  28. case LLM_TYPE_80M: return "80M";
  29. case LLM_TYPE_109M: return "109M";
  30. case LLM_TYPE_137M: return "137M";
  31. case LLM_TYPE_160M: return "160M";
  32. case LLM_TYPE_190M: return "190M";
  33. case LLM_TYPE_220M: return "220M";
  34. case LLM_TYPE_250M: return "250M";
  35. case LLM_TYPE_270M: return "270M";
  36. case LLM_TYPE_335M: return "335M";
  37. case LLM_TYPE_410M: return "410M";
  38. case LLM_TYPE_450M: return "450M";
  39. case LLM_TYPE_770M: return "770M";
  40. case LLM_TYPE_780M: return "780M";
  41. case LLM_TYPE_0_5B: return "0.5B";
  42. case LLM_TYPE_1B: return "1B";
  43. case LLM_TYPE_1_3B: return "1.3B";
  44. case LLM_TYPE_1_4B: return "1.4B";
  45. case LLM_TYPE_1_5B: return "1.5B";
  46. case LLM_TYPE_1_6B: return "1.6B";
  47. case LLM_TYPE_1_8B: return "1.8B";
  48. case LLM_TYPE_2B: return "2B";
  49. case LLM_TYPE_2_8B: return "2.8B";
  50. case LLM_TYPE_2_9B: return "2.9B";
  51. case LLM_TYPE_3B: return "3B";
  52. case LLM_TYPE_4B: return "4B";
  53. case LLM_TYPE_6B: return "6B";
  54. case LLM_TYPE_6_9B: return "6.9B";
  55. case LLM_TYPE_7B: return "7B";
  56. case LLM_TYPE_8B: return "8B";
  57. case LLM_TYPE_9B: return "9B";
  58. case LLM_TYPE_11B: return "11B";
  59. case LLM_TYPE_12B: return "12B";
  60. case LLM_TYPE_13B: return "13B";
  61. case LLM_TYPE_14B: return "14B";
  62. case LLM_TYPE_15B: return "15B";
  63. case LLM_TYPE_16B: return "16B";
  64. case LLM_TYPE_20B: return "20B";
  65. case LLM_TYPE_30B: return "30B";
  66. case LLM_TYPE_32B: return "32B";
  67. case LLM_TYPE_34B: return "34B";
  68. case LLM_TYPE_35B: return "35B";
  69. case LLM_TYPE_40B: return "40B";
  70. case LLM_TYPE_65B: return "65B";
  71. case LLM_TYPE_70B: return "70B";
  72. case LLM_TYPE_236B: return "236B";
  73. case LLM_TYPE_314B: return "314B";
  74. case LLM_TYPE_671B: return "671B";
  75. case LLM_TYPE_SMALL: return "0.1B";
  76. case LLM_TYPE_MEDIUM: return "0.4B";
  77. case LLM_TYPE_LARGE: return "0.8B";
  78. case LLM_TYPE_XL: return "1.5B";
  79. case LLM_TYPE_A1_7B: return "A1.7B";
  80. case LLM_TYPE_A2_7B: return "A2.7B";
  81. case LLM_TYPE_8x7B: return "8x7B";
  82. case LLM_TYPE_8x22B: return "8x22B";
  83. case LLM_TYPE_16x12B: return "16x12B";
  84. case LLM_TYPE_16x3_8B: return "16x3.8B";
  85. case LLM_TYPE_10B_128x3_66B: return "10B+128x3.66B";
  86. case LLM_TYPE_57B_A14B: return "57B.A14B";
  87. case LLM_TYPE_27B: return "27B";
  88. case LLM_TYPE_290B: return "290B";
  89. case LLM_TYPE_17B_16E: return "17Bx16E (Scout)";
  90. case LLM_TYPE_17B_128E: return "17Bx128E (Maverick)";
  91. default: return "?B";
  92. }
  93. }
  94. static const char * llama_expert_gating_func_name(llama_expert_gating_func_type type) {
  95. switch (type) {
  96. case LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX: return "softmax";
  97. case LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID: return "sigmoid";
  98. default: return "unknown";
  99. }
  100. }
  101. static const std::map<llama_rope_scaling_type, const char *> LLAMA_ROPE_SCALING_TYPES = {
  102. { LLAMA_ROPE_SCALING_TYPE_NONE, "none" },
  103. { LLAMA_ROPE_SCALING_TYPE_LINEAR, "linear" },
  104. { LLAMA_ROPE_SCALING_TYPE_YARN, "yarn" },
  105. { LLAMA_ROPE_SCALING_TYPE_LONGROPE, "longrope" },
  106. };
  107. static llama_rope_scaling_type llama_rope_scaling_type_from_string(const std::string & name) {
  108. for (const auto & kv : LLAMA_ROPE_SCALING_TYPES) {
  109. if (kv.second == name) {
  110. return (llama_rope_scaling_type) kv.first;
  111. }
  112. }
  113. return LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
  114. }
  115. // checks if the weight tensor can be used with the specified buffer type and device
  116. static bool weight_buft_supported(const llama_hparams & hparams, ggml_tensor * w, ggml_op op, ggml_backend_buffer_type_t buft, ggml_backend_dev_t dev) {
  117. GGML_ASSERT(w != nullptr);
  118. if (op == GGML_OP_NONE) {
  119. return true;
  120. }
  121. ggml_init_params params = {
  122. /*.mem_size =*/ ggml_tensor_overhead()*8,
  123. /*.mem_buffer =*/ NULL,
  124. /*.no_alloc =*/ true,
  125. };
  126. ggml_context_ptr ctx_ptr { ggml_init(params) };
  127. if (!ctx_ptr) {
  128. throw std::runtime_error(format("failed to create ggml context"));
  129. }
  130. ggml_context * ctx = ctx_ptr.get();
  131. ggml_tensor * op_tensor = nullptr;
  132. switch (op) {
  133. case GGML_OP_GET_ROWS:
  134. {
  135. ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 512);
  136. op_tensor = ggml_get_rows(ctx, w, b);
  137. } break;
  138. case GGML_OP_MUL_MAT:
  139. {
  140. ggml_tensor * b = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], 512, w->ne[2], w->ne[3]);
  141. op_tensor = ggml_mul_mat(ctx, w, b);
  142. } break;
  143. case GGML_OP_MUL_MAT_ID:
  144. {
  145. int n_expert_used = hparams.n_expert_used;
  146. ggml_tensor * b = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, w->ne[0], n_expert_used, 512);
  147. ggml_tensor * ids = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, n_expert_used, 512);
  148. op_tensor = ggml_mul_mat_id(ctx, w, b, ids);
  149. } break;
  150. case GGML_OP_ADD:
  151. {
  152. ggml_tensor * a = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], w->ne[1], w->ne[2], w->ne[3]);
  153. op_tensor = ggml_add(ctx, a, w);
  154. } break;
  155. case GGML_OP_MUL:
  156. {
  157. ggml_tensor * a = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], w->ne[1], w->ne[2], w->ne[3]);
  158. op_tensor = ggml_mul(ctx, a, w);
  159. } break;
  160. case GGML_OP_DIV:
  161. {
  162. ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, w->ne[0]);
  163. op_tensor = ggml_div(ctx, a, w);
  164. } break;
  165. case GGML_OP_ROPE:
  166. {
  167. int n_embd_head = hparams.n_embd_head_v;
  168. int n_head = hparams.n_head();
  169. ggml_tensor * a = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_embd_head, n_head, 512);
  170. ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 512);
  171. op_tensor = ggml_rope_ext(
  172. ctx, a, b, w,
  173. 0, 0, 0, 0, 0,
  174. 0, 0, 0, 0
  175. );
  176. } break;
  177. case GGML_OP_SSM_CONV:
  178. {
  179. // FIXME
  180. ggml_tensor * conv_x = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 12345, w->ne[1], 6789);
  181. op_tensor = ggml_ssm_conv(ctx, conv_x, w);
  182. } break;
  183. case GGML_OP_SSM_SCAN:
  184. {
  185. // FIXME
  186. const int64_t d_state = w->ne[0];
  187. const int64_t d_inner = w->ne[1];
  188. const int64_t n_seq_tokens = 512;
  189. const int64_t n_seqs = 1;
  190. ggml_tensor * s = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, d_state, d_inner, n_seqs);
  191. ggml_tensor * x = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, d_inner, n_seq_tokens, n_seqs);
  192. ggml_tensor * dt = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, d_inner, n_seq_tokens, n_seqs);
  193. ggml_tensor * B = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, d_state, n_seq_tokens, n_seqs);
  194. ggml_tensor * C = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, d_state, n_seq_tokens, n_seqs);
  195. op_tensor = ggml_ssm_scan(ctx, s, x, dt, w, B, C);
  196. } break;
  197. case GGML_OP_RWKV_WKV6:
  198. {
  199. // FIXME
  200. const int64_t S = 123;
  201. const int64_t H = 123;
  202. const int64_t n_tokens = 123;
  203. const int64_t n_seqs = 123;
  204. ggml_tensor * k = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens);
  205. ggml_tensor * v = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens);
  206. ggml_tensor * r = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens);
  207. ggml_tensor * tf = w;
  208. ggml_tensor * td = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens);
  209. ggml_tensor * state = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, S, n_seqs, S, H);
  210. op_tensor = ggml_rwkv_wkv6(ctx, k, v, r, tf, td, state);
  211. } break;
  212. case GGML_OP_IM2COL:
  213. {
  214. const int n_embd = hparams.n_embd;
  215. ggml_tensor * b = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, n_embd, w->ne[1], 1, 1);
  216. op_tensor = ggml_im2col(ctx, w, b, 1, 0, 0, 0, 1, 0, false, GGML_TYPE_F16);
  217. } break;
  218. default:
  219. GGML_ABORT("%s: missing test for op %s for tensor %s", __func__, ggml_op_name(op), w->name);
  220. }
  221. // create a temporary dummy buffer for the weight so that supports_op can check the buffer type
  222. GGML_ASSERT(w->buffer == nullptr);
  223. w->buffer = ggml_backend_buft_alloc_buffer(buft, 0);
  224. bool op_supported = ggml_backend_dev_supports_op(dev, op_tensor);
  225. ggml_backend_buffer_free(w->buffer);
  226. w->buffer = nullptr;
  227. return op_supported;
  228. }
  229. // lists of buffer types used for each layer
  230. using buft_list_t = std::vector<std::pair<ggml_backend_dev_t, ggml_backend_buffer_type_t>>;
  231. // find the first buffer type in the list that can use the tensor
  232. static ggml_backend_buffer_type_t select_weight_buft(const llama_hparams & hparams, ggml_tensor * tensor, ggml_op op, const buft_list_t & buft_list) {
  233. GGML_ASSERT(!buft_list.empty());
  234. for (const auto & cur : buft_list) {
  235. ggml_backend_dev_t cur_dev = cur.first;
  236. ggml_backend_buffer_type_t cur_buft = cur.second;
  237. if (weight_buft_supported(hparams, tensor, op, cur_buft, cur_dev)) {
  238. return cur_buft;
  239. }
  240. }
  241. return nullptr;
  242. }
  243. // CPU: ACCEL -> GPU host -> CPU extra -> CPU
  244. static buft_list_t make_cpu_buft_list(const std::vector<ggml_backend_dev_t> & devices) {
  245. buft_list_t buft_list;
  246. // add ACCEL buffer types
  247. for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
  248. ggml_backend_dev_t dev = ggml_backend_dev_get(i);
  249. if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_ACCEL) {
  250. auto * buft = ggml_backend_dev_buffer_type(dev);
  251. // skip
  252. if (buft != ggml_backend_cpu_buffer_type()) {
  253. buft_list.emplace_back(dev, buft);
  254. }
  255. }
  256. }
  257. // add a host buffer type
  258. // storing the tensors in a host buffer is useful when the processing of large batches
  259. // is offloaded to a GPU device, since it reduces the time spent on data transfers
  260. // generally, this will be done using the first device in the list
  261. // a better approach would be to handle this on a weight-by-weight basis using the offload_op
  262. // function of the device to determine if it would benefit from being stored in a host buffer
  263. for (auto * dev : devices) {
  264. ggml_backend_buffer_type_t buft = ggml_backend_dev_host_buffer_type(dev);
  265. if (buft) {
  266. buft_list.emplace_back(dev, buft);
  267. break;
  268. }
  269. }
  270. // add extra buffer types, only if no GPU device is present
  271. // ref: https://github.com/ggml-org/llama.cpp/issues/12481#issuecomment-2743136094
  272. auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
  273. auto * cpu_reg = ggml_backend_dev_backend_reg(cpu_dev);
  274. auto ggml_backend_dev_get_extra_bufts_fn = (ggml_backend_dev_get_extra_bufts_t)
  275. ggml_backend_reg_get_proc_address(cpu_reg, "ggml_backend_dev_get_extra_bufts");
  276. if (ggml_backend_dev_get_extra_bufts_fn) {
  277. ggml_backend_buffer_type_t * extra_bufts = ggml_backend_dev_get_extra_bufts_fn(cpu_dev);
  278. while (extra_bufts && *extra_bufts) {
  279. buft_list.emplace_back(cpu_dev, *extra_bufts);
  280. ++extra_bufts;
  281. }
  282. }
  283. // add the CPU buffer type
  284. for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
  285. ggml_backend_dev_t dev = ggml_backend_dev_get(i);
  286. if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_CPU) {
  287. buft_list.emplace_back(dev, ggml_backend_dev_buffer_type(dev));
  288. }
  289. }
  290. return buft_list;
  291. }
  292. // GPU: split if LLAMA_SPLIT_MODE_ROW -> GPU
  293. static buft_list_t make_gpu_buft_list(ggml_backend_dev_t dev, llama_split_mode split_mode, const float * tensor_split) {
  294. buft_list_t buft_list;
  295. // add the device split buffer type if requested and available
  296. if (split_mode == LLAMA_SPLIT_MODE_ROW) {
  297. ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev);
  298. auto ggml_backend_split_buffer_type_fn = (ggml_backend_split_buffer_type_t)
  299. ggml_backend_reg_get_proc_address(reg, "ggml_backend_split_buffer_type");
  300. if (ggml_backend_split_buffer_type_fn) {
  301. size_t dev_index = [&]() {
  302. auto * reg = ggml_backend_dev_backend_reg(dev);
  303. for (size_t i = 0; i < ggml_backend_reg_dev_count(reg); ++i) {
  304. if (ggml_backend_reg_dev_get(reg, i) == dev) {
  305. return i;
  306. }
  307. }
  308. throw std::runtime_error(format("device %s not found in its backend reg", ggml_backend_dev_name(dev)));
  309. }();
  310. auto * buft = ggml_backend_split_buffer_type_fn(dev_index, tensor_split);
  311. if (buft != nullptr) {
  312. buft_list.emplace_back(dev, buft);
  313. }
  314. }
  315. }
  316. // add the device default buffer type
  317. buft_list.emplace_back(dev, ggml_backend_dev_buffer_type(dev));
  318. return buft_list;
  319. }
  320. struct llama_model::impl {
  321. impl() {}
  322. ~impl() {}
  323. uint64_t n_elements = 0;
  324. size_t n_bytes = 0;
  325. std::string desc_str;
  326. // model memory mapped files
  327. llama_mmaps mappings;
  328. // objects representing data potentially being locked in memory
  329. llama_mlocks mlock_bufs;
  330. llama_mlocks mlock_mmaps;
  331. // contexts where the model tensors metadata is stored
  332. std::vector<ggml_context_ptr> ctxs;
  333. // the model memory buffers for the tensor data
  334. std::vector<ggml_backend_buffer_ptr> bufs;
  335. buft_list_t cpu_buft_list;
  336. std::map<ggml_backend_dev_t, buft_list_t> gpu_buft_list;
  337. struct layer_dev {
  338. ggml_backend_dev_t dev;
  339. buft_list_t * buft_list;
  340. };
  341. layer_dev dev_input = {};
  342. layer_dev dev_output = {};
  343. std::vector<layer_dev> dev_layer;
  344. bool has_tensor_overrides;
  345. };
  346. llama_model::llama_model(const llama_model_params & params) : params(params), pimpl(std::make_unique<impl>()) {
  347. pimpl->has_tensor_overrides = params.tensor_buft_overrides && params.tensor_buft_overrides[0].pattern;
  348. }
  349. llama_model::~llama_model() {}
  350. void llama_model::load_stats(llama_model_loader & ml) {
  351. pimpl->n_elements = ml.n_elements;
  352. pimpl->n_bytes = ml.n_bytes;
  353. }
  354. void llama_model::load_arch(llama_model_loader & ml) {
  355. arch = ml.get_arch();
  356. if (arch == LLM_ARCH_UNKNOWN) {
  357. throw std::runtime_error("unknown model architecture: '" + ml.get_arch_name() + "'");
  358. }
  359. }
  360. void llama_model::load_hparams(llama_model_loader & ml) {
  361. const gguf_context * ctx = ml.meta.get();
  362. // get metadata as string
  363. for (int i = 0; i < gguf_get_n_kv(ctx); i++) {
  364. gguf_type type = gguf_get_kv_type(ctx, i);
  365. if (type == GGUF_TYPE_ARRAY) {
  366. continue;
  367. }
  368. const char * name = gguf_get_key(ctx, i);
  369. const std::string value = gguf_kv_to_str(ctx, i);
  370. gguf_kv.emplace(name, value);
  371. }
  372. // get general kv
  373. ml.get_key(LLM_KV_GENERAL_NAME, name, false);
  374. // everything past this point is not vocab-related
  375. if (hparams.vocab_only) {
  376. return;
  377. }
  378. ml.get_key(LLM_KV_CONTEXT_LENGTH, hparams.n_ctx_train);
  379. ml.get_key(LLM_KV_EMBEDDING_LENGTH, hparams.n_embd);
  380. ml.get_key(LLM_KV_BLOCK_COUNT, hparams.n_layer);
  381. ml.get_key(LLM_KV_EXPERT_COUNT, hparams.n_expert, false);
  382. ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used, false);
  383. if (arch == LLM_ARCH_WAVTOKENIZER_DEC) {
  384. ml.get_key(LLM_KV_FEATURES_LENGTH, hparams.n_embd_features);
  385. ml.get_key(LLM_KV_POSNET_EMBEDDING_LENGTH, hparams.posnet.n_embd);
  386. ml.get_key(LLM_KV_POSNET_BLOCK_COUNT, hparams.posnet.n_layer);
  387. ml.get_key(LLM_KV_CONVNEXT_EMBEDDING_LENGTH, hparams.convnext.n_embd);
  388. ml.get_key(LLM_KV_CONVNEXT_BLOCK_COUNT, hparams.convnext.n_layer);
  389. }
  390. GGML_ASSERT(hparams.n_expert <= LLAMA_MAX_EXPERTS);
  391. GGML_ASSERT(hparams.n_expert_used <= hparams.n_expert);
  392. if (hparams.n_expert > 0) {
  393. GGML_ASSERT(hparams.n_expert_used > 0);
  394. } else {
  395. GGML_ASSERT(hparams.n_expert_used == 0);
  396. }
  397. // zero-out the array hparams
  398. std::fill(hparams.n_head_arr.begin(), hparams.n_head_arr.end(), 0);
  399. std::fill(hparams.n_head_kv_arr.begin(), hparams.n_head_kv_arr.end(), 0);
  400. std::fill(hparams.n_ff_arr.begin(), hparams.n_ff_arr.end(), 0);
  401. ml.get_key_or_arr(LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff_arr, hparams.n_layer, false);
  402. ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head_arr, hparams.n_layer, false);
  403. // n_head_kv is optional, default to n_head
  404. hparams.n_head_kv_arr = hparams.n_head_arr;
  405. ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT_KV, hparams.n_head_kv_arr, hparams.n_layer, false);
  406. bool rope_finetuned = false;
  407. ml.get_key(LLM_KV_ROPE_SCALING_FINETUNED, rope_finetuned, false);
  408. hparams.rope_finetuned = rope_finetuned;
  409. hparams.n_ctx_orig_yarn = hparams.n_ctx_train;
  410. ml.get_key(LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, hparams.n_ctx_orig_yarn, false);
  411. // rope_freq_base (optional)
  412. hparams.rope_freq_base_train = 10000.0f;
  413. ml.get_key(LLM_KV_ROPE_FREQ_BASE, hparams.rope_freq_base_train, false);
  414. std::string rope_scaling("linear");
  415. ml.get_key(LLM_KV_ROPE_SCALING_TYPE, rope_scaling, false);
  416. hparams.rope_scaling_type_train = llama_rope_scaling_type_from_string(rope_scaling);
  417. GGML_ASSERT(hparams.rope_scaling_type_train != LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED);
  418. // rope_freq_scale (inverse of the kv) is optional
  419. float ropescale = 0.0f;
  420. if (!ml.get_key(LLM_KV_ROPE_SCALING_FACTOR, ropescale, false)) {
  421. // try the old key name
  422. ml.get_key(LLM_KV_ROPE_SCALE_LINEAR, ropescale, false);
  423. }
  424. hparams.rope_freq_scale_train = ropescale == 0.0f ? 1.0f : 1.0f/ropescale;
  425. // by default assume that the sliding-window layers use the same scaling type as the non-sliding-window layers
  426. hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
  427. hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
  428. ml.get_key(LLM_KV_ROPE_SCALING_ATTN_FACTOR, hparams.rope_attn_factor, false);
  429. // non-transformer models do not have attention heads
  430. if (hparams.n_head() > 0) {
  431. // gpt-neox n_rot = rotary_pct * (n_embd / n_head)
  432. // gpt-j n_rot = rotary_dim
  433. hparams.n_embd_head_k = hparams.n_embd / hparams.n_head();
  434. ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH, hparams.n_embd_head_k, false);
  435. hparams.n_embd_head_v = hparams.n_embd / hparams.n_head();
  436. ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH, hparams.n_embd_head_v, false);
  437. // sanity check for n_rot (optional)
  438. hparams.n_rot = hparams.n_embd_head_k;
  439. ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot, false);
  440. if (arch == LLM_ARCH_LLAMA || arch == LLM_ARCH_DECI || arch == LLM_ARCH_FALCON) {
  441. if (hparams.n_rot != hparams.n_embd_head_k) {
  442. throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd_head_k));
  443. }
  444. }
  445. } else {
  446. hparams.n_rot = 0;
  447. hparams.n_embd_head_k = 0;
  448. hparams.n_embd_head_v = 0;
  449. }
  450. // for differentiating model types
  451. uint32_t n_vocab = 0;
  452. ml.get_key(LLM_KV_VOCAB_SIZE, n_vocab, false) || ml.get_arr_n(LLM_KV_TOKENIZER_LIST, n_vocab, false);
  453. // arch-specific KVs
  454. switch (arch) {
  455. case LLM_ARCH_LLAMA:
  456. {
  457. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  458. if (hparams.n_expert == 8) {
  459. switch (hparams.n_layer) {
  460. case 32: type = LLM_TYPE_8x7B; break;
  461. case 56: type = LLM_TYPE_8x22B; break;
  462. default: type = LLM_TYPE_UNKNOWN;
  463. }
  464. } else {
  465. switch (hparams.n_layer) {
  466. case 16: type = LLM_TYPE_1B; break; // Llama 3.2 1B
  467. case 22: type = LLM_TYPE_1B; break;
  468. case 26: type = LLM_TYPE_3B; break;
  469. case 28: type = LLM_TYPE_3B; break; // Llama 3.2 3B
  470. // granite uses a vocab with len 49152
  471. case 32: type = n_vocab == 49152 ? LLM_TYPE_3B : (n_vocab < 40000 ? LLM_TYPE_7B : LLM_TYPE_8B); break;
  472. case 36: type = LLM_TYPE_8B; break; // granite
  473. case 40: type = LLM_TYPE_13B; break;
  474. case 48: type = LLM_TYPE_34B; break;
  475. case 60: type = LLM_TYPE_30B; break;
  476. case 80: type = hparams.n_head() == hparams.n_head_kv() ? LLM_TYPE_65B : LLM_TYPE_70B; break;
  477. default: type = LLM_TYPE_UNKNOWN;
  478. }
  479. }
  480. } break;
  481. case LLM_ARCH_LLAMA4:
  482. {
  483. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  484. ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
  485. ml.get_key(LLM_KV_INTERLEAVE_MOE_LAYER_STEP, hparams.n_moe_layer_step);
  486. hparams.n_swa_pattern = 4; // pattern: 3 chunked - 1 full
  487. hparams.n_attn_chunk = 8192; // should this be a gguf kv? currently it's the same for Scout and Maverick
  488. hparams.n_swa = 1; // TODO @ngxson : this is added to trigger the SWA branch (we store the chunked attn mask in the SWA tensor), will need to clean this up later
  489. switch (hparams.n_expert) {
  490. case 16: type = LLM_TYPE_17B_16E; break;
  491. case 128: type = LLM_TYPE_17B_128E; break;
  492. default: type = LLM_TYPE_UNKNOWN;
  493. }
  494. if (type == LLM_TYPE_17B_128E) {
  495. hparams.use_kq_norm = false;
  496. }
  497. } break;
  498. case LLM_ARCH_DECI:
  499. {
  500. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  501. switch (hparams.n_layer) {
  502. case 32: type = LLM_TYPE_7B; break;
  503. case 80: type = LLM_TYPE_70B; break;
  504. default: type = LLM_TYPE_UNKNOWN;
  505. }
  506. } break;
  507. case LLM_ARCH_MINICPM:
  508. {
  509. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  510. ml.get_key(LLM_KV_EMBEDDING_SCALE, hparams.f_embedding_scale);
  511. ml.get_key(LLM_KV_RESIDUAL_SCALE, hparams.f_residual_scale);
  512. ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
  513. switch (hparams.n_layer) {
  514. case 52: type = LLM_TYPE_1B; break;
  515. case 40: type = LLM_TYPE_2B; break;
  516. default: type = LLM_TYPE_UNKNOWN;
  517. }
  518. } break;
  519. case LLM_ARCH_MINICPM3:
  520. {
  521. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  522. ml.get_key(LLM_KV_ATTENTION_Q_LORA_RANK, hparams.n_lora_q);
  523. ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK, hparams.n_lora_kv);
  524. switch (hparams.n_layer) {
  525. case 62: type = LLM_TYPE_4B; break;
  526. default: type = LLM_TYPE_UNKNOWN;
  527. }
  528. } break;
  529. case LLM_ARCH_GROK:
  530. {
  531. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  532. switch (hparams.n_layer) {
  533. case 64: type = LLM_TYPE_314B; break;
  534. default: type = LLM_TYPE_UNKNOWN;
  535. }
  536. } break;
  537. case LLM_ARCH_FALCON:
  538. {
  539. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  540. switch (hparams.n_layer) {
  541. case 32: type = LLM_TYPE_7B; break;
  542. case 60: type = LLM_TYPE_40B; break;
  543. default: type = LLM_TYPE_UNKNOWN;
  544. }
  545. } break;
  546. case LLM_ARCH_BAICHUAN:
  547. {
  548. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  549. switch (hparams.n_layer) {
  550. case 32: type = LLM_TYPE_7B; break;
  551. case 40: type = LLM_TYPE_13B; break;
  552. default: type = LLM_TYPE_UNKNOWN;
  553. }
  554. if (type == LLM_TYPE_13B) {
  555. // TODO: become GGUF KV parameter
  556. hparams.f_max_alibi_bias = 8.0f;
  557. }
  558. } break;
  559. case LLM_ARCH_STARCODER:
  560. {
  561. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  562. switch (hparams.n_layer) {
  563. case 24: type = LLM_TYPE_1B; break;
  564. case 36: type = LLM_TYPE_3B; break;
  565. case 42: type = LLM_TYPE_7B; break;
  566. case 40: type = LLM_TYPE_15B; break;
  567. default: type = LLM_TYPE_UNKNOWN;
  568. }
  569. } break;
  570. case LLM_ARCH_REFACT:
  571. {
  572. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  573. switch (hparams.n_layer) {
  574. case 32: type = LLM_TYPE_1B; break;
  575. default: type = LLM_TYPE_UNKNOWN;
  576. }
  577. // TODO: become GGUF KV parameter
  578. hparams.f_max_alibi_bias = 8.0f;
  579. } break;
  580. case LLM_ARCH_BERT:
  581. {
  582. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  583. ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
  584. ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
  585. switch (hparams.n_layer) {
  586. case 3:
  587. type = LLM_TYPE_17M; break; // bge-micro
  588. case 6:
  589. type = LLM_TYPE_22M; break; // MiniLM-L6
  590. case 12:
  591. switch (hparams.n_embd) {
  592. case 384: type = LLM_TYPE_33M; break; // MiniLM-L12, bge-small
  593. case 768: type = LLM_TYPE_109M; break; // bge-base
  594. default: type = LLM_TYPE_UNKNOWN;
  595. } break;
  596. case 24:
  597. type = LLM_TYPE_335M; break; // bge-large
  598. default: type = LLM_TYPE_UNKNOWN;
  599. }
  600. } break;
  601. case LLM_ARCH_JINA_BERT_V2:
  602. {
  603. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  604. ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
  605. ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
  606. hparams.f_max_alibi_bias = 8.0f;
  607. switch (hparams.n_layer) {
  608. case 4: type = LLM_TYPE_33M; break; // jina-embeddings-small
  609. case 12: type = LLM_TYPE_137M; break; // jina-embeddings-base
  610. default: type = LLM_TYPE_UNKNOWN;
  611. }
  612. } break;
  613. case LLM_ARCH_NOMIC_BERT:
  614. {
  615. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  616. ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
  617. ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
  618. if (hparams.n_layer == 12 && hparams.n_embd == 768) {
  619. type = LLM_TYPE_137M;
  620. }
  621. } break;
  622. case LLM_ARCH_BLOOM:
  623. {
  624. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  625. switch (hparams.n_layer) {
  626. case 24: type = LLM_TYPE_1B; break;
  627. case 30:
  628. switch (hparams.n_embd) {
  629. case 2560: type = LLM_TYPE_3B; break;
  630. case 4096: type = LLM_TYPE_7B; break;
  631. default: type = LLM_TYPE_UNKNOWN;
  632. } break;
  633. default: type = LLM_TYPE_UNKNOWN;
  634. }
  635. // TODO: become GGUF KV parameter
  636. hparams.f_max_alibi_bias = 8.0f;
  637. } break;
  638. case LLM_ARCH_MPT:
  639. {
  640. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  641. ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV, hparams.f_clamp_kqv, false);
  642. ml.get_key(LLM_KV_ATTENTION_MAX_ALIBI_BIAS, hparams.f_max_alibi_bias);
  643. switch (hparams.n_layer) {
  644. case 32: type = LLM_TYPE_7B; break;
  645. case 48: type = LLM_TYPE_30B; break;
  646. default: type = LLM_TYPE_UNKNOWN;
  647. }
  648. } break;
  649. case LLM_ARCH_STABLELM:
  650. {
  651. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  652. switch (hparams.n_layer) {
  653. case 24: type = LLM_TYPE_1B; break;
  654. case 32: type = LLM_TYPE_3B; break;
  655. case 40: type = LLM_TYPE_12B; break;
  656. default: type = LLM_TYPE_UNKNOWN;
  657. }
  658. } break;
  659. case LLM_ARCH_QWEN:
  660. {
  661. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  662. switch (hparams.n_layer) {
  663. case 32: type = LLM_TYPE_7B; break;
  664. case 40: type = LLM_TYPE_13B; break;
  665. default: type = LLM_TYPE_UNKNOWN;
  666. }
  667. } break;
  668. case LLM_ARCH_QWEN2VL:
  669. {
  670. ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, true);
  671. }
  672. // fall through
  673. case LLM_ARCH_QWEN2:
  674. {
  675. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  676. switch (hparams.n_layer) {
  677. case 24: type = hparams.n_embd == 1024 ? LLM_TYPE_0_5B : LLM_TYPE_1B; break;
  678. case 28: type = hparams.n_embd == 1536 ? LLM_TYPE_1_5B : LLM_TYPE_7B; break;
  679. case 32: type = LLM_TYPE_7B; break;
  680. case 36: type = LLM_TYPE_3B; break;
  681. case 40: type = hparams.n_head() == 20 ? LLM_TYPE_4B : LLM_TYPE_13B; break;
  682. case 48: type = LLM_TYPE_14B; break;
  683. case 64: type = LLM_TYPE_32B; break;
  684. case 80: type = LLM_TYPE_70B; break;
  685. default: type = LLM_TYPE_UNKNOWN;
  686. }
  687. } break;
  688. case LLM_ARCH_QWEN2MOE:
  689. {
  690. ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
  691. ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false);
  692. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  693. switch (hparams.n_layer) {
  694. case 24: type = LLM_TYPE_A2_7B; break;
  695. case 28: type = LLM_TYPE_57B_A14B; break;
  696. default: type = LLM_TYPE_UNKNOWN;
  697. }
  698. } break;
  699. case LLM_ARCH_PHI2:
  700. {
  701. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  702. switch (hparams.n_layer) {
  703. case 24: type = LLM_TYPE_1B; break;
  704. case 32: type = LLM_TYPE_3B; break;
  705. default: type = LLM_TYPE_UNKNOWN;
  706. }
  707. } break;
  708. case LLM_ARCH_PHI3:
  709. {
  710. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  711. switch (hparams.n_layer) {
  712. case 24: type = LLM_TYPE_1B; break;
  713. case 32: type = LLM_TYPE_3B; break;
  714. case 40: type = LLM_TYPE_14B; break;
  715. default: type = LLM_TYPE_UNKNOWN;
  716. }
  717. // for backward compatibility ; see: https://github.com/ggerganov/llama.cpp/pull/8931
  718. if ((hparams.n_layer == 32 || hparams.n_layer == 40) && hparams.n_ctx_train == 4096) {
  719. // default value for Phi-3-mini-4k-instruct and Phi-3-medium-4k-instruct
  720. hparams.n_swa = 2047;
  721. } else if (hparams.n_layer == 32 && hparams.n_head_kv(0) == 32 && hparams.n_ctx_train == 131072) {
  722. // default value for Phi-3-mini-128k-instruct
  723. // note: this seems incorrect because the window is bigger than the train context?
  724. hparams.n_swa = 262144;
  725. } else if (hparams.n_layer == 40 && hparams.n_ctx_train == 131072) {
  726. // default value for Phi-3-medium-128k-instruct
  727. // note: this seems incorrect because the window is equal to the train context?
  728. hparams.n_swa = 131072;
  729. }
  730. bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
  731. if (!found_swa && hparams.n_swa == 0) {
  732. throw std::runtime_error("invalid value for sliding_window");
  733. }
  734. } break;
  735. case LLM_ARCH_PHIMOE:
  736. {
  737. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  738. switch (hparams.n_layer) {
  739. case 32: type = LLM_TYPE_16x3_8B; break;
  740. default: type = LLM_TYPE_UNKNOWN;
  741. }
  742. } break;
  743. case LLM_ARCH_PLAMO:
  744. {
  745. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  746. switch (hparams.n_layer) {
  747. case 40: type = LLM_TYPE_13B; break;
  748. default: type = LLM_TYPE_UNKNOWN;
  749. }
  750. } break;
  751. case LLM_ARCH_GPT2:
  752. {
  753. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  754. switch (hparams.n_layer) {
  755. case 12: type = LLM_TYPE_SMALL; break;
  756. case 24: type = LLM_TYPE_MEDIUM; break;
  757. case 36: type = LLM_TYPE_LARGE; break;
  758. case 48: type = LLM_TYPE_XL; break;
  759. default: type = LLM_TYPE_UNKNOWN;
  760. }
  761. } break;
  762. case LLM_ARCH_CODESHELL:
  763. {
  764. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  765. switch (hparams.n_layer) {
  766. case 42: type = LLM_TYPE_7B; break;
  767. default: type = LLM_TYPE_UNKNOWN;
  768. }
  769. } break;
  770. case LLM_ARCH_ORION:
  771. {
  772. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  773. switch (hparams.n_layer) {
  774. case 40: type = LLM_TYPE_14B; break;
  775. default: type = LLM_TYPE_UNKNOWN;
  776. }
  777. } break;
  778. case LLM_ARCH_INTERNLM2:
  779. {
  780. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  781. switch (hparams.n_layer) {
  782. case 32: type = LLM_TYPE_7B; break;
  783. case 48: type = LLM_TYPE_20B; break;
  784. default: type = LLM_TYPE_UNKNOWN;
  785. }
  786. } break;
  787. case LLM_ARCH_GEMMA:
  788. {
  789. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  790. switch (hparams.n_layer) {
  791. case 18: type = LLM_TYPE_2B; break;
  792. case 28: type = LLM_TYPE_7B; break;
  793. default: type = LLM_TYPE_UNKNOWN;
  794. }
  795. } break;
  796. case LLM_ARCH_GEMMA2:
  797. {
  798. hparams.n_swa = 4096; // default value of gemma 2
  799. hparams.n_swa_pattern = 2;
  800. hparams.attn_soft_cap = true;
  801. ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
  802. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  803. ml.get_key(LLM_KV_ATTN_LOGIT_SOFTCAPPING, hparams.f_attn_logit_softcapping, false);
  804. ml.get_key(LLM_KV_FINAL_LOGIT_SOFTCAPPING, hparams.f_final_logit_softcapping, false);
  805. switch (hparams.n_layer) {
  806. case 26: type = LLM_TYPE_2B; break;
  807. case 42: type = LLM_TYPE_9B; break;
  808. case 46: type = LLM_TYPE_27B; break;
  809. default: type = LLM_TYPE_UNKNOWN;
  810. }
  811. } break;
  812. case LLM_ARCH_GEMMA3:
  813. {
  814. hparams.n_swa_pattern = 6;
  815. hparams.rope_freq_base_train_swa = 10000.0f;
  816. hparams.rope_freq_scale_train_swa = 1.0f;
  817. ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
  818. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  819. switch (hparams.n_layer) {
  820. case 26: type = LLM_TYPE_1B; break;
  821. case 34: type = LLM_TYPE_4B; break;
  822. case 48: type = LLM_TYPE_12B; break;
  823. case 62: type = LLM_TYPE_27B; break;
  824. default: type = LLM_TYPE_UNKNOWN;
  825. }
  826. hparams.f_attention_scale = type == LLM_TYPE_27B
  827. ? 1.0f / std::sqrt(float(hparams.n_embd / hparams.n_head(0)))
  828. : 1.0f / std::sqrt(float(hparams.n_embd_head_k));
  829. } break;
  830. case LLM_ARCH_STARCODER2:
  831. {
  832. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  833. switch (hparams.n_layer) {
  834. case 30: type = LLM_TYPE_3B; break;
  835. case 32: type = LLM_TYPE_7B; break;
  836. case 40: type = LLM_TYPE_15B; break;
  837. case 52: type = LLM_TYPE_20B; break; // granite
  838. case 88: type = LLM_TYPE_34B; break; // granite
  839. default: type = LLM_TYPE_UNKNOWN;
  840. }
  841. } break;
  842. case LLM_ARCH_MAMBA:
  843. {
  844. ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv);
  845. ml.get_key(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner);
  846. ml.get_key(LLM_KV_SSM_STATE_SIZE, hparams.ssm_d_state);
  847. ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
  848. ml.get_key(LLM_KV_SSM_DT_B_C_RMS, hparams.ssm_dt_b_c_rms, false);
  849. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  850. switch (hparams.n_layer) {
  851. case 24:
  852. switch (hparams.n_embd) {
  853. case 768: type = LLM_TYPE_SMALL; break;
  854. default: type = LLM_TYPE_UNKNOWN;
  855. } break;
  856. case 48:
  857. switch (hparams.n_embd) {
  858. case 1024: type = LLM_TYPE_MEDIUM; break;
  859. case 1536: type = LLM_TYPE_LARGE; break;
  860. case 2048: type = LLM_TYPE_XL; break;
  861. default: type = LLM_TYPE_UNKNOWN;
  862. } break;
  863. case 64:
  864. switch (hparams.n_embd) {
  865. case 2560: type = LLM_TYPE_3B; break;
  866. default: type = LLM_TYPE_UNKNOWN;
  867. } break;
  868. default: type = LLM_TYPE_UNKNOWN;
  869. }
  870. } break;
  871. case LLM_ARCH_XVERSE:
  872. {
  873. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  874. switch (hparams.n_layer) {
  875. case 32: type = LLM_TYPE_7B; break;
  876. case 40: type = LLM_TYPE_13B; break;
  877. case 80: type = LLM_TYPE_65B; break;
  878. default: type = LLM_TYPE_UNKNOWN;
  879. }
  880. } break;
  881. case LLM_ARCH_COMMAND_R:
  882. {
  883. ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
  884. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  885. switch (hparams.n_layer) {
  886. case 40: type = LLM_TYPE_35B; break;
  887. default: type = LLM_TYPE_UNKNOWN;
  888. }
  889. } break;
  890. case LLM_ARCH_COHERE2:
  891. {
  892. hparams.n_swa_pattern = 4;
  893. ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
  894. ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
  895. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  896. switch (hparams.n_layer) {
  897. case 32: type = LLM_TYPE_8B; break;
  898. default: type = LLM_TYPE_UNKNOWN;
  899. }
  900. } break;
  901. case LLM_ARCH_DBRX:
  902. {
  903. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  904. ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV, hparams.f_clamp_kqv);
  905. switch (hparams.n_layer) {
  906. case 40: type = LLM_TYPE_16x12B; break;
  907. default: type = LLM_TYPE_UNKNOWN;
  908. }
  909. } break;
  910. case LLM_ARCH_OLMO:
  911. {
  912. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  913. ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV, hparams.f_clamp_kqv, false);
  914. switch (hparams.n_layer) {
  915. case 22: type = LLM_TYPE_1B; break;
  916. case 32: type = LLM_TYPE_7B; break;
  917. case 80: type = LLM_TYPE_70B; break;
  918. default: type = LLM_TYPE_UNKNOWN;
  919. }
  920. } break;
  921. case LLM_ARCH_OLMO2:
  922. {
  923. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  924. switch (hparams.n_layer) {
  925. case 16: type = LLM_TYPE_1B; break;
  926. case 32: type = LLM_TYPE_7B; break;
  927. case 40: type = LLM_TYPE_13B; break;
  928. case 64: type = LLM_TYPE_32B; break;
  929. default: type = LLM_TYPE_UNKNOWN;
  930. }
  931. } break;
  932. case LLM_ARCH_OLMOE:
  933. {
  934. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  935. switch (hparams.n_layer) {
  936. case 16: type = LLM_TYPE_A1_7B; break;
  937. default: type = LLM_TYPE_UNKNOWN;
  938. }
  939. } break;
  940. case LLM_ARCH_OPENELM:
  941. {
  942. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  943. switch (hparams.n_layer) {
  944. case 16: type = LLM_TYPE_270M; break;
  945. case 20: type = LLM_TYPE_450M; break;
  946. case 28: type = LLM_TYPE_1B; break;
  947. case 36: type = LLM_TYPE_3B; break;
  948. default: type = LLM_TYPE_UNKNOWN;
  949. }
  950. } break;
  951. case LLM_ARCH_GPTNEOX:
  952. {
  953. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  954. ml.get_key(LLM_KV_USE_PARALLEL_RESIDUAL, hparams.use_par_res);
  955. switch (hparams.n_layer) {
  956. case 6:
  957. switch (hparams.n_ff()) {
  958. case 512: type = LLM_TYPE_14M; break;
  959. case 2048: type = LLM_TYPE_70M; break;
  960. default: type = LLM_TYPE_UNKNOWN;
  961. } break;
  962. case 12:
  963. switch (hparams.n_ff()) {
  964. case 3072: type = LLM_TYPE_160M; break;
  965. default: type = LLM_TYPE_UNKNOWN;
  966. } break;
  967. case 16:
  968. switch (hparams.n_ff()) {
  969. case 8192: type = LLM_TYPE_1B; break;
  970. default: type = LLM_TYPE_UNKNOWN;
  971. } break;
  972. case 24:
  973. switch (hparams.n_ff()) {
  974. case 4096: type = LLM_TYPE_410M; break;
  975. case 8192: type = LLM_TYPE_1_4B; break;
  976. default: type = LLM_TYPE_UNKNOWN;
  977. } break;
  978. case 32:
  979. switch (hparams.n_ff()) {
  980. case 10240: type = LLM_TYPE_2_8B; break;
  981. case 16384: type = LLM_TYPE_6_9B; break;
  982. default: type = LLM_TYPE_UNKNOWN;
  983. } break;
  984. case 36:
  985. switch (hparams.n_ff()) {
  986. case 20480: type = LLM_TYPE_12B; break;
  987. default: type = LLM_TYPE_UNKNOWN;
  988. } break;
  989. case 44:
  990. switch (hparams.n_ff()) {
  991. case 24576: type = LLM_TYPE_20B; break;
  992. default: type = LLM_TYPE_UNKNOWN;
  993. } break;
  994. default: type = LLM_TYPE_UNKNOWN;
  995. }
  996. } break;
  997. case LLM_ARCH_ARCTIC:
  998. {
  999. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1000. if (hparams.n_expert == 128) {
  1001. switch (hparams.n_layer) {
  1002. case 35: type = LLM_TYPE_10B_128x3_66B; break;
  1003. default: type = LLM_TYPE_UNKNOWN;
  1004. }
  1005. } else {
  1006. type = LLM_TYPE_UNKNOWN;
  1007. }
  1008. } break;
  1009. case LLM_ARCH_DEEPSEEK:
  1010. {
  1011. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1012. ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
  1013. ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
  1014. ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
  1015. ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
  1016. switch (hparams.n_layer) {
  1017. case 28: type = LLM_TYPE_20B; break;
  1018. default: type = LLM_TYPE_UNKNOWN;
  1019. }
  1020. } break;
  1021. case LLM_ARCH_DEEPSEEK2:
  1022. {
  1023. bool is_lite = (hparams.n_layer == 27);
  1024. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1025. ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
  1026. if (!is_lite) {
  1027. ml.get_key(LLM_KV_ATTENTION_Q_LORA_RANK, hparams.n_lora_q);
  1028. }
  1029. ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK, hparams.n_lora_kv);
  1030. ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
  1031. ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
  1032. ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
  1033. ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
  1034. ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false);
  1035. if (hparams.expert_gating_func == LLAMA_EXPERT_GATING_FUNC_TYPE_NONE) {
  1036. // for compatibility with existing DeepSeek V2 and V2.5 GGUFs
  1037. // that have no expert_gating_func model parameter set
  1038. hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX;
  1039. }
  1040. ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul);
  1041. switch (hparams.n_layer) {
  1042. case 27: type = LLM_TYPE_16B; break;
  1043. case 60: type = LLM_TYPE_236B; break;
  1044. case 61: type = LLM_TYPE_671B; break;
  1045. default: type = LLM_TYPE_UNKNOWN;
  1046. }
  1047. } break;
  1048. case LLM_ARCH_PLM:
  1049. {
  1050. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1051. ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK, hparams.n_lora_kv);
  1052. switch (hparams.n_layer) {
  1053. case 32: type = LLM_TYPE_1_8B; break;
  1054. default: type = LLM_TYPE_UNKNOWN;
  1055. }
  1056. } break;
  1057. case LLM_ARCH_CHATGLM:
  1058. {
  1059. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1060. switch (hparams.n_layer) {
  1061. case 28: {
  1062. if (hparams.n_head(0) == 16) {
  1063. type = LLM_TYPE_1_5B;
  1064. } else {
  1065. type = LLM_TYPE_6B;
  1066. }
  1067. } break;
  1068. case 40: {
  1069. if (hparams.n_head(0) == 24) {
  1070. type = LLM_TYPE_4B;
  1071. } else {
  1072. type = LLM_TYPE_9B;
  1073. }
  1074. } break;
  1075. default: type = LLM_TYPE_UNKNOWN;
  1076. }
  1077. } break;
  1078. case LLM_ARCH_BITNET:
  1079. {
  1080. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1081. switch (hparams.n_layer) {
  1082. case 26: type = LLM_TYPE_3B; break;
  1083. default: type = LLM_TYPE_UNKNOWN;
  1084. }
  1085. } break;
  1086. case LLM_ARCH_T5:
  1087. {
  1088. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1089. ml.get_key(LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, hparams.n_rel_attn_bkts);
  1090. uint32_t dec_start_token_id;
  1091. if (ml.get_key(LLM_KV_DECODER_START_TOKEN_ID, dec_start_token_id, false)) {
  1092. hparams.dec_start_token_id = dec_start_token_id;
  1093. }
  1094. switch (hparams.n_layer) {
  1095. case 6: type = LLM_TYPE_60M; break; // t5-small
  1096. case 8: type = LLM_TYPE_80M; break; // flan-t5-small
  1097. case 12:
  1098. switch (hparams.n_ff()) {
  1099. case 3072: type = LLM_TYPE_220M; break; // t5-base
  1100. case 2048: type = LLM_TYPE_250M; break; // flan-t5-base
  1101. default: type = LLM_TYPE_UNKNOWN;
  1102. } break;
  1103. case 24:
  1104. switch (hparams.n_ff()) {
  1105. case 4096: type = LLM_TYPE_770M; break; // t5-large
  1106. case 2816: type = LLM_TYPE_780M; break; // flan-t5-large
  1107. case 16384: type = LLM_TYPE_3B; break; // t5-3b
  1108. case 5120: type = LLM_TYPE_3B; break; // flan-t5-xl
  1109. case 65536: type = LLM_TYPE_11B; break; // t5-11b
  1110. case 10240: type = LLM_TYPE_11B; break; // flan-t5-xxl
  1111. default: type = LLM_TYPE_UNKNOWN;
  1112. } break;
  1113. default: type = LLM_TYPE_UNKNOWN;
  1114. }
  1115. } break;
  1116. case LLM_ARCH_T5ENCODER:
  1117. {
  1118. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1119. ml.get_key(LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, hparams.n_rel_attn_bkts);
  1120. type = LLM_TYPE_UNKNOWN;
  1121. } break;
  1122. case LLM_ARCH_JAIS:
  1123. {
  1124. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  1125. ml.get_key(LLM_KV_ATTENTION_MAX_ALIBI_BIAS, hparams.f_max_alibi_bias);
  1126. switch (hparams.n_layer) {
  1127. case 24: type = LLM_TYPE_1_3B; break;
  1128. case 40: type = LLM_TYPE_13B; break;
  1129. /* TODO: add variants */
  1130. default: type = LLM_TYPE_UNKNOWN;
  1131. }
  1132. } break;
  1133. case LLM_ARCH_NEMOTRON:
  1134. {
  1135. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  1136. switch (hparams.n_layer) {
  1137. case 32: type = LLM_TYPE_4B; break;
  1138. default: type = LLM_TYPE_UNKNOWN;
  1139. }
  1140. } break;
  1141. case LLM_ARCH_EXAONE:
  1142. {
  1143. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1144. switch (hparams.n_layer) {
  1145. case 32: type = LLM_TYPE_8B; break;
  1146. default: type = LLM_TYPE_UNKNOWN;
  1147. }
  1148. } break;
  1149. case LLM_ARCH_RWKV6:
  1150. case LLM_ARCH_RWKV6QWEN2:
  1151. {
  1152. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps, false);
  1153. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps, false);
  1154. ml.get_key(LLM_KV_WKV_HEAD_SIZE, hparams.wkv_head_size);
  1155. ml.get_key(LLM_KV_TIME_MIX_EXTRA_DIM, hparams.time_mix_extra_dim);
  1156. ml.get_key(LLM_KV_TIME_DECAY_EXTRA_DIM, hparams.time_decay_extra_dim);
  1157. ml.get_key(LLM_KV_RESCALE_EVERY_N_LAYERS, hparams.rescale_every_n_layers, false);
  1158. ml.get_key(LLM_KV_TOKEN_SHIFT_COUNT, hparams.token_shift_count, false);
  1159. switch (hparams.n_layer) {
  1160. case 24: type = LLM_TYPE_1_6B; break;
  1161. case 32:
  1162. switch (hparams.n_embd) {
  1163. case 2560: type = LLM_TYPE_3B; break;
  1164. case 4096: type = LLM_TYPE_7B; break;
  1165. default: type = LLM_TYPE_UNKNOWN;
  1166. } break;
  1167. case 61: type = LLM_TYPE_14B; break;
  1168. case 64: type = LLM_TYPE_32B; break;
  1169. default: type = LLM_TYPE_UNKNOWN;
  1170. }
  1171. } break;
  1172. case LLM_ARCH_RWKV7:
  1173. case LLM_ARCH_ARWKV7:
  1174. {
  1175. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps, false);
  1176. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps, false);
  1177. ml.get_key(LLM_KV_WKV_HEAD_SIZE, hparams.wkv_head_size);
  1178. ml.get_key(LLM_KV_ATTENTION_DECAY_LORA_RANK, hparams.n_lora_decay);
  1179. ml.get_key(LLM_KV_ATTENTION_ICLR_LORA_RANK, hparams.n_lora_iclr);
  1180. ml.get_key(LLM_KV_ATTENTION_VALUE_RESIDUAL_MIX_LORA_RANK, hparams.n_lora_value_res_mix);
  1181. ml.get_key(LLM_KV_ATTENTION_GATE_LORA_RANK, hparams.n_lora_gate, false);
  1182. ml.get_key(LLM_KV_TOKEN_SHIFT_COUNT, hparams.token_shift_count, false);
  1183. switch (hparams.n_layer) {
  1184. case 12: type = LLM_TYPE_190M; break;
  1185. case 24:
  1186. switch (hparams.n_embd) {
  1187. case 1024: type = LLM_TYPE_450M; break;
  1188. case 2048: type = LLM_TYPE_1_5B; break;
  1189. default: type = LLM_TYPE_UNKNOWN;
  1190. } break;
  1191. case 28:
  1192. switch (hparams.n_embd) {
  1193. case 1536: type = LLM_TYPE_1_5B; break;
  1194. case 3584: type = LLM_TYPE_7B; break;
  1195. default: type = LLM_TYPE_UNKNOWN;
  1196. } break;
  1197. case 32: type = LLM_TYPE_2_9B; break; // RWKV-7-World
  1198. default: type = LLM_TYPE_UNKNOWN;
  1199. }
  1200. } break;
  1201. case LLM_ARCH_GRANITE:
  1202. case LLM_ARCH_GRANITE_MOE:
  1203. {
  1204. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1205. ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
  1206. ml.get_key(LLM_KV_RESIDUAL_SCALE, hparams.f_residual_scale);
  1207. ml.get_key(LLM_KV_EMBEDDING_SCALE, hparams.f_embedding_scale);
  1208. ml.get_key(LLM_KV_ATTENTION_SCALE, hparams.f_attention_scale);
  1209. switch (hparams.n_layer) {
  1210. case 32: type = LLM_TYPE_3B; break;
  1211. case 40: type = LLM_TYPE_3B; break;
  1212. // Add additional layer/vocab/etc checks here for other model sizes
  1213. default: type = LLM_TYPE_UNKNOWN;
  1214. }
  1215. } break;
  1216. case LLM_ARCH_CHAMELEON:
  1217. {
  1218. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1219. hparams.f_norm_eps = 1e-5; // eps for qk-norm, torch default
  1220. ml.get_key(LLM_KV_SWIN_NORM, hparams.swin_norm);
  1221. switch (hparams.n_layer) {
  1222. case 32: type = LLM_TYPE_7B; break;
  1223. case 48: type = LLM_TYPE_34B; break;
  1224. default: type = LLM_TYPE_UNKNOWN;
  1225. }
  1226. } break;
  1227. case LLM_ARCH_WAVTOKENIZER_DEC:
  1228. {
  1229. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  1230. ml.get_key(LLM_KV_ATTENTION_GROUPNORM_EPS, hparams.f_norm_group_eps);
  1231. ml.get_key(LLM_KV_ATTENTION_GROUPNORM_GROUPS, hparams.n_norm_groups);
  1232. ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
  1233. } break;
  1234. case LLM_ARCH_BAILINGMOE:
  1235. {
  1236. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1237. ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
  1238. ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
  1239. ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
  1240. ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
  1241. ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
  1242. switch (hparams.n_layer) {
  1243. case 28: type = LLM_TYPE_16B; break;
  1244. case 88: type = LLM_TYPE_290B; break;
  1245. default: type = LLM_TYPE_UNKNOWN;
  1246. }
  1247. } break;
  1248. default: throw std::runtime_error("unsupported model architecture");
  1249. }
  1250. pimpl->n_bytes = ml.n_bytes;
  1251. pimpl->desc_str = arch_name() + " " + type_name() + " " + ml.ftype_name();
  1252. if (hparams.f_max_alibi_bias > 0.0f) {
  1253. hparams.use_alibi = true;
  1254. }
  1255. hparams.rope_type = llama_model_rope_type(this);
  1256. }
  1257. void llama_model::load_vocab(llama_model_loader & ml) {
  1258. const auto kv = LLM_KV(arch);
  1259. vocab.load(ml, kv);
  1260. }
  1261. bool llama_model::load_tensors(llama_model_loader & ml) {
  1262. const auto & split_mode = params.split_mode;
  1263. const auto & n_gpu_layers = params.n_gpu_layers;
  1264. const auto & use_mlock = params.use_mlock;
  1265. const auto & tensor_split = params.tensor_split;
  1266. const int n_layer = hparams.n_layer;
  1267. const bool use_mmap_buffer = true;
  1268. LLAMA_LOG_INFO("%s: loading model tensors, this can take a while... (mmap = %s)\n", __func__, ml.use_mmap ? "true" : "false");
  1269. // build a list of buffer types for the CPU and GPU devices
  1270. pimpl->cpu_buft_list = make_cpu_buft_list(devices);
  1271. for (auto * dev : devices) {
  1272. buft_list_t buft_list = make_gpu_buft_list(dev, split_mode, tensor_split);
  1273. // add CPU buffer types as a fallback
  1274. buft_list.insert(buft_list.end(), pimpl->cpu_buft_list.begin(), pimpl->cpu_buft_list.end());
  1275. pimpl->gpu_buft_list.emplace(dev, std::move(buft_list));
  1276. }
  1277. // calculate the split points
  1278. bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + n_devices(), [](float x) { return x == 0.0f; });
  1279. std::vector<float> splits(n_devices());
  1280. if (all_zero) {
  1281. // default split, by free memory
  1282. for (size_t i = 0; i < n_devices(); ++i) {
  1283. ggml_backend_dev_t dev = devices[i];
  1284. size_t total;
  1285. size_t free;
  1286. ggml_backend_dev_memory(dev, &free, &total);
  1287. splits[i] = free;
  1288. }
  1289. } else {
  1290. std::copy(tensor_split, tensor_split + n_devices(), splits.begin());
  1291. }
  1292. // sum and normalize the splits to get the split points
  1293. float split_sum = 0.0f;
  1294. for (size_t i = 0; i < n_devices(); ++i) {
  1295. split_sum += splits[i];
  1296. splits[i] = split_sum;
  1297. }
  1298. for (size_t i = 0; i < n_devices(); ++i) {
  1299. splits[i] /= split_sum;
  1300. }
  1301. ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
  1302. const int i_gpu_start = std::max((int) hparams.n_layer - n_gpu_layers, (int) 0);
  1303. const int act_gpu_layers = devices.empty() ? 0 : std::min(n_gpu_layers, (int)n_layer + 1);
  1304. auto get_layer_buft_list = [&](int il) -> llama_model::impl::layer_dev {
  1305. const bool is_swa = il < (int) hparams.n_layer && hparams.is_swa(il);
  1306. if (il < i_gpu_start || (il - i_gpu_start) >= act_gpu_layers) {
  1307. LLAMA_LOG_DEBUG("load_tensors: layer %3d assigned to device %s, is_swa = %d\n", il, ggml_backend_dev_name(cpu_dev), is_swa);
  1308. return {cpu_dev, &pimpl->cpu_buft_list};
  1309. }
  1310. const int layer_gpu = std::upper_bound(splits.begin(), splits.begin() + n_devices(), float(il - i_gpu_start)/act_gpu_layers) - splits.begin();
  1311. auto * dev = devices.at(layer_gpu);
  1312. LLAMA_LOG_DEBUG("load_tensors: layer %3d assigned to device %s, is_swa = %d\n", il, ggml_backend_dev_name(dev), is_swa);
  1313. return {dev, &pimpl->gpu_buft_list.at(dev)};
  1314. };
  1315. // assign the input layer
  1316. // there is very little benefit to offloading the input layer, so always keep it on the CPU
  1317. pimpl->dev_input = { cpu_dev, &pimpl->cpu_buft_list };
  1318. // assign the repeating layers to the devices according to the splits
  1319. pimpl->dev_layer.resize(n_layer);
  1320. for (int il = 0; il < n_layer; ++il) {
  1321. pimpl->dev_layer[il] = get_layer_buft_list(il);
  1322. }
  1323. // assign the output layer
  1324. pimpl->dev_output = get_layer_buft_list(n_layer);
  1325. // one ggml context per buffer type
  1326. int max_n_tensors = ml.n_tensors;
  1327. max_n_tensors += 1; // duplicated output tensor
  1328. max_n_tensors += n_layer*2; // duplicated rope freq tensors
  1329. const size_t ctx_size = ggml_tensor_overhead()*max_n_tensors;
  1330. std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
  1331. auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * {
  1332. auto it = ctx_map.find(buft);
  1333. if (it == ctx_map.end()) {
  1334. ggml_init_params params = {
  1335. /*.mem_size =*/ ctx_size,
  1336. /*.mem_buffer =*/ NULL,
  1337. /*.no_alloc =*/ true,
  1338. };
  1339. ggml_context * ctx = ggml_init(params);
  1340. if (!ctx) {
  1341. throw std::runtime_error(format("failed to create ggml context"));
  1342. }
  1343. ctx_map[buft] = ctx;
  1344. pimpl->ctxs.emplace_back(ctx);
  1345. return ctx;
  1346. }
  1347. return it->second;
  1348. };
  1349. const auto TENSOR_DUPLICATED = llama_model_loader::TENSOR_DUPLICATED;
  1350. const auto TENSOR_NOT_REQUIRED = llama_model_loader::TENSOR_NOT_REQUIRED;
  1351. // create tensors for the weights
  1352. {
  1353. // note: cast to int64_t since we will use these for the tensor dimensions
  1354. const int64_t n_head = hparams.n_head();
  1355. const int64_t n_head_kv = hparams.n_head_kv();
  1356. const int64_t n_embd = hparams.n_embd;
  1357. const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
  1358. const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();
  1359. const int64_t n_embd_head_k = hparams.n_embd_head_k;
  1360. const int64_t n_embd_head_v = hparams.n_embd_head_v;
  1361. const int64_t n_ff = hparams.n_ff();
  1362. const int64_t n_embd_gqa = n_embd_v_gqa;
  1363. const int64_t n_vocab = vocab.n_tokens();
  1364. const int64_t n_token_types = vocab.n_token_types();
  1365. const int64_t n_rot = hparams.n_rot;
  1366. const int64_t n_expert = hparams.n_expert;
  1367. const int64_t n_expert_used = hparams.n_expert_used;
  1368. const int64_t n_ctx_train = hparams.n_ctx_train;
  1369. if (n_expert > 0 && hparams.n_expert_used == 0) {
  1370. throw std::runtime_error("model has expert layers but no expert layers are used");
  1371. }
  1372. int n_moved_tensors = 0;
  1373. ggml_tensor * first_moved_tensor = nullptr;
  1374. ggml_backend_buffer_type_t first_moved_from_buft = nullptr;
  1375. ggml_backend_buffer_type_t first_moved_to_buft = nullptr;
  1376. auto create_tensor = [&](const LLM_TN_IMPL & tn, const std::initializer_list<int64_t> & ne, int flags) -> ggml_tensor * {
  1377. ggml_tensor * t_meta = ml.get_tensor_meta(tn.str().c_str());
  1378. if (!t_meta) {
  1379. if (flags & TENSOR_NOT_REQUIRED) {
  1380. return nullptr;
  1381. }
  1382. throw std::runtime_error(format("missing tensor '%s'", tn.str().c_str()));
  1383. }
  1384. // some models use the token embedding tensor as the output, but since these are used in different layers and with different ops
  1385. // the tensor is duplicated
  1386. // to handle this, we check if the tensor is duplicated, and if so, we assume that it is being loaded as the output tensor
  1387. llm_tensor tn_tensor = tn.tensor;
  1388. if (tn.tensor == LLM_TENSOR_TOKEN_EMBD && flags & TENSOR_DUPLICATED) {
  1389. tn_tensor = LLM_TENSOR_OUTPUT;
  1390. }
  1391. llm_tensor_info info;
  1392. try {
  1393. info = llm_tensor_info_for(tn_tensor);
  1394. } catch (const std::out_of_range & e) {
  1395. throw std::runtime_error(format("missing tensor info mapping for %s", tn.str().c_str()));
  1396. }
  1397. // skip unused tensors
  1398. if (info.op == GGML_OP_NONE) {
  1399. const size_t nbytes = ggml_nbytes(t_meta);
  1400. LLAMA_LOG_WARN("model has unused tensor %s (size = %zu bytes) -- ignoring\n", tn.str().c_str(), nbytes);
  1401. ml.size_data -= nbytes;
  1402. ml.n_created++;
  1403. return nullptr;
  1404. }
  1405. // tensors with "bias" suffix are always used with GGML_OP_ADD
  1406. ggml_op op;
  1407. bool bias = tn.suffix != nullptr && strcmp(tn.suffix, "bias") == 0;
  1408. if (bias) {
  1409. op = GGML_OP_ADD;
  1410. } else {
  1411. op = info.op;
  1412. }
  1413. // sanity checks
  1414. if (info.layer == LLM_TENSOR_LAYER_INPUT || info.layer == LLM_TENSOR_LAYER_OUTPUT) {
  1415. if (tn.bid != -1) {
  1416. GGML_ABORT("input/output layer tensor %s used with a layer number", tn.str().c_str());
  1417. }
  1418. } else {
  1419. if (tn.bid == -1) {
  1420. GGML_ABORT("repeating layer tensor %s used without a layer number", tn.str().c_str());
  1421. }
  1422. }
  1423. // select the buffer type for this tensor
  1424. buft_list_t * buft_list;
  1425. switch (info.layer) {
  1426. case LLM_TENSOR_LAYER_INPUT:
  1427. buft_list = pimpl->dev_input.buft_list;
  1428. break;
  1429. case LLM_TENSOR_LAYER_OUTPUT:
  1430. buft_list = pimpl->dev_output.buft_list;
  1431. break;
  1432. case LLM_TENSOR_LAYER_REPEATING:
  1433. buft_list = pimpl->dev_layer.at(tn.bid).buft_list;
  1434. break;
  1435. default:
  1436. GGML_ABORT("invalid layer %d for tensor %s", info.layer, tn.str().c_str());
  1437. }
  1438. ggml_backend_buffer_type_t buft = nullptr;
  1439. // check overrides
  1440. if (ml.tensor_buft_overrides) {
  1441. std::string tensor_name = tn.str();
  1442. for (const auto * overrides = ml.tensor_buft_overrides; overrides->pattern != nullptr; ++overrides) {
  1443. std::regex pattern(overrides->pattern);
  1444. if (std::regex_search(tensor_name, pattern)) {
  1445. LLAMA_LOG_DEBUG("tensor %s buffer type overriden to %s\n", tensor_name.c_str(), ggml_backend_buft_name(overrides->buft));
  1446. buft = overrides->buft;
  1447. break;
  1448. }
  1449. }
  1450. }
  1451. if (!buft) {
  1452. buft = select_weight_buft(hparams, t_meta, op, *buft_list);
  1453. if (!buft) {
  1454. throw std::runtime_error(format("failed to find a compatible buffer type for tensor %s", tn.str().c_str()));
  1455. }
  1456. }
  1457. // avoid using a host buffer when using mmap
  1458. auto * buft_dev = ggml_backend_buft_get_device(buft);
  1459. if (ml.use_mmap && buft_dev && buft == ggml_backend_dev_host_buffer_type(buft_dev)) {
  1460. auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
  1461. buft = ggml_backend_dev_buffer_type(cpu_dev);
  1462. }
  1463. if (buft != buft_list->front().second) {
  1464. n_moved_tensors++;
  1465. if (!first_moved_tensor) {
  1466. first_moved_tensor = t_meta;
  1467. first_moved_from_buft = buft_list->front().second;
  1468. first_moved_to_buft = buft;
  1469. }
  1470. }
  1471. ggml_context * ctx = ctx_for_buft(buft);
  1472. // if duplicated, check if the original tensor was allocated in the same buffer type context and avoid creating a new one
  1473. if (flags & TENSOR_DUPLICATED) {
  1474. ggml_tensor * t = ggml_get_tensor(ctx, tn.str().c_str());
  1475. if (t) {
  1476. return t;
  1477. }
  1478. }
  1479. return ml.create_tensor(ctx, tn, ne, flags);
  1480. };
  1481. layers.resize(n_layer);
  1482. // TODO: move to a separate function
  1483. const auto tn = LLM_TN(arch);
  1484. switch (arch) {
  1485. case LLM_ARCH_LLAMA:
  1486. case LLM_ARCH_REFACT:
  1487. case LLM_ARCH_MINICPM:
  1488. case LLM_ARCH_GRANITE:
  1489. case LLM_ARCH_GRANITE_MOE:
  1490. {
  1491. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  1492. // output
  1493. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  1494. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  1495. // if output is NULL, init from the input tok embed
  1496. if (output == NULL) {
  1497. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  1498. }
  1499. for (int i = 0; i < n_layer; ++i) {
  1500. auto & layer = layers[i];
  1501. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  1502. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
  1503. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  1504. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
  1505. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
  1506. // optional bias tensors
  1507. layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  1508. layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
  1509. layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
  1510. layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  1511. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  1512. if (hparams.rope_scaling_type_train == LLAMA_ROPE_SCALING_TYPE_LONGROPE) {
  1513. layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
  1514. layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
  1515. }
  1516. else {
  1517. layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
  1518. }
  1519. if (n_expert == 0) {
  1520. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  1521. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  1522. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  1523. // optional MLP bias
  1524. layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
  1525. layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  1526. layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
  1527. } else {
  1528. layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
  1529. layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, TENSOR_NOT_REQUIRED);
  1530. layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert}, 0);
  1531. layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
  1532. }
  1533. }
  1534. } break;
  1535. case LLM_ARCH_LLAMA4:
  1536. {
  1537. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  1538. // output
  1539. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  1540. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  1541. // if output is NULL, init from the input tok embed
  1542. if (output == NULL) {
  1543. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  1544. }
  1545. GGML_ASSERT(hparams.n_moe_layer_step > 0 && "Llama 4 requires n_moe_layer_step > 0");
  1546. for (int i = 0; i < n_layer; ++i) {
  1547. bool is_moe_layer = (i + 1) % hparams.n_moe_layer_step == 0;
  1548. auto & layer = layers[i];
  1549. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  1550. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
  1551. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  1552. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
  1553. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
  1554. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  1555. layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
  1556. if (is_moe_layer) {
  1557. int n_ff_exp = hparams.n_ff_exp;
  1558. layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
  1559. layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, 0);
  1560. layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff_exp, n_embd, n_expert}, 0);
  1561. layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, 0);
  1562. // Shared expert
  1563. const int64_t n_ff_shexp = n_ff_exp;
  1564. layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), { n_embd, n_ff_shexp}, 0);
  1565. layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp, n_embd }, 0);
  1566. layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), { n_embd, n_ff_shexp}, 0);
  1567. } else {
  1568. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  1569. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  1570. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  1571. }
  1572. }
  1573. } break;
  1574. case LLM_ARCH_DECI:
  1575. {
  1576. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  1577. // output
  1578. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  1579. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  1580. // if output is NULL, init from the input tok embed
  1581. if (output == NULL) {
  1582. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  1583. }
  1584. for (int i = 0; i < n_layer; ++i) {
  1585. auto & layer = layers[i];
  1586. const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(i);
  1587. const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(i);
  1588. const int64_t n_embd_gqa = hparams.n_embd_v_gqa(i);
  1589. const int64_t n_ff = hparams.n_ff(i);
  1590. const int64_t n_head = hparams.n_head(i);
  1591. const int64_t n_head_kv = hparams.n_head_kv(i);
  1592. if (n_head_kv == 0 && n_head > 0) {
  1593. // linear attention for DeciLMCausalModel
  1594. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  1595. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  1596. }
  1597. else if (n_head_kv > 0) {
  1598. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  1599. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
  1600. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  1601. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
  1602. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
  1603. }
  1604. // optional bias tensors
  1605. layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  1606. layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
  1607. layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
  1608. layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  1609. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  1610. if (hparams.rope_scaling_type_train == LLAMA_ROPE_SCALING_TYPE_LONGROPE) {
  1611. layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
  1612. layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
  1613. }
  1614. else {
  1615. layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
  1616. }
  1617. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  1618. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  1619. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  1620. // optional MLP bias
  1621. layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
  1622. layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  1623. layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
  1624. }
  1625. } break;
  1626. case LLM_ARCH_MINICPM3:
  1627. {
  1628. const int64_t n_embd_head_qk_rope = hparams.n_rot;
  1629. const int64_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
  1630. const int64_t q_lora_rank = hparams.n_lora_q;
  1631. const int64_t kv_lora_rank = hparams.n_lora_kv;
  1632. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  1633. // output
  1634. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  1635. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  1636. // if output is NULL, init from the input tok embed
  1637. if (output == NULL) {
  1638. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  1639. }
  1640. for (int i = 0; i < n_layer; ++i) {
  1641. auto & layer = layers[i];
  1642. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  1643. layer.attn_q_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_A_NORM, "weight", i), {q_lora_rank}, 0);
  1644. layer.attn_kv_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_NORM, "weight", i), {kv_lora_rank}, 0);
  1645. layer.wq_a = create_tensor(tn(LLM_TENSOR_ATTN_Q_A, "weight", i), {n_embd, q_lora_rank}, 0);
  1646. layer.wq_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_B, "weight", i), {q_lora_rank, n_head * n_embd_head_k}, 0);
  1647. layer.wkv_a_mqa = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + (n_embd_head_qk_rope)}, 0);
  1648. layer.wkv_b = create_tensor(tn(LLM_TENSOR_ATTN_KV_B, "weight", i), {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)}, 0);
  1649. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_head * ( n_embd_head_v), n_embd}, 0);
  1650. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  1651. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  1652. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  1653. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  1654. layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), { n_embd_head_qk_rope/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
  1655. layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), { n_embd_head_qk_rope/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
  1656. }
  1657. } break;
  1658. case LLM_ARCH_GROK:
  1659. {
  1660. if (n_expert == 0) {
  1661. throw std::runtime_error("Grok model cannot have zero experts");
  1662. }
  1663. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  1664. // output
  1665. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  1666. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  1667. // if output is NULL, init from the input tok embed
  1668. if (output == NULL) {
  1669. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  1670. }
  1671. for (int i = 0; i < n_layer; ++i) {
  1672. auto & layer = layers[i];
  1673. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  1674. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  1675. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  1676. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  1677. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  1678. layer.attn_out_norm = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}, 0);
  1679. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  1680. layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
  1681. layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, TENSOR_NOT_REQUIRED);
  1682. layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert}, 0);
  1683. layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
  1684. layer.layer_out_norm = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd}, 0);
  1685. }
  1686. } break;
  1687. case LLM_ARCH_DBRX:
  1688. {
  1689. if (n_expert == 0) {
  1690. throw std::runtime_error("DBRX model cannot have zero experts");
  1691. }
  1692. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  1693. // output
  1694. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  1695. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  1696. for (int i = 0; i < n_layer; ++i) {
  1697. auto & layer = layers[i];
  1698. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  1699. layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
  1700. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  1701. layer.attn_out_norm = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}, 0);
  1702. layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
  1703. layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
  1704. layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff, n_embd, n_expert}, 0);
  1705. layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
  1706. }
  1707. } break;
  1708. case LLM_ARCH_BAICHUAN:
  1709. {
  1710. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  1711. {
  1712. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  1713. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  1714. }
  1715. for (int i = 0; i < n_layer; ++i) {
  1716. auto & layer = layers[i];
  1717. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  1718. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  1719. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  1720. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  1721. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  1722. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  1723. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  1724. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  1725. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  1726. }
  1727. } break;
  1728. case LLM_ARCH_FALCON:
  1729. {
  1730. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  1731. // output
  1732. {
  1733. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  1734. output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
  1735. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  1736. if (!output) {
  1737. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); // needs to be on GPU
  1738. }
  1739. }
  1740. for (int i = 0; i < n_layer; ++i) {
  1741. auto & layer = layers[i];
  1742. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  1743. layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
  1744. layer.attn_norm_2 = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
  1745. layer.attn_norm_2_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  1746. layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
  1747. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  1748. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  1749. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  1750. }
  1751. } break;
  1752. case LLM_ARCH_STARCODER:
  1753. {
  1754. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  1755. pos_embd = create_tensor(tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, n_ctx_train}, 0);
  1756. // output
  1757. {
  1758. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  1759. output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
  1760. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  1761. if (!output) {
  1762. // needs to be on GPU
  1763. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  1764. }
  1765. }
  1766. for (int i = 0; i < n_layer; ++i) {
  1767. auto & layer = layers[i];
  1768. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  1769. layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
  1770. layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
  1771. layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
  1772. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  1773. layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
  1774. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  1775. layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
  1776. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
  1777. layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
  1778. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  1779. layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
  1780. }
  1781. } break;
  1782. case LLM_ARCH_BERT:
  1783. case LLM_ARCH_NOMIC_BERT:
  1784. {
  1785. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  1786. type_embd = create_tensor(tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_token_types}, 0);
  1787. if (arch == LLM_ARCH_BERT) {
  1788. pos_embd = create_tensor(tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, n_ctx_train}, 0);
  1789. cls = create_tensor(tn(LLM_TENSOR_CLS, "weight"), {n_embd, n_embd}, TENSOR_NOT_REQUIRED);
  1790. cls_b = create_tensor(tn(LLM_TENSOR_CLS, "bias"), {n_embd}, TENSOR_NOT_REQUIRED);
  1791. cls_out = create_tensor(tn(LLM_TENSOR_CLS_OUT, "weight"), {n_embd, 1}, TENSOR_NOT_REQUIRED);
  1792. cls_out_b = create_tensor(tn(LLM_TENSOR_CLS_OUT, "bias"), {1}, TENSOR_NOT_REQUIRED);
  1793. }
  1794. tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0);
  1795. tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}, 0);
  1796. for (int i = 0; i < n_layer; ++i) {
  1797. auto & layer = layers[i];
  1798. if (arch == LLM_ARCH_BERT) {
  1799. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  1800. layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, 0);
  1801. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  1802. layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, 0);
  1803. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  1804. layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, 0);
  1805. } else {
  1806. layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
  1807. }
  1808. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  1809. layer.attn_out_norm = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}, 0);
  1810. layer.attn_out_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "bias", i), {n_embd}, 0);
  1811. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  1812. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
  1813. if (arch == LLM_ARCH_BERT) {
  1814. layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
  1815. layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
  1816. layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
  1817. } else {
  1818. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  1819. }
  1820. layer.layer_out_norm = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd}, 0);
  1821. layer.layer_out_norm_b = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "bias", i), {n_embd}, 0);
  1822. }
  1823. } break;
  1824. case LLM_ARCH_JINA_BERT_V2:
  1825. {
  1826. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); // word_embeddings
  1827. type_embd = create_tensor(tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_token_types}, 0); // token_type_embeddings
  1828. tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0); // LayerNorm
  1829. tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}, 0); //LayerNorm bias
  1830. cls = create_tensor(tn(LLM_TENSOR_CLS, "weight"), {n_embd, 1}, TENSOR_NOT_REQUIRED);
  1831. cls_b = create_tensor(tn(LLM_TENSOR_CLS, "bias"), {1}, TENSOR_NOT_REQUIRED);
  1832. for (int i = 0; i < n_layer; ++i) {
  1833. auto & layer = layers[i]; // JinaBertLayer
  1834. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  1835. layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, 0);
  1836. layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
  1837. layer.attn_q_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  1838. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  1839. layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, 0);
  1840. layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
  1841. layer.attn_k_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  1842. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  1843. layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, 0);
  1844. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0); //output_dens
  1845. layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0); //output_dens
  1846. layer.attn_out_norm = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}, 0); //output_norm
  1847. layer.attn_out_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "bias", i), {n_embd}, 0);
  1848. layer.attn_norm_2 = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
  1849. layer.attn_norm_2_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  1850. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  1851. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  1852. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
  1853. layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
  1854. layer.layer_out_norm = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd}, 0);
  1855. layer.layer_out_norm_b = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "bias", i), {n_embd}, 0);
  1856. }
  1857. } break;
  1858. case LLM_ARCH_BLOOM:
  1859. {
  1860. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  1861. tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0);
  1862. tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}, 0);
  1863. // output
  1864. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  1865. output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
  1866. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  1867. // if output is NULL, init from the input tok embed
  1868. if (output == NULL) {
  1869. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  1870. }
  1871. for (int i = 0; i < n_layer; ++i) {
  1872. auto & layer = layers[i];
  1873. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  1874. layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
  1875. layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
  1876. layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
  1877. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  1878. layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
  1879. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  1880. layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
  1881. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
  1882. layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
  1883. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  1884. layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
  1885. }
  1886. } break;
  1887. case LLM_ARCH_MPT:
  1888. {
  1889. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  1890. pos_embd = create_tensor(tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, n_ctx_train}, TENSOR_NOT_REQUIRED);
  1891. // output
  1892. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  1893. output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, TENSOR_NOT_REQUIRED);
  1894. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  1895. if (!output) {
  1896. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); // needs to be on GPU
  1897. }
  1898. for (int i = 0; i < n_layer; ++i) {
  1899. auto & layer = layers[i];
  1900. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  1901. layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  1902. layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
  1903. layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
  1904. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  1905. layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  1906. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  1907. layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  1908. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
  1909. layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  1910. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  1911. layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
  1912. layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
  1913. layer.attn_q_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  1914. layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
  1915. layer.attn_k_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  1916. // AWQ ScaleActivation layer
  1917. layer.ffn_act = create_tensor(tn(LLM_TENSOR_FFN_ACT, "scales", i), {n_ff}, TENSOR_NOT_REQUIRED);
  1918. }
  1919. } break;
  1920. case LLM_ARCH_STABLELM:
  1921. {
  1922. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  1923. // output
  1924. output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
  1925. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  1926. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  1927. for (int i = 0; i < n_layer; ++i) {
  1928. auto & layer = layers[i];
  1929. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  1930. layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
  1931. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  1932. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  1933. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  1934. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  1935. // optional bias tensors, present in Stable LM 2 1.6B
  1936. layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  1937. layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
  1938. layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
  1939. // optional q and k layernorms, present in StableLM 2 12B
  1940. layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k, n_head}, TENSOR_NOT_REQUIRED);
  1941. layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k, n_head_kv}, TENSOR_NOT_REQUIRED);
  1942. // optional FFN norm, not present in StableLM 2 12B which uses parallel residual
  1943. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
  1944. layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  1945. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  1946. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  1947. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  1948. }
  1949. } break;
  1950. case LLM_ARCH_QWEN:
  1951. {
  1952. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  1953. // output
  1954. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  1955. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  1956. for (int i = 0; i < n_layer; ++i) {
  1957. auto & layer = layers[i];
  1958. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  1959. layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd*3}, 0);
  1960. layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd*3}, 0);
  1961. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  1962. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  1963. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff/2}, 0);
  1964. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff/2, n_embd}, 0);
  1965. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff/2}, 0);
  1966. }
  1967. } break;
  1968. case LLM_ARCH_QWEN2:
  1969. case LLM_ARCH_QWEN2VL:
  1970. {
  1971. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  1972. // output
  1973. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  1974. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  1975. // if output is NULL, init from the input tok embed
  1976. if (output == NULL) {
  1977. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  1978. }
  1979. for (int i = 0; i < n_layer; ++i) {
  1980. auto & layer = layers[i];
  1981. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  1982. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  1983. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  1984. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  1985. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  1986. // optional bias tensors
  1987. layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, 0);
  1988. layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, 0);
  1989. layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, 0);
  1990. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  1991. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  1992. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  1993. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  1994. }
  1995. } break;
  1996. case LLM_ARCH_QWEN2MOE:
  1997. {
  1998. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  1999. // output
  2000. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2001. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  2002. for (int i = 0; i < n_layer; ++i) {
  2003. auto & layer = layers[i];
  2004. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2005. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  2006. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  2007. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  2008. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  2009. // optional bias tensors
  2010. layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2011. layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
  2012. layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
  2013. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  2014. layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
  2015. if (n_expert == 0) {
  2016. throw std::runtime_error("n_expert must be > 0 for QWEN2MOE");
  2017. }
  2018. if (n_expert_used == 0) {
  2019. throw std::runtime_error("n_expert_used must be > 0 for QWEN2MOE");
  2020. }
  2021. // MoE branch
  2022. const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
  2023. layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
  2024. layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
  2025. layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
  2026. // Shared expert branch
  2027. const int64_t n_ff_shexp = hparams.n_ff_shexp ? hparams.n_ff_shexp : n_ff;
  2028. layer.ffn_gate_inp_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP_SHEXP, "weight", i), {n_embd}, 0);
  2029. layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), { n_embd, n_ff_shexp}, 0);
  2030. layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp, n_embd}, 0);
  2031. layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), { n_embd, n_ff_shexp}, 0);
  2032. }
  2033. } break;
  2034. case LLM_ARCH_PHI2:
  2035. {
  2036. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2037. // output
  2038. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2039. output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
  2040. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  2041. output_b = create_tensor(tn(LLM_TENSOR_OUTPUT, "bias"), {n_vocab}, 0);
  2042. for (int i = 0; i < n_layer; ++i) {
  2043. auto & layer = layers[i];
  2044. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2045. layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
  2046. layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
  2047. layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
  2048. if (layer.wqkv == nullptr) {
  2049. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  2050. layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, 0);
  2051. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  2052. layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, 0);
  2053. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  2054. layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, 0);
  2055. }
  2056. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  2057. layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
  2058. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
  2059. layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
  2060. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  2061. layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
  2062. }
  2063. } break;
  2064. case LLM_ARCH_PHI3:
  2065. {
  2066. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
  2067. // output
  2068. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
  2069. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  2070. // if output is NULL, init from the input tok embed
  2071. if (output == NULL) {
  2072. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  2073. }
  2074. for (int i = 0; i < n_layer; ++i) {
  2075. auto & layer = layers[i];
  2076. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);
  2077. layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), { n_embd, n_embd + 2 * n_embd_gqa }, TENSOR_NOT_REQUIRED);
  2078. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd, n_embd }, 0);
  2079. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd }, 0);
  2080. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd }, 0);
  2081. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, 2 * n_ff }, 0);
  2082. layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), { n_rot/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
  2083. layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), { n_rot/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
  2084. }
  2085. } break;
  2086. case LLM_ARCH_PHIMOE:
  2087. {
  2088. const int64_t n_embd_head = n_embd / n_head;
  2089. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
  2090. // output
  2091. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
  2092. output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
  2093. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab }, 0);
  2094. output_b = create_tensor(tn(LLM_TENSOR_OUTPUT, "bias"), { n_vocab }, 0);
  2095. for (int i = 0; i < n_layer; ++i) {
  2096. auto & layer = layers[i];
  2097. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);
  2098. layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), { n_embd }, 0);
  2099. layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), { n_embd, n_embd + 2 * n_embd_gqa }, TENSOR_NOT_REQUIRED);
  2100. if (layer.wqkv == nullptr) {
  2101. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  2102. layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, 0);
  2103. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  2104. layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, 0);
  2105. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  2106. layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, 0);
  2107. }
  2108. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd, n_embd }, 0);
  2109. layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), { n_embd }, 0);
  2110. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd }, 0);
  2111. layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), { n_embd }, 0);
  2112. layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
  2113. layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
  2114. layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff, n_embd, n_expert}, 0);
  2115. layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
  2116. layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), { n_embd_head/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
  2117. layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), { n_embd_head/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
  2118. }
  2119. } break;
  2120. case LLM_ARCH_PLAMO:
  2121. {
  2122. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2123. // output
  2124. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2125. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  2126. for (int i = 0; i < n_layer; ++i) {
  2127. auto & layer = layers[i];
  2128. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2129. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  2130. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  2131. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  2132. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  2133. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  2134. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  2135. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  2136. }
  2137. } break;
  2138. case LLM_ARCH_GPT2:
  2139. {
  2140. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2141. pos_embd = create_tensor(tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, n_ctx_train}, 0);
  2142. // output
  2143. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2144. output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
  2145. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  2146. // if output is NULL, init from the input tok embed
  2147. if (output == NULL) {
  2148. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  2149. }
  2150. for (int i = 0; i < n_layer; ++i) {
  2151. auto & layer = layers[i];
  2152. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2153. layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
  2154. layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
  2155. layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
  2156. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  2157. layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
  2158. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  2159. layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
  2160. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
  2161. layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
  2162. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  2163. layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
  2164. }
  2165. } break;
  2166. case LLM_ARCH_CODESHELL:
  2167. {
  2168. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  2169. // if tok embd is NULL, init from output
  2170. if (tok_embd == NULL) {
  2171. tok_embd = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  2172. }
  2173. // output
  2174. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2175. output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
  2176. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  2177. for (int i = 0; i < n_layer; ++i) {
  2178. auto & layer = layers[i];
  2179. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2180. layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
  2181. layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
  2182. layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
  2183. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  2184. layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
  2185. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  2186. layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
  2187. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
  2188. layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
  2189. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  2190. layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
  2191. }
  2192. } break;
  2193. case LLM_ARCH_ORION:
  2194. {
  2195. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2196. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2197. output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
  2198. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  2199. for (int i = 0; i < n_layer; ++i) {
  2200. auto & layer = layers[i];
  2201. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2202. layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
  2203. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  2204. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  2205. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  2206. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  2207. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  2208. layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
  2209. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  2210. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  2211. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  2212. }
  2213. } break;
  2214. case LLM_ARCH_INTERNLM2:
  2215. {
  2216. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2217. // output
  2218. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2219. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  2220. for (int i = 0; i < n_layer; ++i) {
  2221. auto & layer = layers[i];
  2222. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2223. // layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
  2224. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  2225. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  2226. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  2227. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  2228. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  2229. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  2230. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  2231. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  2232. }
  2233. } break;
  2234. case LLM_ARCH_GEMMA:
  2235. {
  2236. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2237. // output
  2238. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2239. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); // same as tok_embd, duplicated to allow offloading
  2240. for (int i = 0; i < n_layer; ++i) {
  2241. auto & layer = layers[i];
  2242. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2243. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
  2244. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  2245. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
  2246. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
  2247. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  2248. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  2249. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  2250. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  2251. }
  2252. } break;
  2253. case LLM_ARCH_GEMMA2:
  2254. {
  2255. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2256. // output
  2257. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2258. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); // same as tok_embd, duplicated to allow offloading
  2259. for (int i = 0; i < n_layer; ++i) {
  2260. auto & layer = layers[i];
  2261. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2262. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
  2263. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  2264. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
  2265. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
  2266. layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
  2267. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  2268. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  2269. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  2270. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  2271. layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
  2272. }
  2273. } break;
  2274. case LLM_ARCH_GEMMA3:
  2275. {
  2276. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2277. // output
  2278. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2279. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  2280. // if output is NULL, init from the input tok embed
  2281. if (output == NULL) {
  2282. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  2283. }
  2284. for (int i = 0; i < n_layer; ++i) {
  2285. auto & layer = layers[i];
  2286. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2287. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
  2288. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  2289. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
  2290. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
  2291. layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
  2292. layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
  2293. layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
  2294. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  2295. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  2296. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  2297. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  2298. layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
  2299. }
  2300. } break;
  2301. case LLM_ARCH_STARCODER2:
  2302. {
  2303. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2304. // output
  2305. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2306. output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
  2307. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  2308. // if output is NULL, init from the input tok embed
  2309. if (output == NULL) {
  2310. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  2311. }
  2312. for (int i = 0; i < n_layer; ++i) {
  2313. auto & layer = layers[i];
  2314. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2315. layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
  2316. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  2317. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  2318. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  2319. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  2320. // optional bias tensors
  2321. layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, 0);
  2322. layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, 0);
  2323. layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, 0);
  2324. layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
  2325. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  2326. layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
  2327. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  2328. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  2329. // optional bias tensors
  2330. layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
  2331. layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP , "bias", i), { n_ff}, 0);
  2332. }
  2333. } break;
  2334. case LLM_ARCH_MAMBA:
  2335. {
  2336. const int64_t d_conv = hparams.ssm_d_conv;
  2337. const int64_t d_inner = hparams.ssm_d_inner;
  2338. const int64_t d_state = hparams.ssm_d_state;
  2339. const int64_t dt_rank = hparams.ssm_dt_rank;
  2340. // only an expansion factor of 2 is supported for now
  2341. if (2 * n_embd != d_inner) {
  2342. throw std::runtime_error("only an expansion factor of 2 is supported for now");
  2343. }
  2344. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2345. // output
  2346. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2347. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  2348. // if output is NULL, init from the input tok embed, duplicated to allow offloading
  2349. if (output == NULL) {
  2350. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  2351. }
  2352. for (int i = 0; i < n_layer; ++i) {
  2353. auto & layer = layers[i];
  2354. // norm
  2355. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2356. layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), {n_embd, 2*d_inner}, 0);
  2357. layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {d_conv, d_inner}, 0);
  2358. layer.ssm_conv1d_b = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "bias", i), {d_inner}, 0);
  2359. layer.ssm_x = create_tensor(tn(LLM_TENSOR_SSM_X, "weight", i), {d_inner, dt_rank + 2*d_state}, 0);
  2360. layer.ssm_dt = create_tensor(tn(LLM_TENSOR_SSM_DT, "weight", i), {dt_rank, d_inner}, 0);
  2361. layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {d_inner}, 0);
  2362. // no "weight" suffix for these
  2363. layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {d_state, d_inner}, 0);
  2364. layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {d_inner}, 0);
  2365. // out_proj
  2366. layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {d_inner, n_embd}, 0);
  2367. }
  2368. } break;
  2369. case LLM_ARCH_XVERSE:
  2370. {
  2371. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2372. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2373. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  2374. for (int i = 0; i < n_layer; ++i) {
  2375. auto & layer = layers[i];
  2376. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2377. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  2378. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  2379. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  2380. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  2381. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  2382. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  2383. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  2384. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  2385. }
  2386. } break;
  2387. case LLM_ARCH_COMMAND_R:
  2388. {
  2389. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2390. // output
  2391. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2392. // init output from the input tok embed
  2393. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  2394. for (int i = 0; i < n_layer; ++i) {
  2395. auto & layer = layers[i];
  2396. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2397. if (n_layer >= 64){
  2398. layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k, n_head}, 0);
  2399. layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k, n_head_kv}, 0);
  2400. }
  2401. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  2402. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  2403. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  2404. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  2405. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  2406. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  2407. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  2408. }
  2409. } break;
  2410. case LLM_ARCH_COHERE2:
  2411. {
  2412. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
  2413. // output
  2414. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
  2415. // init output from the input tok embed
  2416. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab },
  2417. TENSOR_DUPLICATED);
  2418. for (int i = 0; i < n_layer; ++i) {
  2419. auto & layer = layers[i];
  2420. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);
  2421. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd }, 0);
  2422. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_gqa }, 0);
  2423. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_gqa }, 0);
  2424. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd, n_embd }, 0);
  2425. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), { n_embd, n_ff }, 0);
  2426. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd }, 0);
  2427. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, n_ff }, 0);
  2428. }
  2429. }
  2430. break;
  2431. case LLM_ARCH_OLMO: // adapted from LLM_ARCH_LLAMA with norm params removed
  2432. {
  2433. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2434. // output
  2435. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  2436. // if output is NULL, init from the input tok embed
  2437. if (output == NULL) {
  2438. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  2439. }
  2440. for (int i = 0; i < n_layer; ++i) {
  2441. auto & layer = layers[i];
  2442. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  2443. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  2444. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  2445. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  2446. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  2447. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  2448. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  2449. }
  2450. } break;
  2451. case LLM_ARCH_OLMO2:
  2452. {
  2453. const int64_t n_embd_head = n_embd / n_head;
  2454. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2455. // output
  2456. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2457. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  2458. for (int i = 0; i < n_layer; ++i) {
  2459. auto & layer = layers[i];
  2460. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  2461. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  2462. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  2463. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  2464. layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, 0);
  2465. layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_head_kv * n_embd_head}, 0);
  2466. layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
  2467. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  2468. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  2469. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  2470. layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
  2471. }
  2472. } break;
  2473. case LLM_ARCH_OLMOE:
  2474. {
  2475. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2476. // output
  2477. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2478. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  2479. for (int i = 0; i < n_layer; ++i) {
  2480. auto & layer = layers[i];
  2481. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2482. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  2483. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  2484. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  2485. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  2486. layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, 0);
  2487. layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, 0);
  2488. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  2489. layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
  2490. if (n_expert == 0) {
  2491. throw std::runtime_error("n_expert must be > 0");
  2492. }
  2493. if (n_expert_used == 0) {
  2494. throw std::runtime_error("n_expert_used must be > 0");
  2495. }
  2496. // MoE branch
  2497. layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
  2498. layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff, n_embd, n_expert}, 0);
  2499. layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
  2500. }
  2501. } break;
  2502. case LLM_ARCH_OPENELM:
  2503. {
  2504. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2505. // output
  2506. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2507. // init output from the input tok embed
  2508. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  2509. for (int i = 0; i < n_layer; ++i) {
  2510. const int64_t n_head = hparams.n_head(i);
  2511. const int64_t n_head_qkv = 2*hparams.n_head_kv(i) + n_head;
  2512. const int64_t n_ff = hparams.n_ff(i);
  2513. auto & layer = layers[i];
  2514. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2515. layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_head_qkv*n_embd_head_k}, 0);
  2516. layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
  2517. layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
  2518. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_head*n_embd_head_k, n_embd}, 0);
  2519. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  2520. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  2521. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
  2522. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  2523. }
  2524. } break;
  2525. case LLM_ARCH_GPTNEOX:
  2526. {
  2527. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2528. // output
  2529. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2530. output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
  2531. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  2532. for (int i = 0; i < n_layer; ++i) {
  2533. auto & layer = layers[i];
  2534. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2535. layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
  2536. layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
  2537. layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
  2538. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  2539. layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
  2540. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  2541. layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
  2542. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
  2543. layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
  2544. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  2545. layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
  2546. }
  2547. } break;
  2548. case LLM_ARCH_ARCTIC:
  2549. {
  2550. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2551. // output
  2552. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2553. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  2554. // if output is NULL, init from the input tok embed
  2555. if (output == NULL) {
  2556. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  2557. }
  2558. for (int i = 0; i < n_layer; ++i) {
  2559. auto & layer = layers[i];
  2560. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2561. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  2562. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  2563. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  2564. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  2565. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  2566. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_embd}, 0);
  2567. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_embd, n_embd}, 0);
  2568. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_embd}, 0);
  2569. layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
  2570. layer.ffn_norm_exps = create_tensor(tn(LLM_TENSOR_FFN_NORM_EXPS, "weight", i), {n_embd}, 0);
  2571. layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, false);
  2572. layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert}, 0);
  2573. layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
  2574. }
  2575. } break;
  2576. case LLM_ARCH_DEEPSEEK:
  2577. {
  2578. const int64_t n_ff_exp = hparams.n_ff_exp;
  2579. const int64_t n_expert_shared = hparams.n_expert_shared;
  2580. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2581. // output
  2582. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2583. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  2584. for (int i = 0; i < n_layer; ++i) {
  2585. auto & layer = layers[i];
  2586. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2587. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  2588. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  2589. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  2590. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  2591. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  2592. if (i < (int) hparams.n_layer_dense_lead) {
  2593. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  2594. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  2595. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  2596. } else {
  2597. layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
  2598. if (n_expert == 0) {
  2599. throw std::runtime_error("n_expert must be > 0");
  2600. }
  2601. if (n_expert_used == 0) {
  2602. throw std::runtime_error("n_expert_used must be > 0");
  2603. }
  2604. // MoE branch
  2605. layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
  2606. layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
  2607. layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
  2608. // Shared expert branch
  2609. layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
  2610. layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_exp * n_expert_shared, n_embd}, 0);
  2611. layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
  2612. }
  2613. }
  2614. } break;
  2615. case LLM_ARCH_DEEPSEEK2:
  2616. {
  2617. const bool is_lite = (hparams.n_layer == 27);
  2618. const int64_t n_embd_head_qk_rope = hparams.n_rot;
  2619. const int64_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
  2620. const int64_t q_lora_rank = hparams.n_lora_q;
  2621. const int64_t kv_lora_rank = hparams.n_lora_kv;
  2622. const int64_t n_ff_exp = hparams.n_ff_exp;
  2623. const int64_t n_expert_shared = hparams.n_expert_shared;
  2624. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2625. // output
  2626. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2627. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  2628. for (int i = 0; i < n_layer; ++i) {
  2629. auto & layer = layers[i];
  2630. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2631. if (!is_lite) {
  2632. layer.attn_q_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_A_NORM, "weight", i), {q_lora_rank}, 0);
  2633. }
  2634. layer.attn_kv_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_NORM, "weight", i), {kv_lora_rank}, 0);
  2635. if (!is_lite) {
  2636. layer.wq_a = create_tensor(tn(LLM_TENSOR_ATTN_Q_A, "weight", i), {n_embd, q_lora_rank}, 0);
  2637. layer.wq_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_B, "weight", i), {q_lora_rank, n_head * n_embd_head_k}, 0);
  2638. } else {
  2639. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  2640. }
  2641. layer.wkv_a_mqa = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + (n_embd_head_qk_rope)}, 0);
  2642. layer.wkv_b = create_tensor(tn(LLM_TENSOR_ATTN_KV_B, "weight", i), {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)}, 0);
  2643. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_head * ( n_embd_head_v), n_embd}, 0);
  2644. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  2645. if (i < (int) hparams.n_layer_dense_lead) {
  2646. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  2647. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  2648. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  2649. } else {
  2650. layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
  2651. layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, TENSOR_NOT_REQUIRED);
  2652. if (n_expert == 0) {
  2653. throw std::runtime_error("n_expert must be > 0");
  2654. }
  2655. if (n_expert_used == 0) {
  2656. throw std::runtime_error("n_expert_used must be > 0");
  2657. }
  2658. // MoE branch
  2659. layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
  2660. layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
  2661. layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
  2662. // Shared expert branch
  2663. layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
  2664. layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_exp * n_expert_shared, n_embd}, 0);
  2665. layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
  2666. }
  2667. }
  2668. } break;
  2669. case LLM_ARCH_PLM:
  2670. {
  2671. const int64_t n_embd_head_qk_rope = hparams.n_rot;
  2672. const int64_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
  2673. const int64_t kv_lora_rank = hparams.n_lora_kv;
  2674. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2675. // output
  2676. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2677. // output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  2678. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  2679. for (int i = 0; i < n_layer; ++i) {
  2680. auto & layer = layers[i];
  2681. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2682. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
  2683. layer.wkv_a_mqa = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + (n_embd_head_qk_rope)}, 0);
  2684. layer.attn_kv_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_NORM, "weight", i), {kv_lora_rank}, 0);
  2685. layer.wkv_b = create_tensor(tn(LLM_TENSOR_ATTN_KV_B, "weight", i), {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)}, 0);
  2686. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_head * ( n_embd_head_v), n_embd}, 0);
  2687. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  2688. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  2689. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  2690. }
  2691. } break;
  2692. case LLM_ARCH_BITNET:
  2693. {
  2694. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2695. // output
  2696. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2697. for (int i = 0; i < n_layer; ++i) {
  2698. auto & layer = layers[i];
  2699. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2700. layer.attn_sub_norm = create_tensor(tn(LLM_TENSOR_ATTN_SUB_NORM, "weight", i), {n_embd}, 0);
  2701. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  2702. layer.wq_scale = create_tensor(tn(LLM_TENSOR_ATTN_Q, "scale", i), {1}, TENSOR_NOT_REQUIRED);
  2703. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  2704. layer.wk_scale = create_tensor(tn(LLM_TENSOR_ATTN_K, "scale", i), {1}, TENSOR_NOT_REQUIRED);
  2705. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  2706. layer.wv_scale = create_tensor(tn(LLM_TENSOR_ATTN_V, "scale", i), {1}, TENSOR_NOT_REQUIRED);
  2707. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  2708. layer.wo_scale = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "scale", i), {1}, TENSOR_NOT_REQUIRED);
  2709. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  2710. layer.ffn_sub_norm = create_tensor(tn(LLM_TENSOR_FFN_SUB_NORM, "weight", i), {n_ff}, 0);
  2711. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  2712. layer.ffn_gate_scale = create_tensor(tn(LLM_TENSOR_FFN_GATE, "scale", i), {1}, TENSOR_NOT_REQUIRED);
  2713. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
  2714. layer.ffn_down_scale = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "scale", i), {1}, TENSOR_NOT_REQUIRED);
  2715. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  2716. layer.ffn_up_scale = create_tensor(tn(LLM_TENSOR_FFN_UP, "scale", i), {1}, TENSOR_NOT_REQUIRED);
  2717. }
  2718. } break;
  2719. case LLM_ARCH_T5:
  2720. {
  2721. const auto n_rel_attn_bkts = hparams.n_rel_attn_bkts;
  2722. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2723. // output
  2724. output_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2725. output_norm = create_tensor(tn(LLM_TENSOR_DEC_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2726. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  2727. // if output is NULL, init from the input tok embed
  2728. if (output == NULL) {
  2729. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  2730. }
  2731. for (int i = 0; i < n_layer; ++i) {
  2732. auto & layer = layers[i];
  2733. layer.attn_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_NORM, "weight", i), {n_embd}, 0);
  2734. layer.attn_rel_b_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_REL_B, "weight", i), {n_head, n_rel_attn_bkts}, TENSOR_NOT_REQUIRED);
  2735. layer.wq_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_Q, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  2736. layer.wk_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  2737. layer.wv_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
  2738. layer.wo_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_OUT, "weight", i), {n_embd_v_gqa, n_embd}, 0);
  2739. layer.ffn_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_NORM, "weight", i), {n_embd}, 0);
  2740. layer.ffn_gate_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_GATE, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED);
  2741. layer.ffn_down_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  2742. layer.ffn_up_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  2743. layer.attn_norm = create_tensor(tn(LLM_TENSOR_DEC_ATTN_NORM, "weight", i), {n_embd}, 0);
  2744. layer.attn_rel_b = create_tensor(tn(LLM_TENSOR_DEC_ATTN_REL_B, "weight", i), {n_head, n_rel_attn_bkts}, TENSOR_NOT_REQUIRED);
  2745. layer.wq = create_tensor(tn(LLM_TENSOR_DEC_ATTN_Q, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  2746. layer.wk = create_tensor(tn(LLM_TENSOR_DEC_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  2747. layer.wv = create_tensor(tn(LLM_TENSOR_DEC_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
  2748. layer.wo = create_tensor(tn(LLM_TENSOR_DEC_ATTN_OUT, "weight", i), {n_embd_v_gqa, n_embd}, 0);
  2749. layer.attn_norm_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_NORM, "weight", i), {n_embd}, 0);
  2750. // this tensor seems to be unused in HF transformers implementation
  2751. layer.attn_rel_b_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_REL_B, "weight", i), {n_head, n_rel_attn_bkts}, TENSOR_NOT_REQUIRED);
  2752. layer.wq_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_Q, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  2753. layer.wk_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  2754. layer.wv_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
  2755. layer.wo_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_OUT, "weight", i), {n_embd_v_gqa, n_embd}, 0);
  2756. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_DEC_FFN_NORM, "weight", i), {n_embd}, 0);
  2757. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_DEC_FFN_GATE, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED);
  2758. layer.ffn_down = create_tensor(tn(LLM_TENSOR_DEC_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  2759. layer.ffn_up = create_tensor(tn(LLM_TENSOR_DEC_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  2760. }
  2761. } break;
  2762. case LLM_ARCH_T5ENCODER:
  2763. {
  2764. const auto n_rel_attn_bkts = hparams.n_rel_attn_bkts;
  2765. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2766. // output
  2767. output_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2768. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  2769. // if output is NULL, init from the input tok embed
  2770. if (output == NULL) {
  2771. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  2772. }
  2773. for (int i = 0; i < n_layer; ++i) {
  2774. auto & layer = layers[i];
  2775. layer.attn_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_NORM, "weight", i), {n_embd}, 0);
  2776. layer.attn_rel_b_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_REL_B, "weight", i), {n_head, n_rel_attn_bkts}, TENSOR_NOT_REQUIRED);
  2777. layer.wq_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_Q, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  2778. layer.wk_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  2779. layer.wv_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
  2780. layer.wo_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_OUT, "weight", i), {n_embd_v_gqa, n_embd}, 0);
  2781. layer.ffn_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_NORM, "weight", i), {n_embd}, 0);
  2782. layer.ffn_gate_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_GATE, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED);
  2783. layer.ffn_down_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  2784. layer.ffn_up_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  2785. }
  2786. } break;
  2787. case LLM_ARCH_JAIS:
  2788. {
  2789. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2790. // output
  2791. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2792. output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
  2793. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  2794. for (int i = 0; i < n_layer; ++i) {
  2795. auto & layer = layers[i];
  2796. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2797. layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
  2798. layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
  2799. layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
  2800. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  2801. layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
  2802. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  2803. layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
  2804. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
  2805. layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
  2806. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  2807. layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, 0);
  2808. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  2809. layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
  2810. }
  2811. } break;
  2812. case LLM_ARCH_CHATGLM:
  2813. {
  2814. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2815. // output
  2816. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2817. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  2818. for (int i = 0; i < n_layer; ++i) {
  2819. auto & layer = layers[i];
  2820. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2821. layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
  2822. layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
  2823. if (layer.wqkv == nullptr) {
  2824. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
  2825. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  2826. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
  2827. layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2828. layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
  2829. layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
  2830. }
  2831. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  2832. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  2833. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff * 2}, 0);
  2834. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
  2835. }
  2836. } break;
  2837. case LLM_ARCH_NEMOTRON:
  2838. {
  2839. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2840. // output
  2841. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2842. output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
  2843. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  2844. for (int i = 0; i < n_layer; ++i) {
  2845. auto & layer = layers[i];
  2846. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2847. layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
  2848. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  2849. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  2850. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  2851. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  2852. // optional bias tensors
  2853. layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2854. layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
  2855. layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
  2856. layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2857. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  2858. layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
  2859. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  2860. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  2861. // optional MLP bias
  2862. layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2863. layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
  2864. }
  2865. } break;
  2866. case LLM_ARCH_EXAONE:
  2867. {
  2868. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2869. // output
  2870. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2871. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  2872. // if output is NULL, init from the input tok embed
  2873. if (output == NULL) {
  2874. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  2875. }
  2876. for (int i = 0; i < n_layer; ++i) {
  2877. auto & layer = layers[i];
  2878. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2879. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
  2880. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  2881. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
  2882. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
  2883. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  2884. layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
  2885. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  2886. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  2887. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  2888. }
  2889. } break;
  2890. case LLM_ARCH_RWKV6:
  2891. {
  2892. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2893. // Block 0, LN0
  2894. tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0);
  2895. tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}, 0);
  2896. // output
  2897. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2898. output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
  2899. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  2900. const int time_mix_extra_dim = hparams.time_mix_extra_dim;
  2901. const int time_decay_extra_dim = hparams.time_decay_extra_dim;
  2902. const int head_size = hparams.wkv_head_size;
  2903. const int attn_hidden_size = n_embd;
  2904. const int ffn_size = hparams.n_ff_arr[0];
  2905. for (int i = 0; i < n_layer; ++i) {
  2906. auto & layer = layers[i];
  2907. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2908. layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
  2909. layer.attn_norm_2 = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, 0);
  2910. layer.attn_norm_2_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, 0);
  2911. layer.time_mix_w1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W1, "weight", i), {n_embd, time_mix_extra_dim * 5}, 0);
  2912. layer.time_mix_w2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W2, "weight", i), {time_mix_extra_dim, n_embd, 5}, 0);
  2913. layer.time_mix_lerp_x = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_X, "weight", i), {n_embd, 1, 1}, 0);
  2914. layer.time_mix_lerp_w = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_W, "weight", i), {n_embd, 1, 1}, TENSOR_NOT_REQUIRED);
  2915. layer.time_mix_lerp_k = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_K, "weight", i), {n_embd, 1, 1}, TENSOR_NOT_REQUIRED);
  2916. layer.time_mix_lerp_v = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_V, "weight", i), {n_embd, 1, 1}, TENSOR_NOT_REQUIRED);
  2917. layer.time_mix_lerp_r = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_R, "weight", i), {n_embd, 1, 1}, TENSOR_NOT_REQUIRED);
  2918. layer.time_mix_lerp_g = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_G, "weight", i), {n_embd, 1, 1}, TENSOR_NOT_REQUIRED);
  2919. layer.time_mix_lerp_fused = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_FUSED, "weight", i), {n_embd, 1, 1, 5}, TENSOR_NOT_REQUIRED);
  2920. GGML_ASSERT(!(layer.time_mix_lerp_fused == NULL && layer.time_mix_lerp_w == NULL));
  2921. layer.time_mix_first = create_tensor(tn(LLM_TENSOR_TIME_MIX_FIRST, "weight", i), {head_size, n_embd / head_size}, 0);
  2922. layer.time_mix_decay = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY, "weight", i), {n_embd}, 0);
  2923. layer.time_mix_decay_w1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY_W1, "weight", i), {n_embd, time_decay_extra_dim}, 0);
  2924. layer.time_mix_decay_w2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY_W2, "weight", i), {time_decay_extra_dim, attn_hidden_size}, 0);
  2925. layer.time_mix_key = create_tensor(tn(LLM_TENSOR_TIME_MIX_KEY, "weight", i), {attn_hidden_size, n_embd}, 0);
  2926. layer.time_mix_value = create_tensor(tn(LLM_TENSOR_TIME_MIX_VALUE, "weight", i), {attn_hidden_size, n_embd}, 0);
  2927. layer.time_mix_receptance = create_tensor(tn(LLM_TENSOR_TIME_MIX_RECEPTANCE, "weight", i), {attn_hidden_size, n_embd}, 0);
  2928. layer.time_mix_gate = create_tensor(tn(LLM_TENSOR_TIME_MIX_GATE, "weight", i), {attn_hidden_size, n_embd}, 0);
  2929. layer.time_mix_ln = create_tensor(tn(LLM_TENSOR_TIME_MIX_LN, "weight", i), {n_embd}, 0);
  2930. layer.time_mix_ln_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_LN, "bias", i), {n_embd}, 0);
  2931. layer.time_mix_output = create_tensor(tn(LLM_TENSOR_TIME_MIX_OUTPUT, "weight", i), {n_embd, attn_hidden_size}, 0);
  2932. layer.channel_mix_lerp_k = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_LERP_K, "weight", i), {n_embd, 1, 1}, 0);
  2933. layer.channel_mix_lerp_r = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_LERP_R, "weight", i), {n_embd, 1, 1}, 0);
  2934. layer.channel_mix_key = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_KEY, "weight", i), {n_embd, ffn_size}, 0);
  2935. layer.channel_mix_value = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_VALUE, "weight", i), {ffn_size, n_embd}, 0);
  2936. layer.channel_mix_receptance = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_RECEPTANCE, "weight", i), {n_embd, n_embd}, 0);
  2937. }
  2938. } break;
  2939. case LLM_ARCH_RWKV6QWEN2:
  2940. {
  2941. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2942. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2943. output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, TENSOR_NOT_REQUIRED);
  2944. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  2945. const int time_mix_extra_dim = hparams.time_mix_extra_dim;
  2946. const int time_decay_extra_dim = hparams.time_decay_extra_dim;
  2947. const int head_size = hparams.wkv_head_size;
  2948. const int attn_hidden_size = n_embd;
  2949. const int n_head_kv = hparams.n_head_kv();
  2950. int attn_key_value_size;
  2951. if (n_head_kv == 0 || attn_hidden_size / head_size == n_head_kv) {
  2952. attn_key_value_size = attn_hidden_size;
  2953. } else {
  2954. attn_key_value_size = n_head_kv * head_size;
  2955. }
  2956. for (int i = 0; i < n_layer; ++i) {
  2957. auto & layer = layers[i];
  2958. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2959. layer.time_mix_w1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W1, "weight", i), {n_embd, time_mix_extra_dim * 5}, 0);
  2960. layer.time_mix_w2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W2, "weight", i), {time_mix_extra_dim, n_embd, 5}, 0);
  2961. layer.time_mix_lerp_x = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_X, "weight", i), {n_embd, 1, 1}, 0);
  2962. layer.time_mix_lerp_fused = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_FUSED, "weight", i), {n_embd, 1, 1, 5}, 0);
  2963. layer.time_mix_first = create_tensor(tn(LLM_TENSOR_TIME_MIX_FIRST, "weight", i), {head_size, n_embd / head_size}, TENSOR_NOT_REQUIRED);
  2964. layer.time_mix_decay = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY, "weight", i), {n_embd}, 0);
  2965. layer.time_mix_decay_w1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY_W1, "weight", i), {n_embd, time_decay_extra_dim}, 0);
  2966. layer.time_mix_decay_w2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY_W2, "weight", i), {time_decay_extra_dim, attn_hidden_size}, 0);
  2967. layer.time_mix_key = create_tensor(tn(LLM_TENSOR_TIME_MIX_KEY, "weight", i), {n_embd, attn_key_value_size}, 0);
  2968. layer.time_mix_value = create_tensor(tn(LLM_TENSOR_TIME_MIX_VALUE, "weight", i), {n_embd, attn_key_value_size}, 0);
  2969. layer.time_mix_receptance = create_tensor(tn(LLM_TENSOR_TIME_MIX_RECEPTANCE, "weight", i), {attn_hidden_size, n_embd}, 0);
  2970. layer.time_mix_gate = create_tensor(tn(LLM_TENSOR_TIME_MIX_GATE, "weight", i), {attn_hidden_size, n_embd}, 0);
  2971. // optional bias tensors
  2972. layer.time_mix_key_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_KEY, "bias", i), {attn_key_value_size}, TENSOR_NOT_REQUIRED);
  2973. layer.time_mix_value_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_VALUE, "bias", i), {attn_key_value_size}, TENSOR_NOT_REQUIRED);
  2974. layer.time_mix_receptance_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_RECEPTANCE, "bias", i), {attn_hidden_size}, TENSOR_NOT_REQUIRED);
  2975. layer.time_mix_output = create_tensor(tn(LLM_TENSOR_TIME_MIX_OUTPUT, "weight", i), {n_embd, attn_hidden_size}, 0);
  2976. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  2977. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  2978. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  2979. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  2980. }
  2981. } break;
  2982. case LLM_ARCH_RWKV7:
  2983. {
  2984. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2985. // Block 0, LN0
  2986. tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0);
  2987. tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}, 0);
  2988. // output
  2989. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2990. output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
  2991. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  2992. const int n_lora_decay = hparams.n_lora_decay;
  2993. const int n_lora_iclr = hparams.n_lora_iclr;
  2994. const int n_lora_value_res_mix = hparams.n_lora_value_res_mix;
  2995. const int n_lora_gate = hparams.n_lora_gate;
  2996. const int attn_hidden_size = n_embd;
  2997. const int ffn_size = hparams.n_ff_arr[0];
  2998. for (int i = 0; i < n_layer; ++i) {
  2999. auto & layer = layers[i];
  3000. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  3001. layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
  3002. layer.attn_norm_2 = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, 0);
  3003. layer.attn_norm_2_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, 0);
  3004. layer.time_mix_w0 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W0, "weight", i), {n_embd}, 0);
  3005. layer.time_mix_w1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W1, "weight", i), {n_embd, n_lora_decay}, 0);
  3006. layer.time_mix_w2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W2, "weight", i), {n_lora_decay, n_embd}, 0);
  3007. layer.time_mix_a0 = create_tensor(tn(LLM_TENSOR_TIME_MIX_A0, "weight", i), {n_embd}, 0);
  3008. layer.time_mix_a1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_A1, "weight", i), {n_embd, n_lora_iclr}, 0);
  3009. layer.time_mix_a2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_A2, "weight", i), {n_lora_iclr, n_embd}, 0);
  3010. if (i == 0) {
  3011. // actually not used
  3012. layer.time_mix_v0 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V0, "weight", i), {n_embd}, 0);
  3013. layer.time_mix_v1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V1, "weight", i), {n_embd, n_lora_iclr}, 0);
  3014. layer.time_mix_v2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V2, "weight", i), {n_lora_iclr, n_embd}, 0);
  3015. } else {
  3016. layer.time_mix_v0 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V0, "weight", i), {n_embd}, 0);
  3017. layer.time_mix_v1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V1, "weight", i), {n_embd, n_lora_value_res_mix}, 0);
  3018. layer.time_mix_v2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V2, "weight", i), {n_lora_value_res_mix, n_embd}, 0);
  3019. }
  3020. layer.time_mix_g1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_G1, "weight", i), {n_embd, n_lora_gate}, 0);
  3021. layer.time_mix_g2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_G2, "weight", i), {n_lora_gate, n_embd}, 0);
  3022. layer.time_mix_lerp_fused = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_FUSED, "weight", i), {n_embd, 1, 1, 6}, 0);
  3023. layer.time_mix_k_k = create_tensor(tn(LLM_TENSOR_TIME_MIX_K_K, "weight", i), {attn_hidden_size}, 0);
  3024. layer.time_mix_k_a = create_tensor(tn(LLM_TENSOR_TIME_MIX_K_A, "weight", i), {attn_hidden_size}, 0);
  3025. layer.time_mix_r_k = create_tensor(tn(LLM_TENSOR_TIME_MIX_R_K, "weight", i), {attn_hidden_size}, 0);
  3026. layer.time_mix_key = create_tensor(tn(LLM_TENSOR_TIME_MIX_KEY, "weight", i), {attn_hidden_size, n_embd}, 0);
  3027. layer.time_mix_value = create_tensor(tn(LLM_TENSOR_TIME_MIX_VALUE, "weight", i), {attn_hidden_size, n_embd}, 0);
  3028. layer.time_mix_receptance = create_tensor(tn(LLM_TENSOR_TIME_MIX_RECEPTANCE, "weight", i), {attn_hidden_size, n_embd}, 0);
  3029. layer.time_mix_ln = create_tensor(tn(LLM_TENSOR_TIME_MIX_LN, "weight", i), {n_embd}, 0);
  3030. layer.time_mix_ln_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_LN, "bias", i), {n_embd}, 0);
  3031. layer.time_mix_output = create_tensor(tn(LLM_TENSOR_TIME_MIX_OUTPUT, "weight", i), {n_embd, attn_hidden_size}, 0);
  3032. layer.channel_mix_lerp_k = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_LERP_K, "weight", i), {n_embd, 1, 1}, 0);
  3033. layer.channel_mix_key = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_KEY, "weight", i), {n_embd, ffn_size}, 0);
  3034. layer.channel_mix_value = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_VALUE, "weight", i), {ffn_size, n_embd}, 0);
  3035. }
  3036. } break;
  3037. case LLM_ARCH_ARWKV7:
  3038. {
  3039. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  3040. // output
  3041. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  3042. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  3043. const int n_lora_decay = hparams.n_lora_decay;
  3044. const int n_lora_iclr = hparams.n_lora_iclr;
  3045. const int n_lora_value_res_mix = hparams.n_lora_value_res_mix;
  3046. const int n_lora_gate = hparams.n_lora_gate;
  3047. const int attn_hidden_size = n_embd;
  3048. for (int i = 0; i < n_layer; ++i) {
  3049. auto & layer = layers[i];
  3050. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  3051. layer.time_mix_w0 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W0, "weight", i), {n_embd}, 0);
  3052. layer.time_mix_w1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W1, "weight", i), {n_embd, n_lora_decay}, 0);
  3053. layer.time_mix_w2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W2, "weight", i), {n_lora_decay, n_embd}, 0);
  3054. layer.time_mix_a0 = create_tensor(tn(LLM_TENSOR_TIME_MIX_A0, "weight", i), {n_embd}, 0);
  3055. layer.time_mix_a1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_A1, "weight", i), {n_embd, n_lora_iclr}, 0);
  3056. layer.time_mix_a2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_A2, "weight", i), {n_lora_iclr, n_embd}, 0);
  3057. if (i == 0) {
  3058. // actually not used
  3059. layer.time_mix_v0 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V0, "weight", i), {n_embd}, 0);
  3060. layer.time_mix_v1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V1, "weight", i), {n_embd, n_lora_iclr}, 0);
  3061. layer.time_mix_v2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V2, "weight", i), {n_lora_iclr, n_embd}, 0);
  3062. } else {
  3063. layer.time_mix_v0 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V0, "weight", i), {n_embd}, 0);
  3064. layer.time_mix_v1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V1, "weight", i), {n_embd, n_lora_value_res_mix}, 0);
  3065. layer.time_mix_v2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V2, "weight", i), {n_lora_value_res_mix, n_embd}, 0);
  3066. }
  3067. layer.time_mix_g1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_G1, "weight", i), {n_embd, n_lora_gate}, TENSOR_NOT_REQUIRED);
  3068. layer.time_mix_g2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_G2, "weight", i), {n_lora_gate, n_embd}, TENSOR_NOT_REQUIRED);
  3069. try {
  3070. layer.time_mix_lerp_fused = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_FUSED, "weight", i), {n_embd, 1, 1, 6}, 0);
  3071. } catch(std::runtime_error & e) {
  3072. // ARWKV models may not have gate tensors
  3073. layer.time_mix_lerp_fused = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_FUSED, "weight", i), {n_embd, 1, 1, 5}, 0);
  3074. }
  3075. layer.time_mix_k_k = create_tensor(tn(LLM_TENSOR_TIME_MIX_K_K, "weight", i), {attn_hidden_size}, 0);
  3076. layer.time_mix_k_a = create_tensor(tn(LLM_TENSOR_TIME_MIX_K_A, "weight", i), {attn_hidden_size}, 0);
  3077. layer.time_mix_r_k = create_tensor(tn(LLM_TENSOR_TIME_MIX_R_K, "weight", i), {attn_hidden_size}, 0);
  3078. layer.time_mix_key = create_tensor(tn(LLM_TENSOR_TIME_MIX_KEY, "weight", i), {attn_hidden_size, n_embd}, 0);
  3079. layer.time_mix_value = create_tensor(tn(LLM_TENSOR_TIME_MIX_VALUE, "weight", i), {attn_hidden_size, n_embd}, 0);
  3080. layer.time_mix_receptance = create_tensor(tn(LLM_TENSOR_TIME_MIX_RECEPTANCE, "weight", i), {attn_hidden_size, n_embd}, 0);
  3081. layer.time_mix_ln = create_tensor(tn(LLM_TENSOR_TIME_MIX_LN, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
  3082. layer.time_mix_ln_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_LN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  3083. layer.time_mix_output = create_tensor(tn(LLM_TENSOR_TIME_MIX_OUTPUT, "weight", i), {n_embd, attn_hidden_size}, 0);
  3084. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  3085. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  3086. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  3087. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  3088. }
  3089. } break;
  3090. case LLM_ARCH_CHAMELEON:
  3091. {
  3092. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  3093. // output
  3094. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  3095. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  3096. // if output is NULL, init from the input tok embed
  3097. if (output == NULL) {
  3098. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  3099. }
  3100. for (int i = 0; i < n_layer; ++i) {
  3101. auto & layer = layers[i];
  3102. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  3103. layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k, n_head}, 0);
  3104. layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k, n_head_kv}, 0);
  3105. layer.attn_q_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {n_embd_head_k, n_head}, TENSOR_NOT_REQUIRED);
  3106. layer.attn_k_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {n_embd_head_k, n_head_kv}, TENSOR_NOT_REQUIRED);
  3107. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  3108. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  3109. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  3110. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  3111. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  3112. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  3113. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  3114. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  3115. }
  3116. } break;
  3117. case LLM_ARCH_WAVTOKENIZER_DEC:
  3118. {
  3119. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {hparams.n_embd_features, n_vocab}, 0);
  3120. conv1d = create_tensor(tn(LLM_TENSOR_CONV1D, "weight"), {7, hparams.n_embd_features, hparams.posnet.n_embd}, 0);
  3121. conv1d_b = create_tensor(tn(LLM_TENSOR_CONV1D, "bias"), {1, hparams.posnet.n_embd}, 0);
  3122. // posnet
  3123. {
  3124. const int64_t n_embd = hparams.posnet.n_embd;
  3125. for (uint32_t i = 0; i < hparams.posnet.n_layer; ++i) {
  3126. auto & layer = layers[i].posnet;
  3127. // posnet:
  3128. //
  3129. // - resnet
  3130. // - resnet
  3131. // - attn
  3132. // - resnet
  3133. // - resnet
  3134. // - norm
  3135. //
  3136. switch (i) {
  3137. case 0:
  3138. case 1:
  3139. case 3:
  3140. case 4:
  3141. {
  3142. layer.norm1 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "weight", i), {1, n_embd}, 0);
  3143. layer.norm1_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "bias", i), {1, n_embd}, 0);
  3144. layer.conv1 = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "weight", i), {3, n_embd, n_embd}, 0);
  3145. layer.conv1_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "bias", i), {1, n_embd}, 0);
  3146. layer.norm2 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "weight", i), {1, n_embd}, 0);
  3147. layer.norm2_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "bias", i), {1, n_embd}, 0);
  3148. layer.conv2 = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "weight", i), {3, n_embd, n_embd}, 0);
  3149. layer.conv2_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "bias", i), {1, n_embd}, 0);
  3150. } break;
  3151. case 2:
  3152. {
  3153. layer.attn_norm = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "weight", i), {1, n_embd}, 0);
  3154. layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "bias", i), {1, n_embd}, 0);
  3155. layer.attn_q = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_Q, "weight", i), {1, n_embd, n_embd}, 0);
  3156. layer.attn_q_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_Q, "bias", i), {1, n_embd}, 0);
  3157. layer.attn_k = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_K, "weight", i), {1, n_embd, n_embd}, 0);
  3158. layer.attn_k_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_K, "bias", i), {1, n_embd}, 0);
  3159. layer.attn_v = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_V, "weight", i), {1, n_embd, n_embd}, 0);
  3160. layer.attn_v_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_V, "bias", i), {1, n_embd}, 0);
  3161. layer.attn_o = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_OUT, "weight", i), {1, n_embd, n_embd}, 0);
  3162. layer.attn_o_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_OUT, "bias", i), {1, n_embd}, 0);
  3163. } break;
  3164. case 5:
  3165. {
  3166. layer.norm = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "weight", i), {1, n_embd}, 0);
  3167. layer.norm_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "bias", i), {1, n_embd}, 0);
  3168. } break;
  3169. default: GGML_ABORT("unknown posnet layer");
  3170. };
  3171. }
  3172. }
  3173. GGML_ASSERT(hparams.posnet.n_embd == hparams.convnext.n_embd);
  3174. tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {hparams.posnet.n_embd}, 0);
  3175. tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {hparams.posnet.n_embd}, 0);
  3176. // convnext
  3177. {
  3178. const int64_t n_embd = hparams.convnext.n_embd;
  3179. for (uint32_t i = 0; i < hparams.convnext.n_layer; ++i) {
  3180. auto & layer = layers[i].convnext;
  3181. layer.dw = create_tensor(tn(LLM_TENSOR_CONVNEXT_DW, "weight", i), {7, 1, n_embd}, 0);
  3182. layer.dw_b = create_tensor(tn(LLM_TENSOR_CONVNEXT_DW, "bias", i), {1, n_embd}, 0);
  3183. layer.norm = create_tensor(tn(LLM_TENSOR_CONVNEXT_NORM, "weight", i), {n_embd}, 0);
  3184. layer.norm_b = create_tensor(tn(LLM_TENSOR_CONVNEXT_NORM, "bias", i), {n_embd}, 0);
  3185. layer.pw1 = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW1, "weight", i), {n_embd, n_ff}, 0);
  3186. layer.pw1_b = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW1, "bias", i), {n_ff}, 0);
  3187. layer.pw2 = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW2, "weight", i), {n_ff, n_embd}, 0);
  3188. layer.pw2_b = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW2, "bias", i), {n_embd}, 0);
  3189. layer.gamma = create_tensor(tn(LLM_TENSOR_CONVNEXT_GAMMA, "weight", i), {n_embd}, 0);
  3190. }
  3191. // output
  3192. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  3193. output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
  3194. }
  3195. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {hparams.convnext.n_embd, n_embd}, 0);
  3196. output_b = create_tensor(tn(LLM_TENSOR_OUTPUT, "bias"), {n_embd}, 0);
  3197. } break;
  3198. case LLM_ARCH_BAILINGMOE:
  3199. {
  3200. const int64_t n_ff_exp = hparams.n_ff_exp;
  3201. const int64_t n_expert_shared = hparams.n_expert_shared;
  3202. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  3203. // output
  3204. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  3205. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  3206. for (int i = 0; i < n_layer; ++i) {
  3207. auto & layer = layers[i];
  3208. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  3209. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_head * n_rot}, 0);
  3210. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_head_kv * n_rot}, 0);
  3211. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_head_kv * n_rot}, 0);
  3212. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_head * n_rot, n_embd}, 0);
  3213. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  3214. layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
  3215. if (n_expert == 0) {
  3216. throw std::runtime_error("n_expert must be > 0");
  3217. }
  3218. if (n_expert_used == 0) {
  3219. throw std::runtime_error("n_expert_used must be > 0");
  3220. }
  3221. layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
  3222. layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
  3223. layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
  3224. layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
  3225. layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_exp * n_expert_shared, n_embd}, 0);
  3226. layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
  3227. }
  3228. } break;
  3229. default:
  3230. throw std::runtime_error("unknown architecture");
  3231. }
  3232. if (n_moved_tensors > 0) {
  3233. LLAMA_LOG_DEBUG("%s: tensor '%s' (%s) (and %d others) cannot be used with preferred buffer type %s, using %s instead\n",
  3234. __func__, first_moved_tensor->name, ggml_type_name(first_moved_tensor->type), n_moved_tensors - 1,
  3235. ggml_backend_buft_name(first_moved_from_buft), ggml_backend_buft_name(first_moved_to_buft));
  3236. }
  3237. }
  3238. ml.done_getting_tensors();
  3239. ml.init_mappings(true, use_mlock ? &pimpl->mlock_mmaps : nullptr);
  3240. pimpl->mappings.reserve(ml.mappings.size());
  3241. // create the backend buffers
  3242. std::vector<std::pair<ggml_context *, llama_buf_map>> ctx_bufs;
  3243. ctx_bufs.reserve(ctx_map.size());
  3244. // Ensure we have enough capacity for the maximum backend buffer we will potentially create
  3245. const size_t n_max_backend_buffer = ctx_map.size() * ml.files.size();
  3246. pimpl->bufs.reserve(n_max_backend_buffer);
  3247. for (auto & it : ctx_map) {
  3248. ggml_backend_buffer_type_t buft = it.first;
  3249. ggml_context * ctx = it.second;
  3250. // skip contexts without tensors
  3251. if (ggml_get_first_tensor(ctx) == nullptr) {
  3252. continue;
  3253. }
  3254. llama_buf_map buf_map;
  3255. buf_map.reserve(n_max_backend_buffer);
  3256. // check if it is possible to use buffer_from_host_ptr with this buffer type
  3257. ggml_backend_dev_t dev = ggml_backend_buft_get_device(buft);
  3258. if (!dev) {
  3259. // FIXME: workaround for CPU backend buft having a NULL device
  3260. dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
  3261. }
  3262. ggml_backend_dev_props props;
  3263. ggml_backend_dev_get_props(dev, &props);
  3264. bool buffer_from_host_ptr_supported = props.caps.buffer_from_host_ptr;
  3265. bool is_default_buft = buft == ggml_backend_dev_buffer_type(dev);
  3266. if (ml.use_mmap && use_mmap_buffer && buffer_from_host_ptr_supported && is_default_buft) {
  3267. for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
  3268. // only the mmap region containing the tensors in the model is mapped to the backend buffer
  3269. // this is important for metal with apple silicon: if the entire model could be mapped to a metal buffer, then we could just use metal for all layers
  3270. // this allows using partial offloading when the model size exceeds the metal buffer size, but not the RAM size
  3271. void * addr = nullptr;
  3272. size_t first, last; // NOLINT
  3273. ml.get_mapping_range(&first, &last, &addr, idx, ctx);
  3274. if (first >= last) {
  3275. continue;
  3276. }
  3277. const size_t max_size = ggml_get_max_tensor_size(ctx);
  3278. ggml_backend_buffer_t buf = ggml_backend_dev_buffer_from_host_ptr(dev, (char *) addr + first, last - first, max_size);
  3279. if (buf == nullptr) {
  3280. throw std::runtime_error(format("unable to allocate %s buffer", ggml_backend_buft_name(buft)));
  3281. }
  3282. pimpl->bufs.emplace_back(buf);
  3283. buf_map.emplace(idx, buf);
  3284. }
  3285. }
  3286. else {
  3287. ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
  3288. if (buf == nullptr) {
  3289. throw std::runtime_error(format("unable to allocate %s buffer", ggml_backend_buft_name(buft)));
  3290. }
  3291. pimpl->bufs.emplace_back(buf);
  3292. if (use_mlock && ggml_backend_buffer_is_host(buf)) {
  3293. pimpl->mlock_bufs.emplace_back(new llama_mlock);
  3294. auto & mlock_buf = pimpl->mlock_bufs.back();
  3295. mlock_buf->init (ggml_backend_buffer_get_base(buf));
  3296. mlock_buf->grow_to(ggml_backend_buffer_get_size(buf));
  3297. }
  3298. for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
  3299. buf_map.emplace(idx, buf);
  3300. }
  3301. }
  3302. if (pimpl->bufs.empty()) {
  3303. throw std::runtime_error("failed to allocate buffer");
  3304. }
  3305. for (auto & buf : buf_map) {
  3306. // indicate that this buffer contains weights
  3307. // this is used by ggml_backend_sched to improve op scheduling: ops that use a weight are preferably scheduled to the backend that contains the weight
  3308. ggml_backend_buffer_set_usage(buf.second, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
  3309. }
  3310. ctx_bufs.emplace_back(ctx, buf_map);
  3311. }
  3312. if (llama_supports_gpu_offload()) {
  3313. const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
  3314. LLAMA_LOG_INFO("%s: offloading %d repeating layers to GPU\n", __func__, n_gpu);
  3315. if (n_gpu_layers > (int) hparams.n_layer) {
  3316. LLAMA_LOG_INFO("%s: offloading output layer to GPU\n", __func__);
  3317. }
  3318. const int max_backend_supported_layers = hparams.n_layer + 1;
  3319. const int max_offloadable_layers = hparams.n_layer + 1;
  3320. LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n", __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers);
  3321. }
  3322. // print memory requirements per buffer type
  3323. for (auto & buf : pimpl->bufs) {
  3324. LLAMA_LOG_INFO("%s: %12s model buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf.get()), ggml_backend_buffer_get_size(buf.get()) / 1024.0 / 1024.0);
  3325. }
  3326. // populate tensors_by_name
  3327. for (auto & ctx : pimpl->ctxs) {
  3328. for (auto * cur = ggml_get_first_tensor(ctx.get()); cur != NULL; cur = ggml_get_next_tensor(ctx.get(), cur)) {
  3329. tensors_by_name.emplace_back(ggml_get_name(cur), cur);
  3330. }
  3331. }
  3332. // load tensor data
  3333. for (auto & it : ctx_bufs) {
  3334. ggml_context * ctx = it.first;
  3335. auto & bufs = it.second;
  3336. if (!ml.load_all_data(ctx, bufs, use_mlock ? &pimpl->mlock_mmaps : NULL, params.progress_callback, params.progress_callback_user_data)) {
  3337. return false;
  3338. }
  3339. }
  3340. if (use_mmap_buffer) {
  3341. for (auto & mapping : ml.mappings) {
  3342. pimpl->mappings.emplace_back(std::move(mapping));
  3343. }
  3344. }
  3345. return true;
  3346. }
  3347. std::string llama_model::arch_name() const {
  3348. return llm_arch_name(arch);
  3349. }
  3350. std::string llama_model::type_name() const {
  3351. return llm_type_name(type);
  3352. }
  3353. std::string llama_model::desc() const {
  3354. return pimpl->desc_str;
  3355. }
  3356. size_t llama_model::size() const {
  3357. return pimpl->n_bytes;
  3358. }
  3359. size_t llama_model::n_tensors() const {
  3360. return tensors_by_name.size();
  3361. }
  3362. size_t llama_model::n_devices() const {
  3363. return devices.size();
  3364. }
  3365. uint64_t llama_model::n_elements() const {
  3366. return pimpl->n_elements;
  3367. }
  3368. void llama_model::print_info() const {
  3369. const char * rope_scaling_type = LLAMA_ROPE_SCALING_TYPES.at(hparams.rope_scaling_type_train);
  3370. auto print_f = [](const std::function<uint32_t(uint32_t)> & f, uint32_t n) {
  3371. bool is_var = false;
  3372. std::vector<uint32_t> v;
  3373. for (uint32_t i = 0; i < n; ++i) {
  3374. v.push_back(f(i));
  3375. if (v[i] != v[0]) {
  3376. is_var = true;
  3377. }
  3378. }
  3379. std::stringstream ss;
  3380. if (is_var) {
  3381. ss << "[";
  3382. for (uint32_t i = 0; i < n; ++i) {
  3383. ss << v[i];
  3384. if (i < n - 1) {
  3385. ss << ", ";
  3386. }
  3387. }
  3388. ss << "]";
  3389. } else {
  3390. ss << v[0];
  3391. }
  3392. return ss.str();
  3393. };
  3394. // hparams
  3395. LLAMA_LOG_INFO("%s: arch = %s\n", __func__, arch_name().c_str());
  3396. LLAMA_LOG_INFO("%s: vocab_only = %d\n", __func__, hparams.vocab_only);
  3397. if (!hparams.vocab_only) {
  3398. LLAMA_LOG_INFO("%s: n_ctx_train = %u\n", __func__, hparams.n_ctx_train);
  3399. LLAMA_LOG_INFO("%s: n_embd = %u\n", __func__, hparams.n_embd);
  3400. LLAMA_LOG_INFO("%s: n_layer = %u\n", __func__, hparams.n_layer);
  3401. LLAMA_LOG_INFO("%s: n_head = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_head(il); }, hparams.n_layer).c_str());
  3402. LLAMA_LOG_INFO("%s: n_head_kv = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_head_kv(il); }, hparams.n_layer).c_str());
  3403. LLAMA_LOG_INFO("%s: n_rot = %u\n", __func__, hparams.n_rot);
  3404. LLAMA_LOG_INFO("%s: n_swa = %u\n", __func__, hparams.n_swa);
  3405. LLAMA_LOG_INFO("%s: n_swa_pattern = %u\n", __func__, hparams.n_swa_pattern);
  3406. LLAMA_LOG_INFO("%s: n_embd_head_k = %u\n", __func__, hparams.n_embd_head_k);
  3407. LLAMA_LOG_INFO("%s: n_embd_head_v = %u\n", __func__, hparams.n_embd_head_v);
  3408. LLAMA_LOG_INFO("%s: n_gqa = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_gqa(il); }, hparams.n_layer).c_str());
  3409. LLAMA_LOG_INFO("%s: n_embd_k_gqa = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_embd_k_gqa(il); }, hparams.n_layer).c_str());
  3410. LLAMA_LOG_INFO("%s: n_embd_v_gqa = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_embd_v_gqa(il); }, hparams.n_layer).c_str());
  3411. LLAMA_LOG_INFO("%s: f_norm_eps = %.1e\n", __func__, hparams.f_norm_eps);
  3412. LLAMA_LOG_INFO("%s: f_norm_rms_eps = %.1e\n", __func__, hparams.f_norm_rms_eps);
  3413. LLAMA_LOG_INFO("%s: f_clamp_kqv = %.1e\n", __func__, hparams.f_clamp_kqv);
  3414. LLAMA_LOG_INFO("%s: f_max_alibi_bias = %.1e\n", __func__, hparams.f_max_alibi_bias);
  3415. LLAMA_LOG_INFO("%s: f_logit_scale = %.1e\n", __func__, hparams.f_logit_scale);
  3416. LLAMA_LOG_INFO("%s: f_attn_scale = %.1e\n", __func__, hparams.f_attention_scale);
  3417. LLAMA_LOG_INFO("%s: n_ff = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_ff(il); }, hparams.n_layer).c_str());
  3418. LLAMA_LOG_INFO("%s: n_expert = %u\n", __func__, hparams.n_expert);
  3419. LLAMA_LOG_INFO("%s: n_expert_used = %u\n", __func__, hparams.n_expert_used);
  3420. LLAMA_LOG_INFO("%s: causal attn = %d\n", __func__, hparams.causal_attn);
  3421. LLAMA_LOG_INFO("%s: pooling type = %d\n", __func__, hparams.pooling_type);
  3422. LLAMA_LOG_INFO("%s: rope type = %d\n", __func__, hparams.rope_type);
  3423. LLAMA_LOG_INFO("%s: rope scaling = %s\n", __func__, rope_scaling_type);
  3424. LLAMA_LOG_INFO("%s: freq_base_train = %.1f\n", __func__, hparams.rope_freq_base_train);
  3425. LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train);
  3426. LLAMA_LOG_INFO("%s: n_ctx_orig_yarn = %u\n", __func__, hparams.n_ctx_orig_yarn);
  3427. LLAMA_LOG_INFO("%s: rope_finetuned = %s\n", __func__, hparams.rope_finetuned ? "yes" : "unknown");
  3428. LLAMA_LOG_INFO("%s: ssm_d_conv = %u\n", __func__, hparams.ssm_d_conv);
  3429. LLAMA_LOG_INFO("%s: ssm_d_inner = %u\n", __func__, hparams.ssm_d_inner);
  3430. LLAMA_LOG_INFO("%s: ssm_d_state = %u\n", __func__, hparams.ssm_d_state);
  3431. LLAMA_LOG_INFO("%s: ssm_dt_rank = %u\n", __func__, hparams.ssm_dt_rank);
  3432. LLAMA_LOG_INFO("%s: ssm_dt_b_c_rms = %d\n", __func__, hparams.ssm_dt_b_c_rms);
  3433. }
  3434. LLAMA_LOG_INFO("%s: model type = %s\n", __func__, type_name().c_str());
  3435. if (pimpl->n_elements >= 1e12) {
  3436. LLAMA_LOG_INFO("%s: model params = %.2f T\n", __func__, pimpl->n_elements*1e-12);
  3437. } else if (pimpl->n_elements >= 1e9) {
  3438. LLAMA_LOG_INFO("%s: model params = %.2f B\n", __func__, pimpl->n_elements*1e-9);
  3439. } else if (pimpl->n_elements >= 1e6) {
  3440. LLAMA_LOG_INFO("%s: model params = %.2f M\n", __func__, pimpl->n_elements*1e-6);
  3441. } else {
  3442. LLAMA_LOG_INFO("%s: model params = %.2f K\n", __func__, pimpl->n_elements*1e-3);
  3443. }
  3444. // general kv
  3445. LLAMA_LOG_INFO("%s: general.name = %s\n", __func__, name.c_str());
  3446. if (arch == LLM_ARCH_DEEPSEEK) {
  3447. LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
  3448. LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
  3449. LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
  3450. LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
  3451. }
  3452. if (arch == LLM_ARCH_DEEPSEEK2) {
  3453. LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
  3454. LLAMA_LOG_INFO("%s: n_lora_q = %d\n", __func__, hparams.n_lora_q);
  3455. LLAMA_LOG_INFO("%s: n_lora_kv = %d\n", __func__, hparams.n_lora_kv);
  3456. LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
  3457. LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
  3458. LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
  3459. LLAMA_LOG_INFO("%s: expert_weights_norm = %d\n", __func__, hparams.expert_weights_norm);
  3460. LLAMA_LOG_INFO("%s: expert_gating_func = %s\n", __func__, llama_expert_gating_func_name((llama_expert_gating_func_type) hparams.expert_gating_func));
  3461. LLAMA_LOG_INFO("%s: rope_yarn_log_mul = %.4f\n", __func__, hparams.rope_yarn_log_mul);
  3462. }
  3463. if (arch == LLM_ARCH_QWEN2MOE) {
  3464. LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
  3465. LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
  3466. }
  3467. if (arch == LLM_ARCH_MINICPM || arch == LLM_ARCH_GRANITE || arch == LLM_ARCH_GRANITE_MOE) {
  3468. LLAMA_LOG_INFO("%s: f_embedding_scale = %f\n", __func__, hparams.f_embedding_scale);
  3469. LLAMA_LOG_INFO("%s: f_residual_scale = %f\n", __func__, hparams.f_residual_scale);
  3470. LLAMA_LOG_INFO("%s: f_attention_scale = %f\n", __func__, hparams.f_attention_scale);
  3471. }
  3472. if (arch == LLM_ARCH_BAILINGMOE) {
  3473. LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
  3474. LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
  3475. LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
  3476. LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
  3477. LLAMA_LOG_INFO("%s: expert_weights_norm = %d\n", __func__, hparams.expert_weights_norm);
  3478. }
  3479. vocab.print_info();
  3480. }
  3481. ggml_backend_dev_t llama_model::dev_layer(int il) const {
  3482. return pimpl->dev_layer.at(il).dev;
  3483. }
  3484. ggml_backend_dev_t llama_model::dev_output() const {
  3485. return pimpl->dev_output.dev;
  3486. }
  3487. template<typename F>
  3488. static bool buft_supported(ggml_backend_buffer_type_t buft, ggml_backend_dev_t dev, F & fn) {
  3489. ggml_init_params params = {
  3490. /*.mem_size =*/ ggml_tensor_overhead()*8,
  3491. /*.mem_buffer =*/ NULL,
  3492. /*.no_alloc =*/ true,
  3493. };
  3494. ggml_context_ptr ctx { ggml_init(params) };
  3495. if (!ctx) {
  3496. throw std::runtime_error(format("failed to create ggml context"));
  3497. }
  3498. ggml_backend_buffer_ptr buf { ggml_backend_buft_alloc_buffer(buft, 0) };
  3499. ggml_tensor * op_tensor = fn(ctx.get());
  3500. for (int i = 0; i < GGML_MAX_SRC; i++) {
  3501. if (op_tensor->src[i] != nullptr) {
  3502. assert(op_tensor->src[i]->buffer == nullptr);
  3503. op_tensor->src[i]->buffer = buf.get();
  3504. }
  3505. }
  3506. bool op_supported = ggml_backend_dev_supports_op(dev, op_tensor);
  3507. return op_supported;
  3508. }
  3509. template<typename F>
  3510. static ggml_backend_buffer_type_t select_buft(const buft_list_t & buft_list, const F & fn) {
  3511. for (const auto & cur : buft_list) {
  3512. ggml_backend_dev_t cur_dev = cur.first;
  3513. ggml_backend_buffer_type_t cur_buft = cur.second;
  3514. if (buft_supported(cur_buft, cur_dev, fn)) {
  3515. return cur_buft;
  3516. }
  3517. }
  3518. throw std::runtime_error(format("no suitable buffer type found"));
  3519. }
  3520. ggml_backend_buffer_type_t llama_model::select_buft(int il) const {
  3521. return ::select_buft(
  3522. *pimpl->dev_layer.at(il).buft_list,
  3523. [&](ggml_context * ctx) {
  3524. ggml_tensor * cur = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hparams.n_embd);
  3525. ggml_tensor * layer_dir = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hparams.n_embd);
  3526. return ggml_add(ctx, cur, layer_dir);
  3527. });
  3528. }
  3529. bool llama_model::has_tensor_overrides() const {
  3530. return pimpl->has_tensor_overrides;
  3531. }
  3532. const ggml_tensor * llama_model::get_tensor(const char * name) const {
  3533. auto it = std::find_if(tensors_by_name.begin(), tensors_by_name.end(),
  3534. [name](const std::pair<std::string, ggml_tensor *> & it) {
  3535. return it.first == name;
  3536. });
  3537. if (it == tensors_by_name.end()) {
  3538. return nullptr;
  3539. }
  3540. return it->second;
  3541. }
  3542. struct llm_build_llama : public llm_graph_context {
  3543. llm_build_llama(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
  3544. const int64_t n_embd_head = hparams.n_embd_head_v;
  3545. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  3546. GGML_ASSERT(n_embd_head == hparams.n_rot);
  3547. ggml_tensor * cur;
  3548. ggml_tensor * inpL;
  3549. inpL = build_inp_embd(model.tok_embd);
  3550. // inp_pos - contains the positions
  3551. ggml_tensor * inp_pos = build_inp_pos();
  3552. // temperature tuning
  3553. ggml_tensor * inp_attn_scale = nullptr;
  3554. if (arch == LLM_ARCH_LLAMA4) {
  3555. inp_attn_scale = build_inp_attn_scale();
  3556. }
  3557. auto * inp_attn = build_attn_inp_kv_unified();
  3558. const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
  3559. for (int il = 0; il < n_layer; ++il) {
  3560. ggml_tensor * inpSA = inpL;
  3561. bool use_rope = arch == LLM_ARCH_LLAMA4
  3562. ? (il + 1) % hparams.n_no_rope_layer_step != 0
  3563. : true;
  3564. // norm
  3565. cur = build_norm(inpL,
  3566. model.layers[il].attn_norm, NULL,
  3567. LLM_NORM_RMS, il);
  3568. cb(cur, "attn_norm", il);
  3569. // self-attention
  3570. {
  3571. // rope freq factors for llama3; may return nullptr for llama2 and other models
  3572. ggml_tensor * rope_factors = static_cast<const llama_kv_cache_unified *>(memory)->cbs.get_rope_factors(n_ctx_per_seq, il);
  3573. // compute Q and K and RoPE them
  3574. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  3575. cb(Qcur, "Qcur", il);
  3576. if (model.layers[il].bq) {
  3577. Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
  3578. cb(Qcur, "Qcur", il);
  3579. }
  3580. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  3581. cb(Kcur, "Kcur", il);
  3582. if (model.layers[il].bk) {
  3583. Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
  3584. cb(Kcur, "Kcur", il);
  3585. }
  3586. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  3587. cb(Vcur, "Vcur", il);
  3588. if (model.layers[il].bv) {
  3589. Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
  3590. cb(Vcur, "Vcur", il);
  3591. }
  3592. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  3593. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  3594. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  3595. if (use_rope) {
  3596. Qcur = ggml_rope_ext(
  3597. ctx0, Qcur, inp_pos, rope_factors,
  3598. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  3599. ext_factor, attn_factor, beta_fast, beta_slow
  3600. );
  3601. Kcur = ggml_rope_ext(
  3602. ctx0, Kcur, inp_pos, rope_factors,
  3603. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  3604. ext_factor, attn_factor, beta_fast, beta_slow
  3605. );
  3606. } else if (inp_attn_scale) {
  3607. Qcur = ggml_mul(ctx0, Qcur, inp_attn_scale);
  3608. }
  3609. cb(Qcur, "Qcur", il);
  3610. cb(Kcur, "Kcur", il);
  3611. cb(Vcur, "Vcur", il);
  3612. if (arch == LLM_ARCH_LLAMA4 && use_rope && hparams.use_kq_norm) {
  3613. // Llama4TextL2Norm
  3614. Qcur = ggml_rms_norm(ctx0, Qcur, 1e-6);
  3615. Kcur = ggml_rms_norm(ctx0, Kcur, 1e-6);
  3616. cb(Qcur, "Qcur_normed", il);
  3617. cb(Kcur, "Kcur_normed", il);
  3618. }
  3619. cur = build_attn(inp_attn, gf,
  3620. model.layers[il].wo, model.layers[il].bo,
  3621. Qcur, Kcur, Vcur, nullptr, kq_scale, il);
  3622. cb(cur, "attn_out", il);
  3623. }
  3624. if (il == n_layer - 1) {
  3625. // skip computing output for unused tokens
  3626. ggml_tensor * inp_out_ids = build_inp_out_ids();
  3627. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  3628. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  3629. }
  3630. // For Granite architecture
  3631. if (hparams.f_residual_scale) {
  3632. cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
  3633. }
  3634. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  3635. cb(ffn_inp, "ffn_inp", il);
  3636. // feed-forward network (non-MoE)
  3637. if (model.layers[il].ffn_gate_inp == nullptr) {
  3638. cur = build_norm(ffn_inp,
  3639. model.layers[il].ffn_norm, NULL,
  3640. LLM_NORM_RMS, il);
  3641. cb(cur, "ffn_norm", il);
  3642. cur = build_ffn(cur,
  3643. model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
  3644. model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
  3645. model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
  3646. NULL,
  3647. LLM_FFN_SILU, LLM_FFN_PAR, il);
  3648. cb(cur, "ffn_out", il);
  3649. } else if (arch == LLM_ARCH_LLAMA4) {
  3650. // llama4 MoE
  3651. ggml_tensor * ffn_inp_normed = build_norm(ffn_inp,
  3652. model.layers[il].ffn_norm, NULL,
  3653. LLM_NORM_RMS, il);
  3654. cb(cur, "ffn_norm", il);
  3655. ggml_tensor * moe_out = build_moe_ffn(ffn_inp_normed,
  3656. model.layers[il].ffn_gate_inp,
  3657. model.layers[il].ffn_up_exps,
  3658. model.layers[il].ffn_gate_exps,
  3659. model.layers[il].ffn_down_exps,
  3660. nullptr,
  3661. n_expert, n_expert_used,
  3662. LLM_FFN_SILU, false,
  3663. false, 0.0,
  3664. LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID,
  3665. il);
  3666. // Shared experts
  3667. ggml_tensor * shexp_out = build_ffn(ffn_inp_normed,
  3668. model.layers[il].ffn_up_shexp, NULL, NULL,
  3669. model.layers[il].ffn_gate_shexp, NULL, NULL,
  3670. model.layers[il].ffn_down_shexp, NULL, NULL,
  3671. NULL,
  3672. LLM_FFN_SILU, LLM_FFN_PAR, il);
  3673. cb(shexp_out, "ffn_moe_shexp", il);
  3674. cur = ggml_add(ctx0, moe_out, shexp_out);
  3675. cb(cur, "ffn_moe_out_merged", il);
  3676. } else {
  3677. // MoE branch
  3678. cur = build_norm(ffn_inp,
  3679. model.layers[il].ffn_norm, NULL,
  3680. LLM_NORM_RMS, il);
  3681. cb(cur, "ffn_norm", il);
  3682. cur = build_moe_ffn(cur,
  3683. model.layers[il].ffn_gate_inp,
  3684. model.layers[il].ffn_up_exps,
  3685. model.layers[il].ffn_gate_exps,
  3686. model.layers[il].ffn_down_exps,
  3687. nullptr,
  3688. n_expert, n_expert_used,
  3689. LLM_FFN_SILU, true,
  3690. false, 0.0,
  3691. LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
  3692. il);
  3693. cb(cur, "ffn_moe_out", il);
  3694. }
  3695. // For Granite architecture
  3696. if (hparams.f_residual_scale) {
  3697. cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
  3698. }
  3699. cur = ggml_add(ctx0, cur, ffn_inp);
  3700. cb(cur, "ffn_out", il);
  3701. cur = build_cvec(cur, il);
  3702. cb(cur, "l_out", il);
  3703. // input for next layer
  3704. inpL = cur;
  3705. }
  3706. cur = inpL;
  3707. cur = build_norm(cur,
  3708. model.output_norm, NULL,
  3709. LLM_NORM_RMS, -1);
  3710. cb(cur, "result_norm", -1);
  3711. res->t_embd = cur;
  3712. // lm_head
  3713. cur = build_lora_mm(model.output, cur);
  3714. // For Granite architecture
  3715. if (hparams.f_logit_scale) {
  3716. cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_logit_scale);
  3717. }
  3718. cb(cur, "result_output", -1);
  3719. res->t_logits = cur;
  3720. ggml_build_forward_expand(gf, cur);
  3721. }
  3722. };
  3723. struct llm_build_deci : public llm_graph_context {
  3724. llm_build_deci(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
  3725. const int64_t n_embd_head = hparams.n_embd_head_v;
  3726. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  3727. GGML_ASSERT(n_embd_head == hparams.n_rot);
  3728. ggml_tensor * cur;
  3729. ggml_tensor * inpL;
  3730. inpL = build_inp_embd(model.tok_embd);
  3731. // inp_pos - contains the positions
  3732. ggml_tensor * inp_pos = build_inp_pos();
  3733. auto * inp_attn = build_attn_inp_kv_unified();
  3734. const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
  3735. for (int il = 0; il < n_layer; ++il) {
  3736. ggml_tensor * inpSA = inpL;
  3737. const int64_t n_head_kv = hparams.n_head_kv(il);
  3738. const int64_t n_head = hparams.n_head(il);
  3739. if (n_head == 0) {
  3740. // attention-free layer of Llama-3_1-Nemotron-51B
  3741. cur = inpL;
  3742. } else {
  3743. // norm
  3744. cur = build_norm(inpL,
  3745. model.layers[il].attn_norm, NULL,
  3746. LLM_NORM_RMS, il);
  3747. cb(cur, "attn_norm", il);
  3748. }
  3749. if (n_head > 0 && n_head_kv == 0) {
  3750. // "linear attention" of Llama-3_1-Nemotron-51B
  3751. cur = build_lora_mm(model.layers[il].wo, cur);
  3752. cb(cur, "wo", il);
  3753. } else if (n_head > 0) {
  3754. // self-attention
  3755. // rope freq factors for llama3; may return nullptr for llama2 and other models
  3756. ggml_tensor * rope_factors = static_cast<const llama_kv_cache_unified *>(memory)->cbs.get_rope_factors(n_ctx_per_seq, il);
  3757. // compute Q and K and RoPE them
  3758. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  3759. cb(Qcur, "Qcur", il);
  3760. if (model.layers[il].bq) {
  3761. Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
  3762. cb(Qcur, "Qcur", il);
  3763. }
  3764. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  3765. cb(Kcur, "Kcur", il);
  3766. if (model.layers[il].bk) {
  3767. Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
  3768. cb(Kcur, "Kcur", il);
  3769. }
  3770. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  3771. cb(Vcur, "Vcur", il);
  3772. if (model.layers[il].bv) {
  3773. Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
  3774. cb(Vcur, "Vcur", il);
  3775. }
  3776. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  3777. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  3778. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  3779. Qcur = ggml_rope_ext(
  3780. ctx0, Qcur, inp_pos, rope_factors,
  3781. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  3782. ext_factor, attn_factor, beta_fast, beta_slow
  3783. );
  3784. Kcur = ggml_rope_ext(
  3785. ctx0, Kcur, inp_pos, rope_factors,
  3786. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  3787. ext_factor, attn_factor, beta_fast, beta_slow
  3788. );
  3789. cb(Qcur, "Qcur", il);
  3790. cb(Kcur, "Kcur", il);
  3791. cb(Vcur, "Vcur", il);
  3792. cur = build_attn(inp_attn, gf,
  3793. model.layers[il].wo, model.layers[il].bo,
  3794. Qcur, Kcur, Vcur, nullptr, kq_scale, il);
  3795. }
  3796. if (il == n_layer - 1) {
  3797. // skip computing output for unused tokens
  3798. ggml_tensor * inp_out_ids = build_inp_out_ids();
  3799. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  3800. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  3801. }
  3802. // For Granite architecture
  3803. if (hparams.f_residual_scale) {
  3804. cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
  3805. }
  3806. // modified to support attention-free layer of Llama-3_1-Nemotron-51B
  3807. ggml_tensor * ffn_inp = cur;
  3808. if (n_head > 0) {
  3809. ffn_inp = ggml_add(ctx0, cur, inpSA);
  3810. cb(ffn_inp, "ffn_inp", il);
  3811. }
  3812. // feed-forward network
  3813. if (model.layers[il].ffn_gate_inp == nullptr) {
  3814. cur = build_norm(ffn_inp,
  3815. model.layers[il].ffn_norm, NULL,
  3816. LLM_NORM_RMS, il);
  3817. cb(cur, "ffn_norm", il);
  3818. cur = build_ffn(cur,
  3819. model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
  3820. model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
  3821. model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
  3822. NULL,
  3823. LLM_FFN_SILU, LLM_FFN_PAR, il);
  3824. cb(cur, "ffn_out", il);
  3825. }
  3826. // For Granite architecture
  3827. if (hparams.f_residual_scale) {
  3828. cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
  3829. }
  3830. cur = ggml_add(ctx0, cur, ffn_inp);
  3831. cb(cur, "ffn_out", il);
  3832. cur = build_cvec(cur, il);
  3833. cb(cur, "l_out", il);
  3834. // input for next layer
  3835. inpL = cur;
  3836. }
  3837. cur = inpL;
  3838. cur = build_norm(cur,
  3839. model.output_norm, NULL,
  3840. LLM_NORM_RMS, -1);
  3841. cb(cur, "result_norm", -1);
  3842. res->t_embd = cur;
  3843. // lm_head
  3844. cur = build_lora_mm(model.output, cur);
  3845. // For Granite architecture
  3846. if (hparams.f_logit_scale) {
  3847. cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_logit_scale);
  3848. }
  3849. cb(cur, "result_output", -1);
  3850. res->t_logits = cur;
  3851. ggml_build_forward_expand(gf, cur);
  3852. }
  3853. };
  3854. struct llm_build_baichuan : public llm_graph_context {
  3855. llm_build_baichuan(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
  3856. const int64_t n_embd_head = hparams.n_embd_head_v;
  3857. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  3858. GGML_ASSERT(n_embd_head == hparams.n_rot);
  3859. ggml_tensor * cur;
  3860. ggml_tensor * inpL;
  3861. inpL = build_inp_embd(model.tok_embd);
  3862. // inp_pos - contains the positions
  3863. ggml_tensor * inp_pos = model.type == LLM_TYPE_7B ? build_inp_pos() : nullptr;
  3864. auto * inp_attn = build_attn_inp_kv_unified();
  3865. for (int il = 0; il < n_layer; ++il) {
  3866. ggml_tensor * inpSA = inpL;
  3867. cur = build_norm(inpL,
  3868. model.layers[il].attn_norm, NULL,
  3869. LLM_NORM_RMS, il);
  3870. cb(cur, "attn_norm", il);
  3871. // self-attention
  3872. {
  3873. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  3874. cb(Qcur, "Qcur", il);
  3875. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  3876. cb(Kcur, "Kcur", il);
  3877. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  3878. cb(Vcur, "Vcur", il);
  3879. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  3880. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  3881. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  3882. switch (model.type) {
  3883. case LLM_TYPE_7B:
  3884. Qcur = ggml_rope_ext(
  3885. ctx0, Qcur, inp_pos, nullptr,
  3886. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  3887. ext_factor, attn_factor, beta_fast, beta_slow
  3888. );
  3889. Kcur = ggml_rope_ext(
  3890. ctx0, Kcur, inp_pos, nullptr,
  3891. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  3892. ext_factor, attn_factor, beta_fast, beta_slow
  3893. );
  3894. break;
  3895. case LLM_TYPE_13B:
  3896. break;
  3897. default:
  3898. GGML_ABORT("fatal error");
  3899. }
  3900. cb(Qcur, "Qcur", il);
  3901. cb(Kcur, "Kcur", il);
  3902. cb(Vcur, "Vcur", il);
  3903. cur = build_attn(inp_attn, gf,
  3904. model.layers[il].wo, NULL,
  3905. Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  3906. }
  3907. if (il == n_layer - 1) {
  3908. // skip computing output for unused tokens
  3909. ggml_tensor * inp_out_ids = build_inp_out_ids();
  3910. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  3911. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  3912. }
  3913. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  3914. cb(ffn_inp, "ffn_inp", il);
  3915. // feed-forward network
  3916. {
  3917. cur = build_norm(ffn_inp,
  3918. model.layers[il].ffn_norm, NULL,
  3919. LLM_NORM_RMS, il);
  3920. cb(cur, "ffn_norm", il);
  3921. cur = build_ffn(cur,
  3922. model.layers[il].ffn_up, NULL, NULL,
  3923. model.layers[il].ffn_gate, NULL, NULL,
  3924. model.layers[il].ffn_down, NULL, NULL,
  3925. NULL,
  3926. LLM_FFN_SILU, LLM_FFN_PAR, il);
  3927. cb(cur, "ffn_out", il);
  3928. }
  3929. cur = ggml_add(ctx0, cur, ffn_inp);
  3930. cur = build_cvec(cur, il);
  3931. cb(cur, "l_out", il);
  3932. // input for next layer
  3933. inpL = cur;
  3934. }
  3935. cur = inpL;
  3936. cur = build_norm(cur,
  3937. model.output_norm, NULL,
  3938. LLM_NORM_RMS, -1);
  3939. cb(cur, "result_norm", -1);
  3940. res->t_embd = cur;
  3941. // lm_head
  3942. cur = build_lora_mm(model.output, cur);
  3943. cb(cur, "result_output", -1);
  3944. res->t_logits = cur;
  3945. ggml_build_forward_expand(gf, cur);
  3946. }
  3947. };
  3948. struct llm_build_xverse : public llm_graph_context {
  3949. llm_build_xverse(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
  3950. const int64_t n_embd_head = hparams.n_embd_head_v;
  3951. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  3952. GGML_ASSERT(n_embd_head == hparams.n_rot);
  3953. ggml_tensor * cur;
  3954. ggml_tensor * inpL;
  3955. inpL = build_inp_embd(model.tok_embd);
  3956. // inp_pos - contains the positions
  3957. ggml_tensor * inp_pos = build_inp_pos();
  3958. auto * inp_attn = build_attn_inp_kv_unified();
  3959. for (int il = 0; il < n_layer; ++il) {
  3960. ggml_tensor * inpSA = inpL;
  3961. cur = build_norm(inpL,
  3962. model.layers[il].attn_norm, NULL,
  3963. LLM_NORM_RMS, il);
  3964. cb(cur, "attn_norm", il);
  3965. // self-attention
  3966. {
  3967. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  3968. cb(Qcur, "Qcur", il);
  3969. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  3970. cb(Kcur, "Kcur", il);
  3971. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  3972. cb(Vcur, "Vcur", il);
  3973. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  3974. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  3975. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  3976. Qcur = ggml_rope_ext(
  3977. ctx0, Qcur, inp_pos, nullptr,
  3978. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  3979. ext_factor, attn_factor, beta_fast, beta_slow
  3980. );
  3981. Kcur = ggml_rope_ext(
  3982. ctx0, Kcur, inp_pos, nullptr,
  3983. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  3984. ext_factor, attn_factor, beta_fast, beta_slow
  3985. );
  3986. cb(Qcur, "Qcur", il);
  3987. cb(Kcur, "Kcur", il);
  3988. cb(Vcur, "Vcur", il);
  3989. cur = build_attn(inp_attn, gf,
  3990. model.layers[il].wo, NULL,
  3991. Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  3992. }
  3993. if (il == n_layer - 1) {
  3994. // skip computing output for unused tokens
  3995. ggml_tensor * inp_out_ids = build_inp_out_ids();
  3996. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  3997. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  3998. }
  3999. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  4000. cb(ffn_inp, "ffn_inp", il);
  4001. // feed-forward network
  4002. {
  4003. cur = build_norm(ffn_inp,
  4004. model.layers[il].ffn_norm, NULL,
  4005. LLM_NORM_RMS, il);
  4006. cb(cur, "ffn_norm", il);
  4007. cur = build_ffn(cur,
  4008. model.layers[il].ffn_up, NULL, NULL,
  4009. model.layers[il].ffn_gate, NULL, NULL,
  4010. model.layers[il].ffn_down, NULL, NULL,
  4011. NULL,
  4012. LLM_FFN_SILU, LLM_FFN_PAR, il);
  4013. cb(cur, "ffn_out", il);
  4014. }
  4015. cur = ggml_add(ctx0, cur, ffn_inp);
  4016. cur = build_cvec(cur, il);
  4017. cb(cur, "l_out", il);
  4018. // input for next layer
  4019. inpL = cur;
  4020. }
  4021. cur = inpL;
  4022. cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
  4023. cb(cur, "result_norm", -1);
  4024. res->t_embd = cur;
  4025. // lm_head
  4026. cur = build_lora_mm(model.output, cur);
  4027. cb(cur, "result_output", -1);
  4028. res->t_logits = cur;
  4029. ggml_build_forward_expand(gf, cur);
  4030. }
  4031. };
  4032. struct llm_build_falcon : public llm_graph_context {
  4033. llm_build_falcon(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
  4034. const int64_t n_embd_head = hparams.n_embd_head_v;
  4035. const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
  4036. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  4037. GGML_ASSERT(n_embd_head == hparams.n_rot);
  4038. ggml_tensor * cur;
  4039. ggml_tensor * inpL;
  4040. inpL = build_inp_embd(model.tok_embd);
  4041. // inp_pos - contains the positions
  4042. ggml_tensor * inp_pos = build_inp_pos();
  4043. auto * inp_attn = build_attn_inp_kv_unified();
  4044. for (int il = 0; il < n_layer; ++il) {
  4045. ggml_tensor * attn_norm;
  4046. attn_norm = build_norm(inpL,
  4047. model.layers[il].attn_norm,
  4048. model.layers[il].attn_norm_b,
  4049. LLM_NORM, il);
  4050. cb(attn_norm, "attn_norm", il);
  4051. // self-attention
  4052. {
  4053. if (model.layers[il].attn_norm_2) {
  4054. // Falcon-40B
  4055. cur = build_norm(inpL,
  4056. model.layers[il].attn_norm_2,
  4057. model.layers[il].attn_norm_2_b,
  4058. LLM_NORM, il);
  4059. cb(cur, "attn_norm_2", il);
  4060. } else {
  4061. cur = attn_norm;
  4062. }
  4063. cur = build_lora_mm(model.layers[il].wqkv, cur);
  4064. cb(cur, "wqkv", il);
  4065. ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
  4066. ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
  4067. ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
  4068. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  4069. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  4070. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  4071. // using mode = 2 for neox mode
  4072. Qcur = ggml_rope_ext(
  4073. ctx0, Qcur, inp_pos, nullptr,
  4074. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  4075. ext_factor, attn_factor, beta_fast, beta_slow
  4076. );
  4077. Kcur = ggml_rope_ext(
  4078. ctx0, Kcur, inp_pos, nullptr,
  4079. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  4080. ext_factor, attn_factor, beta_fast, beta_slow
  4081. );
  4082. cb(Qcur, "Qcur", il);
  4083. cb(Kcur, "Kcur", il);
  4084. cb(Vcur, "Vcur", il);
  4085. cur = build_attn(inp_attn, gf,
  4086. model.layers[il].wo, NULL,
  4087. Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  4088. }
  4089. if (il == n_layer - 1) {
  4090. // skip computing output for unused tokens
  4091. ggml_tensor * inp_out_ids = build_inp_out_ids();
  4092. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  4093. inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
  4094. attn_norm = ggml_get_rows(ctx0, attn_norm, inp_out_ids);
  4095. }
  4096. ggml_tensor * ffn_inp = cur;
  4097. // feed forward
  4098. {
  4099. cur = build_ffn(attn_norm, // !! use the attn norm, not the result
  4100. model.layers[il].ffn_up, NULL, NULL,
  4101. NULL, NULL, NULL,
  4102. model.layers[il].ffn_down, NULL, NULL,
  4103. NULL,
  4104. LLM_FFN_GELU, LLM_FFN_SEQ, il);
  4105. cb(cur, "ffn_out", il);
  4106. }
  4107. cur = ggml_add(ctx0, cur, ffn_inp);
  4108. cur = ggml_add(ctx0, cur, inpL);
  4109. cur = build_cvec(cur, il);
  4110. cb(cur, "l_out", il);
  4111. // input for next layer
  4112. inpL = cur;
  4113. }
  4114. cur = inpL;
  4115. // norm
  4116. cur = build_norm(cur,
  4117. model.output_norm,
  4118. model.output_norm_b,
  4119. LLM_NORM, -1);
  4120. cb(cur, "result_norm", -1);
  4121. res->t_embd = cur;
  4122. cur = build_lora_mm(model.output, cur);
  4123. cb(cur, "result_output", -1);
  4124. res->t_logits = cur;
  4125. ggml_build_forward_expand(gf, cur);
  4126. }
  4127. };
  4128. struct llm_build_grok : public llm_graph_context {
  4129. llm_build_grok(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
  4130. const int64_t n_embd_head = hparams.n_embd_head_v;
  4131. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  4132. GGML_ASSERT(n_embd_head == hparams.n_rot);
  4133. ggml_tensor * cur;
  4134. ggml_tensor * inpL;
  4135. inpL = build_inp_embd(model.tok_embd);
  4136. // multiply by embedding_multiplier_scale of 78.38367176906169
  4137. inpL = ggml_scale(ctx0, inpL, 78.38367176906169f);
  4138. // inp_pos - contains the positions
  4139. ggml_tensor * inp_pos = build_inp_pos();
  4140. auto * inp_attn = build_attn_inp_kv_unified();
  4141. for (int il = 0; il < n_layer; ++il) {
  4142. ggml_tensor * inpSA = inpL;
  4143. // norm
  4144. cur = build_norm(inpL,
  4145. model.layers[il].attn_norm, NULL,
  4146. LLM_NORM_RMS, il);
  4147. cb(cur, "attn_norm", il);
  4148. // self-attention
  4149. {
  4150. // compute Q and K and RoPE them
  4151. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  4152. cb(Qcur, "Qcur", il);
  4153. if (model.layers[il].bq) {
  4154. Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
  4155. cb(Qcur, "Qcur", il);
  4156. }
  4157. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  4158. cb(Kcur, "Kcur", il);
  4159. if (model.layers[il].bk) {
  4160. Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
  4161. cb(Kcur, "Kcur", il);
  4162. }
  4163. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  4164. cb(Vcur, "Vcur", il);
  4165. if (model.layers[il].bv) {
  4166. Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
  4167. cb(Vcur, "Vcur", il);
  4168. }
  4169. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  4170. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  4171. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  4172. Qcur = ggml_rope_ext(
  4173. ctx0, Qcur, inp_pos, nullptr,
  4174. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  4175. ext_factor, attn_factor, beta_fast, beta_slow
  4176. );
  4177. Kcur = ggml_rope_ext(
  4178. ctx0, Kcur, inp_pos, nullptr,
  4179. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  4180. ext_factor, attn_factor, beta_fast, beta_slow
  4181. );
  4182. cb(Qcur, "Qcur", il);
  4183. cb(Kcur, "Kcur", il);
  4184. cb(Vcur, "Vcur", il);
  4185. cur = build_attn(inp_attn, gf,
  4186. model.layers[il].wo, model.layers[il].bo,
  4187. Qcur, Kcur, Vcur, nullptr, 1.0f, il);
  4188. }
  4189. if (il == n_layer - 1) {
  4190. // skip computing output for unused tokens
  4191. ggml_tensor * inp_out_ids = build_inp_out_ids();
  4192. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  4193. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  4194. }
  4195. // Grok
  4196. // if attn_out_norm is present then apply it before adding the input
  4197. if (model.layers[il].attn_out_norm) {
  4198. cur = build_norm(cur,
  4199. model.layers[il].attn_out_norm, NULL,
  4200. LLM_NORM_RMS, il);
  4201. cb(cur, "attn_out_norm", il);
  4202. }
  4203. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  4204. cb(ffn_inp, "ffn_inp", il);
  4205. // feed-forward network
  4206. // MoE branch
  4207. cur = build_norm(ffn_inp,
  4208. model.layers[il].ffn_norm, NULL,
  4209. LLM_NORM_RMS, il);
  4210. cb(cur, "ffn_norm", il);
  4211. cur = build_moe_ffn(cur,
  4212. model.layers[il].ffn_gate_inp,
  4213. model.layers[il].ffn_up_exps,
  4214. model.layers[il].ffn_gate_exps,
  4215. model.layers[il].ffn_down_exps,
  4216. nullptr,
  4217. n_expert, n_expert_used,
  4218. LLM_FFN_GELU, true,
  4219. false, 0.0,
  4220. LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
  4221. il);
  4222. cb(cur, "ffn_moe_out", il);
  4223. // Grok
  4224. // if layer_out_norm is present then apply it before adding the input
  4225. // Idea: maybe ffn_out_norm is a better name
  4226. if (model.layers[il].layer_out_norm) {
  4227. cur = build_norm(cur,
  4228. model.layers[il].layer_out_norm, NULL,
  4229. LLM_NORM_RMS, il);
  4230. cb(cur, "layer_out_norm", il);
  4231. }
  4232. cur = ggml_add(ctx0, cur, ffn_inp);
  4233. cb(cur, "ffn_out", il);
  4234. cur = build_cvec(cur, il);
  4235. cb(cur, "l_out", il);
  4236. // input for next layer
  4237. inpL = cur;
  4238. }
  4239. cur = inpL;
  4240. cur = build_norm(cur,
  4241. model.output_norm, NULL,
  4242. LLM_NORM_RMS, -1);
  4243. cb(cur, "result_norm", -1);
  4244. res->t_embd = cur;
  4245. // lm_head
  4246. cur = build_lora_mm(model.output, cur);
  4247. // Grok
  4248. // multiply logits by output_multiplier_scale of 0.5773502691896257
  4249. cur = ggml_scale(ctx0, cur, 0.5773502691896257f);
  4250. cb(cur, "result_output", -1);
  4251. res->t_logits = cur;
  4252. ggml_build_forward_expand(gf, cur);
  4253. }
  4254. };
  4255. struct llm_build_dbrx : public llm_graph_context {
  4256. llm_build_dbrx(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
  4257. const int64_t n_embd_head = hparams.n_embd_head_v;
  4258. const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
  4259. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  4260. GGML_ASSERT(n_embd_head == hparams.n_rot);
  4261. ggml_tensor * cur;
  4262. ggml_tensor * inpL;
  4263. inpL = build_inp_embd(model.tok_embd);
  4264. // inp_pos - contains the positions
  4265. ggml_tensor * inp_pos = build_inp_pos();
  4266. auto * inp_attn = build_attn_inp_kv_unified();
  4267. for (int il = 0; il < n_layer; ++il) {
  4268. ggml_tensor * inpSA = inpL;
  4269. // norm
  4270. cur = build_norm(inpL,
  4271. model.layers[il].attn_norm, NULL,
  4272. LLM_NORM, il);
  4273. cb(cur, "attn_norm", il);
  4274. // self-attention
  4275. {
  4276. ggml_tensor * Qcur = nullptr;
  4277. ggml_tensor * Kcur = nullptr;
  4278. ggml_tensor * Vcur = nullptr;
  4279. cur = build_lora_mm(model.layers[il].wqkv, cur);
  4280. cb(cur, "wqkv", il);
  4281. cur = ggml_clamp(ctx0, cur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
  4282. cb(cur, "wqkv_clamped", il);
  4283. Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
  4284. Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
  4285. Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
  4286. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  4287. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  4288. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  4289. Qcur = ggml_rope_ext(
  4290. ctx0, Qcur, inp_pos, nullptr,
  4291. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  4292. ext_factor, attn_factor, beta_fast, beta_slow
  4293. );
  4294. Kcur = ggml_rope_ext(
  4295. ctx0, Kcur, inp_pos, nullptr,
  4296. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  4297. ext_factor, attn_factor, beta_fast, beta_slow
  4298. );
  4299. cb(Qcur, "Qcur", il);
  4300. cb(Kcur, "Kcur", il);
  4301. cb(Vcur, "Vcur", il);
  4302. cur = build_attn(inp_attn, gf,
  4303. model.layers[il].wo, NULL,
  4304. Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  4305. }
  4306. if (il == n_layer - 1) {
  4307. // skip computing output for unused tokens
  4308. ggml_tensor * inp_out_ids = build_inp_out_ids();
  4309. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  4310. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  4311. }
  4312. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  4313. cb(ffn_inp, "ffn_inp", il);
  4314. // feed-forward network
  4315. // MoE branch
  4316. cur = build_norm(ffn_inp,
  4317. model.layers[il].attn_out_norm, NULL,
  4318. LLM_NORM, il);
  4319. cb(cur, "attn_out_norm", il);
  4320. cur = build_moe_ffn(cur,
  4321. model.layers[il].ffn_gate_inp,
  4322. model.layers[il].ffn_up_exps,
  4323. model.layers[il].ffn_gate_exps,
  4324. model.layers[il].ffn_down_exps,
  4325. nullptr,
  4326. n_expert, n_expert_used,
  4327. LLM_FFN_SILU, true,
  4328. false, 0.0,
  4329. LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
  4330. il);
  4331. cb(cur, "ffn_moe_out", il);
  4332. cur = ggml_add(ctx0, cur, ffn_inp);
  4333. cb(cur, "ffn_out", il);
  4334. cur = build_cvec(cur, il);
  4335. cb(cur, "l_out", il);
  4336. // input for next layer
  4337. inpL = cur;
  4338. }
  4339. cur = inpL;
  4340. cur = build_norm(cur,
  4341. model.output_norm, NULL,
  4342. LLM_NORM, -1);
  4343. cb(cur, "result_norm", -1);
  4344. res->t_embd = cur;
  4345. // lm_head
  4346. cur = build_lora_mm(model.output, cur);
  4347. cb(cur, "result_output", -1);
  4348. res->t_logits = cur;
  4349. ggml_build_forward_expand(gf, cur);
  4350. }
  4351. };
  4352. struct llm_build_starcoder : public llm_graph_context {
  4353. llm_build_starcoder(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
  4354. const int64_t n_embd_head = hparams.n_embd_head_v;
  4355. const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
  4356. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  4357. ggml_tensor * cur;
  4358. ggml_tensor * inpL;
  4359. inpL = build_inp_embd(model.tok_embd);
  4360. // inp_pos - contains the positions
  4361. ggml_tensor * inp_pos = build_inp_pos();
  4362. auto * inp_attn = build_attn_inp_kv_unified();
  4363. ggml_tensor * pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos);
  4364. cb(pos, "pos_embd", -1);
  4365. inpL = ggml_add(ctx0, inpL, pos);
  4366. cb(inpL, "inpL", -1);
  4367. for (int il = 0; il < n_layer; ++il) {
  4368. cur = build_norm(inpL,
  4369. model.layers[il].attn_norm,
  4370. model.layers[il].attn_norm_b,
  4371. LLM_NORM, il);
  4372. cb(cur, "attn_norm", il);
  4373. // self-attention
  4374. {
  4375. cur = build_lora_mm(model.layers[il].wqkv, cur);
  4376. cb(cur, "wqkv", il);
  4377. cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
  4378. cb(cur, "bqkv", il);
  4379. ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
  4380. ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
  4381. ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
  4382. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  4383. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  4384. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  4385. cb(Qcur, "Qcur", il);
  4386. cb(Kcur, "Kcur", il);
  4387. cb(Vcur, "Vcur", il);
  4388. cur = build_attn(inp_attn, gf,
  4389. model.layers[il].wo, model.layers[il].bo,
  4390. Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  4391. }
  4392. if (il == n_layer - 1) {
  4393. // skip computing output for unused tokens
  4394. ggml_tensor * inp_out_ids = build_inp_out_ids();
  4395. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  4396. inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
  4397. }
  4398. // add the input
  4399. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
  4400. cb(ffn_inp, "ffn_inp", il);
  4401. // FF
  4402. {
  4403. cur = build_norm(ffn_inp,
  4404. model.layers[il].ffn_norm,
  4405. model.layers[il].ffn_norm_b,
  4406. LLM_NORM, il);
  4407. cb(cur, "ffn_norm", il);
  4408. cur = build_ffn(cur,
  4409. model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
  4410. NULL, NULL, NULL,
  4411. model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
  4412. NULL,
  4413. LLM_FFN_GELU, LLM_FFN_SEQ, il);
  4414. cb(cur, "ffn_out", il);
  4415. }
  4416. cur = ggml_add(ctx0, cur, ffn_inp);
  4417. cur = build_cvec(cur, il);
  4418. cb(cur, "l_out", il);
  4419. // input for next layer
  4420. inpL = cur;
  4421. }
  4422. cur = build_norm(inpL,
  4423. model.output_norm,
  4424. model.output_norm_b,
  4425. LLM_NORM, -1);
  4426. cb(cur, "result_norm", -1);
  4427. res->t_embd = cur;
  4428. cur = build_lora_mm(model.output, cur);
  4429. cb(cur, "result_output", -1);
  4430. res->t_logits = cur;
  4431. ggml_build_forward_expand(gf, cur);
  4432. }
  4433. };
  4434. struct llm_build_refact : public llm_graph_context {
  4435. llm_build_refact(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
  4436. const int64_t n_embd_head = hparams.n_embd_head_v;
  4437. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  4438. ggml_tensor * cur;
  4439. ggml_tensor * inpL;
  4440. inpL = build_inp_embd(model.tok_embd);
  4441. auto * inp_attn = build_attn_inp_kv_unified();
  4442. for (int il = 0; il < n_layer; ++il) {
  4443. ggml_tensor * inpSA = inpL;
  4444. cur = build_norm(inpL,
  4445. model.layers[il].attn_norm, NULL,
  4446. LLM_NORM_RMS, il);
  4447. cb(cur, "attn_norm", il);
  4448. // self-attention
  4449. {
  4450. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  4451. cb(Qcur, "Qcur", il);
  4452. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  4453. cb(Kcur, "Kcur", il);
  4454. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  4455. cb(Vcur, "Vcur", il);
  4456. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  4457. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  4458. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  4459. cb(Qcur, "Qcur", il);
  4460. cb(Kcur, "Kcur", il);
  4461. cb(Vcur, "Vcur", il);
  4462. cur = build_attn(inp_attn, gf,
  4463. model.layers[il].wo, NULL,
  4464. Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  4465. }
  4466. if (il == n_layer - 1) {
  4467. // skip computing output for unused tokens
  4468. ggml_tensor * inp_out_ids = build_inp_out_ids();
  4469. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  4470. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  4471. }
  4472. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  4473. cb(ffn_inp, "ffn_inp", il);
  4474. // feed-forward network
  4475. {
  4476. cur = build_norm(ffn_inp,
  4477. model.layers[il].ffn_norm, NULL,
  4478. LLM_NORM_RMS, il);
  4479. cb(cur, "ffn_norm", il);
  4480. cur = build_ffn(cur,
  4481. model.layers[il].ffn_up, NULL, NULL,
  4482. model.layers[il].ffn_gate, NULL, NULL,
  4483. model.layers[il].ffn_down, NULL, NULL,
  4484. NULL,
  4485. LLM_FFN_SILU, LLM_FFN_PAR, il);
  4486. cb(cur, "ffn_out", il);
  4487. }
  4488. cur = ggml_add(ctx0, cur, ffn_inp);
  4489. cur = build_cvec(cur, il);
  4490. cb(cur, "l_out", il);
  4491. // input for next layer
  4492. inpL = cur;
  4493. }
  4494. cur = inpL;
  4495. cur = build_norm(cur,
  4496. model.output_norm, NULL,
  4497. LLM_NORM_RMS, -1);
  4498. cb(cur, "result_norm", -1);
  4499. res->t_embd = cur;
  4500. // lm_head
  4501. cur = build_lora_mm(model.output, cur);
  4502. cb(cur, "result_output", -1);
  4503. res->t_logits = cur;
  4504. ggml_build_forward_expand(gf, cur);
  4505. }
  4506. };
  4507. struct llm_build_bert : public llm_graph_context {
  4508. llm_build_bert(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
  4509. const int64_t n_embd_head = hparams.n_embd_head_v;
  4510. const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
  4511. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  4512. ggml_tensor * cur;
  4513. ggml_tensor * inpL;
  4514. ggml_tensor * inp_pos = nullptr;
  4515. if (model.arch != LLM_ARCH_JINA_BERT_V2) {
  4516. inp_pos = build_inp_pos();
  4517. }
  4518. // construct input embeddings (token, type, position)
  4519. inpL = build_inp_embd(model.tok_embd);
  4520. // token types are hardcoded to zero ("Sentence A")
  4521. ggml_tensor * type_row0 = ggml_view_1d(ctx0, model.type_embd, n_embd, 0);
  4522. inpL = ggml_add(ctx0, inpL, type_row0);
  4523. if (model.arch == LLM_ARCH_BERT) {
  4524. inpL = ggml_add(ctx0, ggml_get_rows(ctx0, model.pos_embd, inp_pos), inpL);
  4525. }
  4526. cb(inpL, "inp_embd", -1);
  4527. // embed layer norm
  4528. inpL = build_norm(inpL, model.tok_norm, model.tok_norm_b, LLM_NORM, -1);
  4529. cb(inpL, "inp_norm", -1);
  4530. auto * inp_attn = build_attn_inp_no_cache();
  4531. // iterate layers
  4532. for (int il = 0; il < n_layer; ++il) {
  4533. ggml_tensor * cur = inpL;
  4534. ggml_tensor * Qcur;
  4535. ggml_tensor * Kcur;
  4536. ggml_tensor * Vcur;
  4537. // self-attention
  4538. if (model.arch == LLM_ARCH_BERT || model.arch == LLM_ARCH_JINA_BERT_V2) {
  4539. Qcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wq, cur), model.layers[il].bq);
  4540. if (model.layers[il].attn_q_norm) {
  4541. Qcur = build_norm(Qcur,
  4542. model.layers[il].attn_q_norm,
  4543. model.layers[il].attn_q_norm_b,
  4544. LLM_NORM, il);
  4545. }
  4546. Kcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wk, cur), model.layers[il].bk);
  4547. if (model.layers[il].attn_k_norm) {
  4548. Kcur = build_norm(Kcur,
  4549. model.layers[il].attn_k_norm,
  4550. model.layers[il].attn_k_norm_b,
  4551. LLM_NORM, il);
  4552. }
  4553. Vcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wv, cur), model.layers[il].bv);
  4554. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  4555. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  4556. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  4557. } else {
  4558. // compute Q and K and RoPE them
  4559. cur = build_lora_mm(model.layers[il].wqkv, cur);
  4560. cb(cur, "wqkv", il);
  4561. Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
  4562. Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
  4563. Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
  4564. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  4565. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  4566. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  4567. Qcur = ggml_rope_ext(
  4568. ctx0, Qcur, inp_pos, nullptr,
  4569. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  4570. ext_factor, attn_factor, beta_fast, beta_slow
  4571. );
  4572. Kcur = ggml_rope_ext(
  4573. ctx0, Kcur, inp_pos, nullptr,
  4574. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  4575. ext_factor, attn_factor, beta_fast, beta_slow
  4576. );
  4577. }
  4578. cb(Qcur, "Qcur", il);
  4579. cb(Kcur, "Kcur", il);
  4580. cb(Vcur, "Vcur", il);
  4581. cur = build_attn(inp_attn, gf,
  4582. model.layers[il].wo, model.layers[il].bo,
  4583. Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  4584. cb(cur, "kqv_out", il);
  4585. if (il == n_layer - 1 && pooling_type == LLAMA_POOLING_TYPE_NONE) {
  4586. // skip computing output for unused tokens
  4587. ggml_tensor * inp_out_ids = build_inp_out_ids();
  4588. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  4589. inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
  4590. }
  4591. // re-add the layer input
  4592. cur = ggml_add(ctx0, cur, inpL);
  4593. // attention layer norm
  4594. cur = build_norm(cur, model.layers[il].attn_out_norm, model.layers[il].attn_out_norm_b, LLM_NORM, il);
  4595. if (model.layers[il].attn_norm_2 != nullptr) {
  4596. cur = ggml_add(ctx0, cur, inpL); // re-add the layer input
  4597. cur = build_norm(cur, model.layers[il].attn_norm_2, model.layers[il].attn_norm_2_b, LLM_NORM, il);
  4598. }
  4599. ggml_tensor * ffn_inp = cur;
  4600. cb(ffn_inp, "ffn_inp", il);
  4601. // feed-forward network
  4602. if (model.arch == LLM_ARCH_BERT) {
  4603. cur = build_ffn(cur,
  4604. model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
  4605. NULL, NULL, NULL,
  4606. model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
  4607. NULL,
  4608. LLM_FFN_GELU, LLM_FFN_SEQ, il);
  4609. } else if (model.arch == LLM_ARCH_JINA_BERT_V2) {
  4610. cur = build_ffn(cur,
  4611. model.layers[il].ffn_up, NULL, NULL,
  4612. model.layers[il].ffn_gate, NULL, NULL,
  4613. model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
  4614. NULL,
  4615. LLM_FFN_GELU, LLM_FFN_PAR, il);
  4616. } else {
  4617. cur = build_ffn(cur,
  4618. model.layers[il].ffn_up, NULL, NULL,
  4619. model.layers[il].ffn_gate, NULL, NULL,
  4620. model.layers[il].ffn_down, NULL, NULL,
  4621. NULL,
  4622. LLM_FFN_SILU, LLM_FFN_PAR, il);
  4623. }
  4624. cb(cur, "ffn_out", il);
  4625. // attentions bypass the intermediate layer
  4626. cur = ggml_add(ctx0, cur, ffn_inp);
  4627. // output layer norm
  4628. cur = build_norm(cur, model.layers[il].layer_out_norm, model.layers[il].layer_out_norm_b, LLM_NORM, il);
  4629. // input for next layer
  4630. inpL = cur;
  4631. }
  4632. cur = inpL;
  4633. cb(cur, "result_embd", -1);
  4634. res->t_embd = cur;
  4635. ggml_build_forward_expand(gf, cur);
  4636. }
  4637. };
  4638. struct llm_build_bloom : public llm_graph_context {
  4639. llm_build_bloom(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
  4640. const int64_t n_embd_head = hparams.n_embd_head_v;
  4641. const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
  4642. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  4643. ggml_tensor * cur;
  4644. ggml_tensor * inpL;
  4645. inpL = build_inp_embd(model.tok_embd);
  4646. auto * inp_attn = build_attn_inp_kv_unified();
  4647. inpL = build_norm(inpL,
  4648. model.tok_norm,
  4649. model.tok_norm_b,
  4650. LLM_NORM, -1);
  4651. cb(inpL, "inp_norm", -1);
  4652. for (int il = 0; il < n_layer; ++il) {
  4653. cur = build_norm(inpL,
  4654. model.layers[il].attn_norm,
  4655. model.layers[il].attn_norm_b,
  4656. LLM_NORM, il);
  4657. cb(cur, "attn_norm", il);
  4658. // self-attention
  4659. {
  4660. cur = build_lora_mm(model.layers[il].wqkv, cur);
  4661. cb(cur, "wqkv", il);
  4662. cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
  4663. cb(cur, "bqkv", il);
  4664. ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
  4665. ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
  4666. ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
  4667. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  4668. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  4669. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  4670. cb(Qcur, "Qcur", il);
  4671. cb(Kcur, "Kcur", il);
  4672. cb(Vcur, "Vcur", il);
  4673. cur = build_attn(inp_attn, gf,
  4674. model.layers[il].wo, model.layers[il].bo,
  4675. Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  4676. }
  4677. if (il == n_layer - 1) {
  4678. // skip computing output for unused tokens
  4679. ggml_tensor * inp_out_ids = build_inp_out_ids();
  4680. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  4681. inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
  4682. }
  4683. // Add the input
  4684. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
  4685. cb(ffn_inp, "ffn_inp", il);
  4686. // FF
  4687. {
  4688. cur = build_norm(ffn_inp,
  4689. model.layers[il].ffn_norm,
  4690. model.layers[il].ffn_norm_b,
  4691. LLM_NORM, il);
  4692. cb(cur, "ffn_norm", il);
  4693. cur = build_ffn(cur,
  4694. model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
  4695. NULL, NULL, NULL,
  4696. model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
  4697. NULL,
  4698. LLM_FFN_GELU, LLM_FFN_SEQ, il);
  4699. cb(cur, "ffn_out", il);
  4700. }
  4701. cur = ggml_add(ctx0, cur, ffn_inp);
  4702. cur = build_cvec(cur, il);
  4703. cb(cur, "l_out", il);
  4704. // input for next layer
  4705. inpL = cur;
  4706. }
  4707. cur = build_norm(inpL,
  4708. model.output_norm,
  4709. model.output_norm_b,
  4710. LLM_NORM, -1);
  4711. cb(cur, "result_norm", -1);
  4712. res->t_embd = cur;
  4713. cur = build_lora_mm(model.output, cur);
  4714. cb(cur, "result_output", -1);
  4715. res->t_logits = cur;
  4716. ggml_build_forward_expand(gf, cur);
  4717. }
  4718. };
  4719. struct llm_build_mpt : public llm_graph_context {
  4720. llm_build_mpt(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
  4721. const int64_t n_embd_head = hparams.n_embd_head_v;
  4722. const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
  4723. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  4724. ggml_tensor * cur;
  4725. ggml_tensor * pos;
  4726. ggml_tensor * inpL;
  4727. inpL = build_inp_embd(model.tok_embd);
  4728. auto * inp_attn = build_attn_inp_kv_unified();
  4729. if (model.pos_embd) {
  4730. // inp_pos - contains the positions
  4731. ggml_tensor * inp_pos = build_inp_pos();
  4732. pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos);
  4733. cb(pos, "pos_embd", -1);
  4734. inpL = ggml_add(ctx0, inpL, pos);
  4735. cb(inpL, "inpL", -1);
  4736. }
  4737. for (int il = 0; il < n_layer; ++il) {
  4738. ggml_tensor * attn_norm;
  4739. attn_norm = build_norm(inpL,
  4740. model.layers[il].attn_norm,
  4741. model.layers[il].attn_norm_b,
  4742. LLM_NORM, il);
  4743. cb(attn_norm, "attn_norm", il);
  4744. // self-attention
  4745. {
  4746. cur = attn_norm;
  4747. cur = build_lora_mm(model.layers[il].wqkv, cur);
  4748. cb(cur, "wqkv", il);
  4749. if (model.layers[il].bqkv){
  4750. cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
  4751. cb(cur, "bqkv", il);
  4752. }
  4753. if (hparams.f_clamp_kqv > 0.0f) {
  4754. cur = ggml_clamp(ctx0, cur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
  4755. cb(cur, "wqkv_clamped", il);
  4756. }
  4757. ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
  4758. ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
  4759. ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
  4760. cb(Qcur, "Qcur", il);
  4761. cb(Kcur, "Kcur", il);
  4762. cb(Vcur, "Vcur", il);
  4763. // Q/K Layernorm
  4764. if (model.layers[il].attn_q_norm) {
  4765. Qcur = build_norm(Qcur,
  4766. model.layers[il].attn_q_norm,
  4767. model.layers[il].attn_q_norm_b,
  4768. LLM_NORM, il);
  4769. cb(Qcur, "Qcur", il);
  4770. Kcur = build_norm(Kcur,
  4771. model.layers[il].attn_k_norm,
  4772. model.layers[il].attn_k_norm_b,
  4773. LLM_NORM, il);
  4774. cb(Kcur, "Kcur", il);
  4775. }
  4776. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  4777. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  4778. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  4779. cb(Qcur, "Qcur", il);
  4780. cb(Kcur, "Kcur", il);
  4781. cb(Vcur, "Vcur", il);
  4782. cur = build_attn(inp_attn, gf,
  4783. model.layers[il].wo, model.layers[il].bo,
  4784. Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  4785. }
  4786. if (il == n_layer - 1) {
  4787. // skip computing output for unused tokens
  4788. ggml_tensor * inp_out_ids = build_inp_out_ids();
  4789. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  4790. inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
  4791. }
  4792. // Add the input
  4793. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
  4794. cb(ffn_inp, "ffn_inp", il);
  4795. // feed forward
  4796. {
  4797. cur = build_norm(ffn_inp,
  4798. model.layers[il].ffn_norm,
  4799. model.layers[il].ffn_norm_b,
  4800. LLM_NORM, il);
  4801. cb(cur, "ffn_norm", il);
  4802. cur = build_ffn(cur,
  4803. model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
  4804. NULL, NULL, NULL,
  4805. model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
  4806. model.layers[il].ffn_act,
  4807. LLM_FFN_GELU, LLM_FFN_SEQ, il);
  4808. cb(cur, "ffn_out", il);
  4809. }
  4810. cur = ggml_add(ctx0, cur, ffn_inp);
  4811. cur = build_cvec(cur, il);
  4812. cb(cur, "l_out", il);
  4813. // input for next layer
  4814. inpL = cur;
  4815. }
  4816. cur = inpL;
  4817. cur = build_norm(cur,
  4818. model.output_norm,
  4819. model.output_norm_b,
  4820. LLM_NORM, -1);
  4821. cb(cur, "result_norm", -1);
  4822. res->t_embd = cur;
  4823. cur = build_lora_mm(model.output, cur);
  4824. cb(cur, "result_output", -1);
  4825. res->t_logits = cur;
  4826. ggml_build_forward_expand(gf, cur);
  4827. }
  4828. };
  4829. struct llm_build_stablelm : public llm_graph_context {
  4830. llm_build_stablelm(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
  4831. const int64_t n_embd_head = hparams.n_embd_head_v;
  4832. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  4833. ggml_tensor * cur;
  4834. ggml_tensor * inpL;
  4835. inpL = build_inp_embd(model.tok_embd);
  4836. // inp_pos - contains the positions
  4837. ggml_tensor * inp_pos = build_inp_pos();
  4838. auto * inp_attn = build_attn_inp_kv_unified();
  4839. for (int il = 0; il < n_layer; ++il) {
  4840. // norm
  4841. cur = build_norm(inpL,
  4842. model.layers[il].attn_norm,
  4843. model.layers[il].attn_norm_b,
  4844. LLM_NORM, il);
  4845. cb(cur, "attn_norm", il);
  4846. ggml_tensor * inpSA = cur;
  4847. // self-attention
  4848. {
  4849. // compute Q and K and RoPE them
  4850. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  4851. cb(Qcur, "Qcur", il);
  4852. if (model.layers[il].bq) {
  4853. Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
  4854. cb(Qcur, "Qcur", il);
  4855. }
  4856. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  4857. cb(Kcur, "Kcur", il);
  4858. if (model.layers[il].bk) {
  4859. Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
  4860. cb(Kcur, "Kcur", il);
  4861. }
  4862. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  4863. cb(Vcur, "Vcur", il);
  4864. if (model.layers[il].bv) {
  4865. Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
  4866. cb(Vcur, "Vcur", il);
  4867. }
  4868. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  4869. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  4870. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  4871. if (model.layers[il].attn_q_norm) {
  4872. Qcur = build_norm(Qcur,
  4873. model.layers[il].attn_q_norm,
  4874. NULL,
  4875. LLM_NORM, il);
  4876. cb(Qcur, "Qcur", il);
  4877. }
  4878. if (model.layers[il].attn_k_norm) {
  4879. Kcur = build_norm(Kcur,
  4880. model.layers[il].attn_k_norm,
  4881. NULL,
  4882. LLM_NORM, il);
  4883. cb(Kcur, "Kcur", il);
  4884. }
  4885. Qcur = ggml_rope_ext(
  4886. ctx0, Qcur, inp_pos, nullptr,
  4887. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  4888. ext_factor, attn_factor, beta_fast, beta_slow
  4889. );
  4890. Kcur = ggml_rope_ext(
  4891. ctx0, Kcur, inp_pos, nullptr,
  4892. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  4893. ext_factor, attn_factor, beta_fast, beta_slow
  4894. );
  4895. cb(Qcur, "Qcur", il);
  4896. cb(Kcur, "Kcur", il);
  4897. cb(Vcur, "Vcur", il);
  4898. cur = build_attn(inp_attn, gf,
  4899. model.layers[il].wo, NULL,
  4900. Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  4901. }
  4902. if (il == n_layer - 1) {
  4903. // skip computing output for unused tokens
  4904. ggml_tensor * inp_out_ids = build_inp_out_ids();
  4905. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  4906. inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
  4907. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  4908. }
  4909. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
  4910. cb(ffn_inp, "ffn_inp", il);
  4911. // feed-forward network
  4912. {
  4913. if (model.layers[il].ffn_norm) {
  4914. cur = build_norm(ffn_inp,
  4915. model.layers[il].ffn_norm,
  4916. model.layers[il].ffn_norm_b,
  4917. LLM_NORM, il);
  4918. cb(cur, "ffn_norm", il);
  4919. } else {
  4920. // parallel residual
  4921. cur = inpSA;
  4922. }
  4923. cur = build_ffn(cur,
  4924. model.layers[il].ffn_up, NULL, NULL,
  4925. model.layers[il].ffn_gate, NULL, NULL,
  4926. model.layers[il].ffn_down, NULL, NULL,
  4927. NULL,
  4928. LLM_FFN_SILU, LLM_FFN_PAR, il);
  4929. cb(cur, "ffn_out", il);
  4930. }
  4931. cur = ggml_add(ctx0, cur, ffn_inp);
  4932. cur = build_cvec(cur, il);
  4933. cb(cur, "l_out", il);
  4934. // input for next layer
  4935. inpL = cur;
  4936. }
  4937. cur = inpL;
  4938. cur = build_norm(cur,
  4939. model.output_norm,
  4940. model.output_norm_b,
  4941. LLM_NORM, -1);
  4942. cb(cur, "result_norm", -1);
  4943. res->t_embd = cur;
  4944. // lm_head
  4945. cur = build_lora_mm(model.output, cur);
  4946. cb(cur, "result_output", -1);
  4947. res->t_logits = cur;
  4948. ggml_build_forward_expand(gf, cur);
  4949. }
  4950. };
  4951. struct llm_build_qwen : public llm_graph_context {
  4952. llm_build_qwen(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
  4953. const int64_t n_embd_head = hparams.n_embd_head_v;
  4954. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  4955. ggml_tensor * cur;
  4956. ggml_tensor * inpL;
  4957. inpL = build_inp_embd(model.tok_embd);
  4958. // inp_pos - contains the positions
  4959. ggml_tensor * inp_pos = build_inp_pos();
  4960. auto * inp_attn = build_attn_inp_kv_unified();
  4961. for (int il = 0; il < n_layer; ++il) {
  4962. ggml_tensor * inpSA = inpL;
  4963. cur = build_norm(inpL,
  4964. model.layers[il].attn_norm, NULL,
  4965. LLM_NORM_RMS, il);
  4966. cb(cur, "attn_norm", il);
  4967. // self-attention
  4968. {
  4969. cur = build_lora_mm(model.layers[il].wqkv, cur);
  4970. cb(cur, "wqkv", il);
  4971. cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
  4972. cb(cur, "bqkv", il);
  4973. ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
  4974. ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
  4975. ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 2*sizeof(float)*(n_embd)));
  4976. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  4977. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  4978. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  4979. // using mode = 2 for neox mode
  4980. Qcur = ggml_rope_ext(
  4981. ctx0, Qcur, inp_pos, nullptr,
  4982. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  4983. ext_factor, attn_factor, beta_fast, beta_slow
  4984. );
  4985. Kcur = ggml_rope_ext(
  4986. ctx0, Kcur, inp_pos, nullptr,
  4987. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  4988. ext_factor, attn_factor, beta_fast, beta_slow
  4989. );
  4990. cb(Qcur, "Qcur", il);
  4991. cb(Kcur, "Kcur", il);
  4992. cb(Vcur, "Vcur", il);
  4993. cur = build_attn(inp_attn, gf,
  4994. model.layers[il].wo, NULL,
  4995. Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  4996. }
  4997. if (il == n_layer - 1) {
  4998. // skip computing output for unused tokens
  4999. ggml_tensor * inp_out_ids = build_inp_out_ids();
  5000. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  5001. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  5002. }
  5003. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  5004. cb(ffn_inp, "ffn_inp", il);
  5005. // feed-forward forward
  5006. {
  5007. cur = build_norm(ffn_inp,
  5008. model.layers[il].ffn_norm, NULL,
  5009. LLM_NORM_RMS, il);
  5010. cb(cur, "ffn_norm", il);
  5011. cur = build_ffn(cur,
  5012. model.layers[il].ffn_up, NULL, NULL,
  5013. model.layers[il].ffn_gate, NULL, NULL,
  5014. model.layers[il].ffn_down, NULL, NULL,
  5015. NULL,
  5016. LLM_FFN_SILU, LLM_FFN_PAR, il);
  5017. cb(cur, "ffn_out", il);
  5018. }
  5019. cur = ggml_add(ctx0, cur, ffn_inp);
  5020. cur = build_cvec(cur, il);
  5021. cb(cur, "l_out", il);
  5022. // input for next layer
  5023. inpL = cur;
  5024. }
  5025. cur = inpL;
  5026. cur = build_norm(cur,
  5027. model.output_norm, NULL,
  5028. LLM_NORM_RMS, -1);
  5029. cb(cur, "result_norm", -1);
  5030. res->t_embd = cur;
  5031. // lm_head
  5032. cur = build_lora_mm(model.output, cur);
  5033. cb(cur, "result_output", -1);
  5034. res->t_logits = cur;
  5035. ggml_build_forward_expand(gf, cur);
  5036. }
  5037. };
  5038. struct llm_build_qwen2 : public llm_graph_context {
  5039. llm_build_qwen2(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
  5040. const int64_t n_embd_head = hparams.n_embd_head_v;
  5041. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  5042. GGML_ASSERT(n_embd_head == hparams.n_rot);
  5043. ggml_tensor * cur;
  5044. ggml_tensor * inpL;
  5045. inpL = build_inp_embd(model.tok_embd);
  5046. // inp_pos - contains the positions
  5047. ggml_tensor * inp_pos = build_inp_pos();
  5048. auto * inp_attn = build_attn_inp_kv_unified();
  5049. for (int il = 0; il < n_layer; ++il) {
  5050. ggml_tensor * inpSA = inpL;
  5051. // norm
  5052. cur = build_norm(inpL,
  5053. model.layers[il].attn_norm, NULL,
  5054. LLM_NORM_RMS, il);
  5055. cb(cur, "attn_norm", il);
  5056. // self-attention
  5057. {
  5058. // compute Q and K and RoPE them
  5059. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  5060. Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
  5061. cb(Qcur, "Qcur", il);
  5062. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  5063. Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
  5064. cb(Kcur, "Kcur", il);
  5065. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  5066. Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
  5067. cb(Vcur, "Vcur", il);
  5068. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  5069. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  5070. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  5071. Qcur = ggml_rope_ext(
  5072. ctx0, Qcur, inp_pos, nullptr,
  5073. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  5074. ext_factor, attn_factor, beta_fast, beta_slow
  5075. );
  5076. Kcur = ggml_rope_ext(
  5077. ctx0, Kcur, inp_pos, nullptr,
  5078. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  5079. ext_factor, attn_factor, beta_fast, beta_slow
  5080. );
  5081. cb(Qcur, "Qcur", il);
  5082. cb(Kcur, "Kcur", il);
  5083. cb(Vcur, "Vcur", il);
  5084. cur = build_attn(inp_attn, gf,
  5085. model.layers[il].wo, model.layers[il].bo,
  5086. Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  5087. }
  5088. if (il == n_layer - 1) {
  5089. // skip computing output for unused tokens
  5090. ggml_tensor * inp_out_ids = build_inp_out_ids();
  5091. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  5092. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  5093. }
  5094. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  5095. cb(ffn_inp, "ffn_inp", il);
  5096. // feed-forward network
  5097. cur = build_norm(ffn_inp,
  5098. model.layers[il].ffn_norm, NULL,
  5099. LLM_NORM_RMS, il);
  5100. cb(cur, "ffn_norm", il);
  5101. cur = build_ffn(cur,
  5102. model.layers[il].ffn_up, NULL, NULL,
  5103. model.layers[il].ffn_gate, NULL, NULL,
  5104. model.layers[il].ffn_down, NULL, NULL,
  5105. NULL,
  5106. LLM_FFN_SILU, LLM_FFN_PAR, il);
  5107. cb(cur, "ffn_out", il);
  5108. cur = ggml_add(ctx0, cur, ffn_inp);
  5109. cur = build_cvec(cur, il);
  5110. cb(cur, "l_out", il);
  5111. // input for next layer
  5112. inpL = cur;
  5113. }
  5114. cur = inpL;
  5115. cur = build_norm(cur,
  5116. model.output_norm, NULL,
  5117. LLM_NORM_RMS, -1);
  5118. cb(cur, "result_norm", -1);
  5119. res->t_embd = cur;
  5120. // lm_head
  5121. cur = build_lora_mm(model.output, cur);
  5122. cb(cur, "result_output", -1);
  5123. res->t_logits = cur;
  5124. ggml_build_forward_expand(gf, cur);
  5125. }
  5126. };
  5127. struct llm_build_qwen2vl : public llm_graph_context {
  5128. llm_build_qwen2vl(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
  5129. const int64_t n_embd_head = hparams.n_embd_head_v;
  5130. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  5131. GGML_ASSERT(n_embd_head == hparams.n_rot);
  5132. ggml_tensor * cur;
  5133. ggml_tensor * inpL;
  5134. inpL = build_inp_embd(model.tok_embd);
  5135. // inp_pos - contains the positions
  5136. ggml_tensor * inp_pos = build_inp_pos();
  5137. auto * inp_attn = build_attn_inp_kv_unified();
  5138. int sections[4];
  5139. std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections);
  5140. for (int il = 0; il < n_layer; ++il) {
  5141. ggml_tensor * inpSA = inpL;
  5142. // norm
  5143. cur = build_norm(inpL,
  5144. model.layers[il].attn_norm, NULL,
  5145. LLM_NORM_RMS, il);
  5146. cb(cur, "attn_norm", il);
  5147. // self-attention
  5148. {
  5149. // compute Q and K and RoPE them
  5150. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  5151. Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
  5152. cb(Qcur, "Qcur", il);
  5153. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  5154. Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
  5155. cb(Kcur, "Kcur", il);
  5156. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  5157. Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
  5158. cb(Vcur, "Vcur", il);
  5159. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  5160. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  5161. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  5162. Qcur = ggml_rope_multi(
  5163. ctx0, Qcur, inp_pos, nullptr,
  5164. n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
  5165. ext_factor, attn_factor, beta_fast, beta_slow
  5166. );
  5167. Kcur = ggml_rope_multi(
  5168. ctx0, Kcur, inp_pos, nullptr,
  5169. n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
  5170. ext_factor, attn_factor, beta_fast, beta_slow
  5171. );
  5172. cb(Qcur, "Qcur", il);
  5173. cb(Kcur, "Kcur", il);
  5174. cb(Vcur, "Vcur", il);
  5175. cur = build_attn(inp_attn, gf,
  5176. model.layers[il].wo, model.layers[il].bo,
  5177. Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  5178. }
  5179. if (il == n_layer - 1) {
  5180. // skip computing output for unused tokens
  5181. ggml_tensor * inp_out_ids = build_inp_out_ids();
  5182. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  5183. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  5184. }
  5185. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  5186. cb(ffn_inp, "ffn_inp", il);
  5187. // feed-forward network
  5188. cur = build_norm(ffn_inp,
  5189. model.layers[il].ffn_norm, NULL,
  5190. LLM_NORM_RMS, il);
  5191. cb(cur, "ffn_norm", il);
  5192. cur = build_ffn(cur,
  5193. model.layers[il].ffn_up, NULL, NULL,
  5194. model.layers[il].ffn_gate, NULL, NULL,
  5195. model.layers[il].ffn_down, NULL, NULL,
  5196. NULL,
  5197. LLM_FFN_SILU, LLM_FFN_PAR, il);
  5198. cb(cur, "ffn_out", il);
  5199. cur = ggml_add(ctx0, cur, ffn_inp);
  5200. cur = build_cvec(cur, il);
  5201. cb(cur, "l_out", il);
  5202. // input for next layer
  5203. inpL = cur;
  5204. }
  5205. cur = inpL;
  5206. cur = build_norm(cur,
  5207. model.output_norm, NULL,
  5208. LLM_NORM_RMS, -1);
  5209. cb(cur, "result_norm", -1);
  5210. res->t_embd = cur;
  5211. // lm_head
  5212. cur = build_lora_mm(model.output, cur);
  5213. cb(cur, "result_output", -1);
  5214. res->t_logits = cur;
  5215. ggml_build_forward_expand(gf, cur);
  5216. }
  5217. };
  5218. struct llm_build_qwen2moe : public llm_graph_context {
  5219. llm_build_qwen2moe(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
  5220. const int64_t n_embd_head = hparams.n_embd_head_v;
  5221. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  5222. GGML_ASSERT(n_embd_head == hparams.n_rot);
  5223. ggml_tensor * cur;
  5224. ggml_tensor * inpL;
  5225. inpL = build_inp_embd(model.tok_embd);
  5226. // inp_pos - contains the positions
  5227. ggml_tensor * inp_pos = build_inp_pos();
  5228. auto * inp_attn = build_attn_inp_kv_unified();
  5229. for (int il = 0; il < n_layer; ++il) {
  5230. ggml_tensor * inpSA = inpL;
  5231. // norm
  5232. cur = build_norm(inpL,
  5233. model.layers[il].attn_norm, NULL,
  5234. LLM_NORM_RMS, il);
  5235. cb(cur, "attn_norm", il);
  5236. // self_attention
  5237. {
  5238. // compute Q and K and RoPE them
  5239. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  5240. cb(Qcur, "Qcur", il);
  5241. if (model.layers[il].bq) {
  5242. Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
  5243. cb(Qcur, "Qcur", il);
  5244. }
  5245. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  5246. cb(Kcur, "Kcur", il);
  5247. if (model.layers[il].bk) {
  5248. Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
  5249. cb(Kcur, "Kcur", il);
  5250. }
  5251. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  5252. cb(Vcur, "Vcur", il);
  5253. if (model.layers[il].bv) {
  5254. Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
  5255. cb(Vcur, "Vcur", il);
  5256. }
  5257. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  5258. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  5259. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  5260. Qcur = ggml_rope_ext(
  5261. ctx0, Qcur, inp_pos, nullptr,
  5262. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  5263. ext_factor, attn_factor, beta_fast, beta_slow
  5264. );
  5265. Kcur = ggml_rope_ext(
  5266. ctx0, Kcur, inp_pos, nullptr,
  5267. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  5268. ext_factor, attn_factor, beta_fast, beta_slow
  5269. );
  5270. cb(Qcur, "Qcur", il);
  5271. cb(Kcur, "Kcur", il);
  5272. cb(Vcur, "Vcur", il);
  5273. cur = build_attn(inp_attn, gf,
  5274. model.layers[il].wo, model.layers[il].bo,
  5275. Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  5276. }
  5277. if (il == n_layer - 1) {
  5278. // skip computing output for unused tokens
  5279. ggml_tensor * inp_out_ids = build_inp_out_ids();
  5280. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  5281. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  5282. }
  5283. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  5284. cb(ffn_inp, "ffn_inp", il);
  5285. // MoE branch
  5286. cur = build_norm(ffn_inp,
  5287. model.layers[il].ffn_norm, NULL,
  5288. LLM_NORM_RMS, il);
  5289. cb(cur, "ffn_norm", il);
  5290. ggml_tensor * moe_out =
  5291. build_moe_ffn(cur,
  5292. model.layers[il].ffn_gate_inp,
  5293. model.layers[il].ffn_up_exps,
  5294. model.layers[il].ffn_gate_exps,
  5295. model.layers[il].ffn_down_exps,
  5296. nullptr,
  5297. n_expert, n_expert_used,
  5298. LLM_FFN_SILU, false,
  5299. false, 0.0,
  5300. LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
  5301. il);
  5302. cb(moe_out, "ffn_moe_out", il);
  5303. // FFN shared expert
  5304. {
  5305. ggml_tensor * cur_gate_inp = build_lora_mm(model.layers[il].ffn_gate_inp_shexp, cur);
  5306. cb(cur_gate_inp, "ffn_shexp_gate_inp", il);
  5307. // sigmoid
  5308. ggml_tensor * cur_gate = ggml_div(ctx0, ggml_silu(ctx0, cur_gate_inp), cur_gate_inp);
  5309. cb(cur_gate, "ffn_shexp_gate", il);
  5310. ggml_tensor * cur_ffn = build_ffn(cur,
  5311. model.layers[il].ffn_up_shexp, NULL, NULL,
  5312. model.layers[il].ffn_gate_shexp, NULL, NULL,
  5313. model.layers[il].ffn_down_shexp, NULL, NULL,
  5314. NULL,
  5315. LLM_FFN_SILU, LLM_FFN_PAR, il);
  5316. cb(cur_ffn, "ffn_shexp", il);
  5317. ggml_tensor * ffn_shexp_out = ggml_mul(ctx0, cur_ffn, cur_gate);
  5318. cb(ffn_shexp_out, "ffn_shexp_out", il);
  5319. moe_out = ggml_add(ctx0, moe_out, ffn_shexp_out);
  5320. cb(moe_out, "ffn_out", il);
  5321. cur = moe_out;
  5322. }
  5323. cur = ggml_add(ctx0, cur, ffn_inp);
  5324. cur = build_cvec(cur, il);
  5325. cb(cur, "l_out", il);
  5326. // input for next layer
  5327. inpL = cur;
  5328. }
  5329. cur = inpL;
  5330. cur = build_norm(cur,
  5331. model.output_norm, NULL,
  5332. LLM_NORM_RMS, -1);
  5333. cb(cur, "result_norm", -1);
  5334. res->t_embd = cur;
  5335. // lm_head
  5336. cur = build_lora_mm(model.output, cur);
  5337. cb(cur, "result_output", -1);
  5338. res->t_logits = cur;
  5339. ggml_build_forward_expand(gf, cur);
  5340. }
  5341. };
  5342. struct llm_build_phi2 : public llm_graph_context {
  5343. llm_build_phi2(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
  5344. const int64_t n_embd_head = hparams.n_embd_head_v;
  5345. const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
  5346. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  5347. ggml_tensor * cur;
  5348. ggml_tensor * attn_norm_output;
  5349. ggml_tensor * ffn_output;
  5350. ggml_tensor * inpL;
  5351. inpL = build_inp_embd(model.tok_embd);
  5352. // inp_pos - contains the positions
  5353. ggml_tensor * inp_pos = build_inp_pos();
  5354. auto * inp_attn = build_attn_inp_kv_unified();
  5355. for (int il = 0; il < n_layer; ++il) {
  5356. attn_norm_output = build_norm(inpL,
  5357. model.layers[il].attn_norm,
  5358. model.layers[il].attn_norm_b,
  5359. LLM_NORM, il);
  5360. cb(attn_norm_output, "attn_norm", il);
  5361. // self-attention
  5362. {
  5363. ggml_tensor * Qcur = nullptr;
  5364. ggml_tensor * Kcur = nullptr;
  5365. ggml_tensor * Vcur = nullptr;
  5366. if (model.layers[il].wqkv) {
  5367. cur = build_lora_mm(model.layers[il].wqkv, attn_norm_output);
  5368. cb(cur, "wqkv", il);
  5369. cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
  5370. cb(cur, "bqkv", il);
  5371. Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
  5372. Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
  5373. Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
  5374. } else {
  5375. Qcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wq, attn_norm_output), model.layers[il].bq);
  5376. Kcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wk, attn_norm_output), model.layers[il].bk);
  5377. Vcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wv, attn_norm_output), model.layers[il].bv);
  5378. }
  5379. cb(Qcur, "Qcur", il);
  5380. cb(Kcur, "Kcur", il);
  5381. cb(Vcur, "Vcur", il);
  5382. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  5383. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  5384. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  5385. Qcur = ggml_rope_ext(
  5386. ctx0, Qcur, inp_pos, nullptr,
  5387. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  5388. ext_factor, attn_factor, beta_fast, beta_slow
  5389. );
  5390. Kcur = ggml_rope_ext(
  5391. ctx0, Kcur, inp_pos, nullptr,
  5392. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  5393. ext_factor, attn_factor, beta_fast, beta_slow
  5394. );
  5395. cb(Qcur, "Qcur", il);
  5396. cb(Kcur, "Kcur", il);
  5397. cb(Vcur, "Vcur", il);
  5398. // with phi2, we scale the Q to avoid precision issues
  5399. // ref: https://github.com/ml-explore/mlx-examples/blob/08e862336ade809bc37d1035f94b359e7d1a5152/phi2/phi2.py#L64-L66
  5400. Qcur = ggml_scale(ctx0, Qcur, 1.0f/sqrtf(float(n_embd_head)));
  5401. cur = build_attn(inp_attn, gf,
  5402. model.layers[il].wo, model.layers[il].bo,
  5403. Qcur, Kcur, Vcur, nullptr, 1.0f, il);
  5404. }
  5405. if (il == n_layer - 1) {
  5406. // skip computing output for unused tokens
  5407. ggml_tensor * inp_out_ids = build_inp_out_ids();
  5408. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  5409. inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
  5410. attn_norm_output = ggml_get_rows(ctx0, attn_norm_output, inp_out_ids);
  5411. }
  5412. // FF
  5413. {
  5414. ffn_output = build_ffn(attn_norm_output,
  5415. model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
  5416. NULL, NULL, NULL,
  5417. model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
  5418. NULL,
  5419. LLM_FFN_GELU, LLM_FFN_SEQ, il);
  5420. cb(ffn_output, "ffn_out", il);
  5421. }
  5422. cur = ggml_add(ctx0, cur, ffn_output);
  5423. cur = ggml_add(ctx0, cur, inpL);
  5424. cur = build_cvec(cur, il);
  5425. cb(cur, "l_out", il);
  5426. // input for next layer
  5427. inpL = cur;
  5428. }
  5429. cur = build_norm(inpL,
  5430. model.output_norm,
  5431. model.output_norm_b,
  5432. LLM_NORM, -1);
  5433. cb(cur, "result_norm", -1);
  5434. res->t_embd = cur;
  5435. cur = build_lora_mm(model.output, cur);
  5436. cb(cur, "result_output_no_bias", -1);
  5437. cur = ggml_add(ctx0, cur, model.output_b);
  5438. cb(cur, "result_output", -1);
  5439. res->t_logits = cur;
  5440. ggml_build_forward_expand(gf, cur);
  5441. }
  5442. };
  5443. struct llm_build_phi3 : public llm_graph_context {
  5444. llm_build_phi3(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
  5445. const int64_t n_embd_head = hparams.n_embd_head_v;
  5446. const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
  5447. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  5448. ggml_tensor * cur;
  5449. ggml_tensor * inpL;
  5450. inpL = build_inp_embd(model.tok_embd);
  5451. // inp_pos - contains the positions
  5452. ggml_tensor * inp_pos = build_inp_pos();
  5453. auto * inp_attn = build_attn_inp_kv_unified();
  5454. for (int il = 0; il < n_layer; ++il) {
  5455. auto * residual = inpL;
  5456. // self-attention
  5457. {
  5458. // rope freq factors for 128k context
  5459. ggml_tensor * rope_factors = static_cast<const llama_kv_cache_unified *>(memory)->cbs.get_rope_factors(n_ctx_per_seq, il);
  5460. ggml_tensor* attn_norm_output = build_norm(inpL,
  5461. model.layers[il].attn_norm,
  5462. model.layers[il].attn_norm_b,
  5463. LLM_NORM_RMS, il);
  5464. cb(attn_norm_output, "attn_norm", il);
  5465. ggml_tensor * Qcur = nullptr;
  5466. ggml_tensor * Kcur = nullptr;
  5467. ggml_tensor * Vcur = nullptr;
  5468. if (model.layers[il].wqkv) {
  5469. cur = build_lora_mm(model.layers[il].wqkv, attn_norm_output);
  5470. cb(cur, "wqkv", il);
  5471. Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0 * sizeof(float) * (n_embd)));
  5472. Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1 * sizeof(float) * (n_embd)));
  5473. Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1 * sizeof(float) * (n_embd + n_embd_gqa)));
  5474. } else {
  5475. Qcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wq, attn_norm_output), model.layers[il].bq);
  5476. Kcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wk, attn_norm_output), model.layers[il].bk);
  5477. Vcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wv, attn_norm_output), model.layers[il].bv);
  5478. }
  5479. cb(Qcur, "Qcur", il);
  5480. cb(Kcur, "Kcur", il);
  5481. cb(Vcur, "Vcur", il);
  5482. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  5483. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  5484. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  5485. Qcur = ggml_rope_ext(
  5486. ctx0, Qcur, inp_pos, rope_factors,
  5487. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  5488. ext_factor, attn_factor, beta_fast, beta_slow
  5489. );
  5490. Kcur = ggml_rope_ext(
  5491. ctx0, Kcur, inp_pos, rope_factors,
  5492. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  5493. ext_factor, attn_factor, beta_fast, beta_slow
  5494. );
  5495. cb(Qcur, "Qcur", il);
  5496. cb(Kcur, "Kcur", il);
  5497. cb(Vcur, "Vcur", il);
  5498. Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head)));
  5499. cb(Qcur, "Qcur", il);
  5500. cur = build_attn(inp_attn, gf,
  5501. model.layers[il].wo, model.layers[il].bo,
  5502. Qcur, Kcur, Vcur, nullptr, 1.0f, il);
  5503. }
  5504. if (il == n_layer - 1) {
  5505. // skip computing output for unused tokens
  5506. ggml_tensor* inp_out_ids = build_inp_out_ids();
  5507. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  5508. residual = ggml_get_rows(ctx0, residual, inp_out_ids);
  5509. }
  5510. cur = ggml_add(ctx0, cur, residual);
  5511. residual = cur;
  5512. cur = build_norm(cur,
  5513. model.layers[il].ffn_norm, model.layers[il].ffn_norm_b,
  5514. LLM_NORM_RMS, il);
  5515. cb(cur, "ffn_norm", il);
  5516. // feed-forward network
  5517. if (model.layers[il].ffn_gate_inp == nullptr) {
  5518. cur = build_ffn(cur,
  5519. model.layers[il].ffn_up, NULL, NULL,
  5520. NULL, NULL, NULL,
  5521. model.layers[il].ffn_down, NULL, NULL,
  5522. NULL,
  5523. LLM_FFN_SWIGLU, LLM_FFN_SEQ, il);
  5524. cb(cur, "ffn_out", il);
  5525. } else {
  5526. // MoE branch
  5527. cur = build_moe_ffn(cur,
  5528. model.layers[il].ffn_gate_inp,
  5529. model.layers[il].ffn_up_exps,
  5530. model.layers[il].ffn_gate_exps,
  5531. model.layers[il].ffn_down_exps,
  5532. nullptr,
  5533. n_expert, n_expert_used,
  5534. LLM_FFN_SILU, true,
  5535. false, 0.0,
  5536. LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
  5537. il);
  5538. cb(cur, "ffn_moe_out", il);
  5539. }
  5540. cur = ggml_add(ctx0, residual, cur);
  5541. cur = build_cvec(cur, il);
  5542. cb(cur, "l_out", il);
  5543. // input for next layer
  5544. inpL = cur;
  5545. }
  5546. cur = build_norm(inpL,
  5547. model.output_norm,
  5548. model.output_norm_b,
  5549. LLM_NORM_RMS, -1);
  5550. cb(cur, "result_norm", -1);
  5551. res->t_embd = cur;
  5552. cur = build_lora_mm(model.output, cur);
  5553. if (model.output_b != nullptr) {
  5554. cb(cur, "result_output_no_bias", -1);
  5555. cur = ggml_add(ctx0, cur, model.output_b);
  5556. }
  5557. cb(cur, "result_output", -1);
  5558. res->t_logits = cur;
  5559. ggml_build_forward_expand(gf, cur);
  5560. }
  5561. };
  5562. struct llm_build_plamo : public llm_graph_context {
  5563. llm_build_plamo(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
  5564. const int64_t n_embd_head = hparams.n_embd_head_v;
  5565. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  5566. GGML_ASSERT(n_embd_head == hparams.n_rot);
  5567. ggml_tensor * cur;
  5568. ggml_tensor * inpL;
  5569. inpL = build_inp_embd(model.tok_embd);
  5570. // inp_pos - contains the positions
  5571. ggml_tensor * inp_pos = build_inp_pos();
  5572. auto * inp_attn = build_attn_inp_kv_unified();
  5573. for (int il = 0; il < n_layer; ++il) {
  5574. // norm
  5575. cur = build_norm(inpL,
  5576. model.layers[il].attn_norm, NULL,
  5577. LLM_NORM_RMS, il);
  5578. cb(cur, "attn_norm", il);
  5579. ggml_tensor * attention_norm = cur;
  5580. // self-attention
  5581. {
  5582. // compute Q and K and RoPE them
  5583. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  5584. cb(Qcur, "Qcur", il);
  5585. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  5586. cb(Kcur, "Kcur", il);
  5587. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  5588. cb(Vcur, "Vcur", il);
  5589. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  5590. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  5591. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  5592. Qcur = ggml_rope_ext(
  5593. ctx0, Qcur, inp_pos, nullptr,
  5594. n_embd_head, rope_type, n_ctx_orig, freq_base, freq_scale,
  5595. ext_factor, attn_factor, beta_fast, beta_slow
  5596. );
  5597. Kcur = ggml_rope_ext(
  5598. ctx0, Kcur, inp_pos, nullptr,
  5599. n_embd_head, rope_type, n_ctx_orig, freq_base, freq_scale,
  5600. ext_factor, attn_factor, beta_fast, beta_slow
  5601. );
  5602. cb(Qcur, "Qcur", il);
  5603. cb(Kcur, "Kcur", il);
  5604. cb(Vcur, "Vcur", il);
  5605. cur = build_attn(inp_attn, gf,
  5606. model.layers[il].wo, NULL,
  5607. Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  5608. }
  5609. ggml_tensor * sa_out = cur;
  5610. cur = attention_norm;
  5611. if (il == n_layer - 1) {
  5612. // skip computing output for unused tokens
  5613. ggml_tensor * inp_out_ids = build_inp_out_ids();
  5614. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  5615. sa_out = ggml_get_rows(ctx0, sa_out, inp_out_ids);
  5616. inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
  5617. }
  5618. // feed-forward network
  5619. {
  5620. cur = build_ffn(cur,
  5621. model.layers[il].ffn_up, NULL, NULL,
  5622. model.layers[il].ffn_gate, NULL, NULL,
  5623. model.layers[il].ffn_down, NULL, NULL,
  5624. NULL,
  5625. LLM_FFN_SILU, LLM_FFN_PAR, il);
  5626. cb(cur, "ffn_out", il);
  5627. }
  5628. cur = ggml_add(ctx0, cur, sa_out);
  5629. cur = ggml_add(ctx0, cur, inpL);
  5630. cur = build_cvec(cur, il);
  5631. cb(cur, "l_out", il);
  5632. // input for next layer
  5633. inpL = cur;
  5634. }
  5635. cur = inpL;
  5636. cur = build_norm(cur,
  5637. model.output_norm, NULL,
  5638. LLM_NORM_RMS, -1);
  5639. cb(cur, "result_norm", -1);
  5640. res->t_embd = cur;
  5641. // lm_head
  5642. cur = build_lora_mm(model.output, cur);
  5643. cb(cur, "result_output", -1);
  5644. res->t_logits = cur;
  5645. ggml_build_forward_expand(gf, cur);
  5646. }
  5647. };
  5648. struct llm_build_gpt2 : public llm_graph_context {
  5649. llm_build_gpt2(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
  5650. const int64_t n_embd_head = hparams.n_embd_head_v;
  5651. const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
  5652. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  5653. ggml_tensor * cur;
  5654. ggml_tensor * pos;
  5655. ggml_tensor * inpL;
  5656. inpL = build_inp_embd(model.tok_embd);
  5657. // inp_pos - contains the positions
  5658. ggml_tensor * inp_pos = build_inp_pos();
  5659. auto * inp_attn = build_attn_inp_kv_unified();
  5660. pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos);
  5661. cb(pos, "pos_embd", -1);
  5662. inpL = ggml_add(ctx0, inpL, pos);
  5663. cb(inpL, "inpL", -1);
  5664. for (int il = 0; il < n_layer; ++il) {
  5665. cur = build_norm(inpL,
  5666. model.layers[il].attn_norm,
  5667. model.layers[il].attn_norm_b,
  5668. LLM_NORM, il);
  5669. cb(cur, "attn_norm", il);
  5670. // self-attention
  5671. {
  5672. cur = build_lora_mm(model.layers[il].wqkv, cur);
  5673. cb(cur, "wqkv", il);
  5674. cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
  5675. cb(cur, "bqkv", il);
  5676. ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
  5677. ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
  5678. ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
  5679. cb(Qcur, "Qcur", il);
  5680. cb(Kcur, "Kcur", il);
  5681. cb(Vcur, "Vcur", il);
  5682. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  5683. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  5684. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  5685. cur = build_attn(inp_attn, gf,
  5686. model.layers[il].wo, model.layers[il].bo,
  5687. Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  5688. }
  5689. if (il == n_layer - 1) {
  5690. // skip computing output for unused tokens
  5691. ggml_tensor * inp_out_ids = build_inp_out_ids();
  5692. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  5693. inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
  5694. }
  5695. // add the input
  5696. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
  5697. cb(ffn_inp, "ffn_inp", il);
  5698. // FF
  5699. {
  5700. cur = build_norm(ffn_inp,
  5701. model.layers[il].ffn_norm,
  5702. model.layers[il].ffn_norm_b,
  5703. LLM_NORM, il);
  5704. cb(cur, "ffn_norm", il);
  5705. cur = build_ffn(cur,
  5706. model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
  5707. NULL, NULL, NULL,
  5708. model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
  5709. NULL,
  5710. LLM_FFN_GELU, LLM_FFN_SEQ, il);
  5711. cb(cur, "ffn_out", il);
  5712. }
  5713. cur = ggml_add(ctx0, cur, ffn_inp);
  5714. cur = build_cvec(cur, il);
  5715. cb(cur, "l_out", il);
  5716. // input for next layer
  5717. inpL = cur;
  5718. }
  5719. cur = build_norm(inpL,
  5720. model.output_norm,
  5721. model.output_norm_b,
  5722. LLM_NORM, -1);
  5723. cb(cur, "result_norm", -1);
  5724. res->t_embd = cur;
  5725. cur = build_lora_mm(model.output, cur);
  5726. cb(cur, "result_output", -1);
  5727. res->t_logits = cur;
  5728. ggml_build_forward_expand(gf, cur);
  5729. }
  5730. };
  5731. struct llm_build_codeshell : public llm_graph_context {
  5732. llm_build_codeshell(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
  5733. const int64_t n_embd_head = hparams.n_embd_head_v;
  5734. const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
  5735. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  5736. GGML_ASSERT(n_embd_head == hparams.n_rot);
  5737. ggml_tensor * cur;
  5738. ggml_tensor * inpL;
  5739. inpL = build_inp_embd(model.tok_embd);
  5740. // inp_pos - contains the positions
  5741. ggml_tensor * inp_pos = build_inp_pos();
  5742. auto * inp_attn = build_attn_inp_kv_unified();
  5743. for (int il = 0; il < n_layer; ++il) {
  5744. cur = build_norm(inpL,
  5745. model.layers[il].attn_norm,
  5746. model.layers[il].attn_norm_b,
  5747. LLM_NORM, il);
  5748. cb(cur, "attn_norm", il);
  5749. // self-attention
  5750. {
  5751. cur = build_lora_mm(model.layers[il].wqkv, cur);
  5752. cb(cur, "wqkv", il);
  5753. cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
  5754. cb(cur, "bqkv", il);
  5755. ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
  5756. ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
  5757. ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
  5758. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  5759. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  5760. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  5761. Qcur = ggml_rope_ext(
  5762. ctx0, Qcur, inp_pos, nullptr,
  5763. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  5764. ext_factor, attn_factor, beta_fast, beta_slow
  5765. );
  5766. Kcur = ggml_rope_ext(
  5767. ctx0, Kcur, inp_pos, nullptr,
  5768. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  5769. ext_factor, attn_factor, beta_fast, beta_slow
  5770. );
  5771. cb(Qcur, "Qcur", il);
  5772. cb(Kcur, "Kcur", il);
  5773. cb(Vcur, "Vcur", il);
  5774. cur = build_attn(inp_attn, gf,
  5775. model.layers[il].wo, model.layers[il].bo,
  5776. Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  5777. }
  5778. if (il == n_layer - 1) {
  5779. // skip computing output for unused tokens
  5780. ggml_tensor * inp_out_ids = build_inp_out_ids();
  5781. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  5782. inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
  5783. }
  5784. // add the input
  5785. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
  5786. cb(ffn_inp, "ffn_inp", il);
  5787. // FF
  5788. {
  5789. cur = build_norm(ffn_inp,
  5790. model.layers[il].ffn_norm,
  5791. model.layers[il].ffn_norm_b,
  5792. LLM_NORM, il);
  5793. cb(cur, "ffn_norm", il);
  5794. cur = build_ffn(cur,
  5795. model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
  5796. NULL, NULL, NULL,
  5797. model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
  5798. NULL,
  5799. LLM_FFN_GELU, LLM_FFN_SEQ, il);
  5800. cb(cur, "ffn_out", il);
  5801. }
  5802. cur = ggml_add(ctx0, cur, ffn_inp);
  5803. cur = build_cvec(cur, il);
  5804. cb(cur, "l_out", il);
  5805. // input for next layer
  5806. inpL = cur;
  5807. }
  5808. cur = build_norm(inpL,
  5809. model.output_norm,
  5810. model.output_norm_b,
  5811. LLM_NORM, -1);
  5812. cb(cur, "result_norm", -1);
  5813. res->t_embd = cur;
  5814. cur = build_lora_mm(model.output, cur);
  5815. cb(cur, "result_output", -1);
  5816. res->t_logits = cur;
  5817. ggml_build_forward_expand(gf, cur);
  5818. }
  5819. };
  5820. struct llm_build_orion : public llm_graph_context {
  5821. llm_build_orion(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
  5822. const int64_t n_embd_head = hparams.n_embd_head_v;
  5823. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  5824. GGML_ASSERT(n_embd_head == hparams.n_rot);
  5825. ggml_tensor * cur;
  5826. ggml_tensor * inpL;
  5827. inpL = build_inp_embd(model.tok_embd);
  5828. // inp_pos - contains the positions
  5829. ggml_tensor * inp_pos = build_inp_pos();
  5830. auto * inp_attn = build_attn_inp_kv_unified();
  5831. for (int il = 0; il < n_layer; ++il) {
  5832. ggml_tensor * inpSA = inpL;
  5833. // norm
  5834. cur = build_norm(inpL,
  5835. model.layers[il].attn_norm, model.layers[il].attn_norm_b,
  5836. LLM_NORM, il);
  5837. cb(cur, "attn_norm", il);
  5838. // self-attention
  5839. {
  5840. // compute Q and K and RoPE them
  5841. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  5842. cb(Qcur, "Qcur", il);
  5843. // if (model.layers[il].bq) {
  5844. // Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
  5845. // cb(Qcur, "Qcur", il);
  5846. // }
  5847. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  5848. cb(Kcur, "Kcur", il);
  5849. // if (model.layers[il].bk) {
  5850. // Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
  5851. // cb(Kcur, "Kcur", il);
  5852. // }
  5853. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  5854. cb(Vcur, "Vcur", il);
  5855. // if (model.layers[il].bv) {
  5856. // Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
  5857. // cb(Vcur, "Vcur", il);
  5858. // }
  5859. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  5860. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  5861. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  5862. Qcur = ggml_rope_ext(
  5863. ctx0, Qcur, inp_pos, nullptr,
  5864. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  5865. ext_factor, attn_factor, beta_fast, beta_slow
  5866. );
  5867. Kcur = ggml_rope_ext(
  5868. ctx0, Kcur, inp_pos, nullptr,
  5869. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  5870. ext_factor, attn_factor, beta_fast, beta_slow
  5871. );
  5872. cb(Qcur, "Qcur", il);
  5873. cb(Kcur, "Kcur", il);
  5874. cb(Vcur, "Vcur", il);
  5875. cur = build_attn(inp_attn, gf,
  5876. model.layers[il].wo, NULL,
  5877. Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  5878. }
  5879. if (il == n_layer - 1) {
  5880. // skip computing output for unused tokens
  5881. ggml_tensor * inp_out_ids = build_inp_out_ids();
  5882. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  5883. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  5884. }
  5885. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  5886. cb(ffn_inp, "ffn_inp", il);
  5887. // feed-forward network
  5888. cur = build_norm(ffn_inp,
  5889. model.layers[il].ffn_norm, model.layers[il].ffn_norm_b,
  5890. LLM_NORM, il);
  5891. cb(cur, "ffn_norm", il);
  5892. cur = build_ffn(cur,
  5893. model.layers[il].ffn_up, NULL, NULL,
  5894. model.layers[il].ffn_gate, NULL, NULL,
  5895. model.layers[il].ffn_down, NULL, NULL,
  5896. NULL,
  5897. LLM_FFN_SILU, LLM_FFN_PAR, il);
  5898. cb(cur, "ffn_out", il);
  5899. cur = ggml_add(ctx0, cur, ffn_inp);
  5900. cur = build_cvec(cur, il);
  5901. cb(cur, "l_out", il);
  5902. // input for next layer
  5903. inpL = cur;
  5904. }
  5905. cur = inpL;
  5906. cur = build_norm(cur,
  5907. model.output_norm, model.output_norm_b,
  5908. LLM_NORM, -1);
  5909. cb(cur, "result_norm", -1);
  5910. res->t_embd = cur;
  5911. // lm_head
  5912. cur = build_lora_mm(model.output, cur);
  5913. cb(cur, "result_output", -1);
  5914. res->t_logits = cur;
  5915. ggml_build_forward_expand(gf, cur);
  5916. }
  5917. };
  5918. struct llm_build_internlm2 : public llm_graph_context {
  5919. llm_build_internlm2(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
  5920. const int64_t n_embd_head = hparams.n_embd_head_v;
  5921. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  5922. GGML_ASSERT(n_embd_head == hparams.n_rot);
  5923. ggml_tensor * cur;
  5924. ggml_tensor * inpL;
  5925. inpL = build_inp_embd(model.tok_embd);
  5926. // inp_pos - contains the positions
  5927. ggml_tensor * inp_pos = build_inp_pos();
  5928. auto * inp_attn = build_attn_inp_kv_unified();
  5929. for (int il = 0; il < n_layer; ++il) {
  5930. ggml_tensor * inpSA = inpL;
  5931. // norm
  5932. cur = build_norm(inpL,
  5933. model.layers[il].attn_norm, NULL,
  5934. LLM_NORM_RMS, il);
  5935. cb(cur, "attn_norm", il);
  5936. // self-attention
  5937. {
  5938. // compute Q and K and RoPE them
  5939. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  5940. cb(Qcur, "Qcur", il);
  5941. if (model.layers[il].bq) {
  5942. Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
  5943. cb(Qcur, "Qcur", il);
  5944. }
  5945. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  5946. cb(Kcur, "Kcur", il);
  5947. if (model.layers[il].bk) {
  5948. Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
  5949. cb(Kcur, "Kcur", il);
  5950. }
  5951. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  5952. cb(Vcur, "Vcur", il);
  5953. if (model.layers[il].bv) {
  5954. Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
  5955. cb(Vcur, "Vcur", il);
  5956. }
  5957. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  5958. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  5959. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  5960. Qcur = ggml_rope_ext(
  5961. ctx0, Qcur, inp_pos, nullptr,
  5962. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  5963. ext_factor, attn_factor, beta_fast, beta_slow
  5964. );
  5965. Kcur = ggml_rope_ext(
  5966. ctx0, Kcur, inp_pos, nullptr,
  5967. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  5968. ext_factor, attn_factor, beta_fast, beta_slow
  5969. );
  5970. cb(Qcur, "Qcur", il);
  5971. cb(Kcur, "Kcur", il);
  5972. cb(Vcur, "Vcur", il);
  5973. cur = build_attn(inp_attn, gf,
  5974. model.layers[il].wo, model.layers[il].bo,
  5975. Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  5976. }
  5977. if (il == n_layer - 1) {
  5978. // skip computing output for unused tokens
  5979. ggml_tensor * inp_out_ids = build_inp_out_ids();
  5980. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  5981. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  5982. }
  5983. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  5984. cb(ffn_inp, "ffn_inp", il);
  5985. // feed-forward network
  5986. cur = build_norm(ffn_inp,
  5987. model.layers[il].ffn_norm, NULL,
  5988. LLM_NORM_RMS, il);
  5989. cb(cur, "ffn_norm", il);
  5990. cur = build_ffn(cur,
  5991. model.layers[il].ffn_up, NULL, NULL,
  5992. model.layers[il].ffn_gate, NULL, NULL,
  5993. model.layers[il].ffn_down, NULL, NULL,
  5994. NULL,
  5995. LLM_FFN_SILU, LLM_FFN_PAR, il);
  5996. cb(cur, "ffn_out", il);
  5997. cur = ggml_add(ctx0, cur, ffn_inp);
  5998. cur = build_cvec(cur, il);
  5999. cb(cur, "l_out", il);
  6000. // input for next layer
  6001. inpL = cur;
  6002. }
  6003. cur = inpL;
  6004. cur = build_norm(cur,
  6005. model.output_norm, NULL,
  6006. LLM_NORM_RMS, -1);
  6007. cb(cur, "result_norm", -1);
  6008. res->t_embd = cur;
  6009. // lm_head
  6010. cur = build_lora_mm(model.output, cur);
  6011. cb(cur, "result_output", -1);
  6012. res->t_logits = cur;
  6013. ggml_build_forward_expand(gf, cur);
  6014. }
  6015. };
  6016. struct llm_build_minicpm3 : public llm_graph_context {
  6017. llm_build_minicpm3(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
  6018. //TODO: if the model varies, these parameters need to be read from the model
  6019. const int64_t n_embd_base = 256;
  6020. const float scale_embd = 12.0f;
  6021. const float scale_depth = 1.4f;
  6022. const float kq_scale = 1.0f / sqrtf(float(hparams.n_embd_head_k));
  6023. const uint32_t n_embd_head_qk_rope = hparams.n_rot;
  6024. const uint32_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
  6025. const uint32_t kv_lora_rank = hparams.n_lora_kv;
  6026. ggml_tensor * cur;
  6027. ggml_tensor * inpL;
  6028. inpL = build_inp_embd(model.tok_embd);
  6029. // scale the input embeddings
  6030. inpL = ggml_scale(ctx0, inpL, scale_embd);
  6031. cb(inpL, "inp_scaled", -1);
  6032. // inp_pos - contains the positions
  6033. ggml_tensor * inp_pos = build_inp_pos();
  6034. auto * inp_attn = build_attn_inp_kv_unified();
  6035. for (int il = 0; il < n_layer; ++il) {
  6036. ggml_tensor * inpSA = inpL;
  6037. ggml_tensor * rope_factors = static_cast<const llama_kv_cache_unified *>(memory)->cbs.get_rope_factors(n_ctx_per_seq, il);
  6038. // norm
  6039. cur = build_norm(inpL,
  6040. model.layers[il].attn_norm, NULL,
  6041. LLM_NORM_RMS, il);
  6042. cb(cur, "attn_norm", il);
  6043. // self_attention
  6044. {
  6045. ggml_tensor * q = NULL;
  6046. // {n_embd, q_lora_rank} * {n_embd, n_tokens} -> {q_lora_rank, n_tokens}
  6047. q = ggml_mul_mat(ctx0, model.layers[il].wq_a, cur);
  6048. cb(q, "q", il);
  6049. q = build_norm(q,
  6050. model.layers[il].attn_q_a_norm, NULL,
  6051. LLM_NORM_RMS, il);
  6052. cb(q, "q", il);
  6053. // {q_lora_rank, n_head * hparams.n_embd_head_k} * {q_lora_rank, n_tokens} -> {n_head * hparams.n_embd_head_k, n_tokens}
  6054. q = ggml_mul_mat(ctx0, model.layers[il].wq_b, q);
  6055. cb(q, "q", il);
  6056. // split into {n_head * n_embd_head_qk_nope, n_tokens}
  6057. ggml_tensor * q_nope = ggml_view_3d(ctx0, q, n_embd_head_qk_nope, n_head, n_tokens,
  6058. ggml_row_size(q->type, hparams.n_embd_head_k),
  6059. ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
  6060. 0);
  6061. cb(q_nope, "q_nope", il);
  6062. // and {n_head * n_embd_head_qk_rope, n_tokens}
  6063. ggml_tensor * q_pe = ggml_view_3d(ctx0, q, n_embd_head_qk_rope, n_head, n_tokens,
  6064. ggml_row_size(q->type, hparams.n_embd_head_k),
  6065. ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
  6066. ggml_row_size(q->type, n_embd_head_qk_nope));
  6067. cb(q_pe, "q_pe", il);
  6068. // {n_embd, kv_lora_rank + n_embd_head_qk_rope} * {n_embd, n_tokens} -> {kv_lora_rank + n_embd_head_qk_rope, n_tokens}
  6069. ggml_tensor * kv_pe_compresseed = ggml_mul_mat(ctx0, model.layers[il].wkv_a_mqa, cur);
  6070. cb(kv_pe_compresseed, "kv_pe_compresseed", il);
  6071. // split into {kv_lora_rank, n_tokens}
  6072. ggml_tensor * kv_compressed = ggml_view_2d(ctx0, kv_pe_compresseed, kv_lora_rank, n_tokens,
  6073. kv_pe_compresseed->nb[1],
  6074. 0);
  6075. cb(kv_compressed, "kv_compressed", il);
  6076. // and {n_embd_head_qk_rope, n_tokens}
  6077. ggml_tensor * k_pe = ggml_view_3d(ctx0, kv_pe_compresseed, n_embd_head_qk_rope, 1, n_tokens,
  6078. kv_pe_compresseed->nb[1],
  6079. kv_pe_compresseed->nb[1],
  6080. ggml_row_size(kv_pe_compresseed->type, kv_lora_rank));
  6081. cb(k_pe, "k_pe", il);
  6082. // TODO: the CUDA backend used to not support non-cont. (RMS) norm, investigate removing ggml_cont
  6083. kv_compressed = ggml_cont(ctx0, kv_compressed);
  6084. kv_compressed = build_norm(kv_compressed,
  6085. model.layers[il].attn_kv_a_norm, NULL,
  6086. LLM_NORM_RMS, il);
  6087. cb(kv_compressed, "kv_compressed", il);
  6088. // {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)} * {kv_lora_rank, n_tokens} -> {n_head * (n_embd_head_qk_nope + n_embd_head_v), n_tokens}
  6089. ggml_tensor * kv = ggml_mul_mat(ctx0, model.layers[il].wkv_b, kv_compressed);
  6090. cb(kv, "kv", il);
  6091. // split into {n_head * n_embd_head_qk_nope, n_tokens}
  6092. ggml_tensor * k_nope = ggml_view_3d(ctx0, kv, n_embd_head_qk_nope, n_head, n_tokens,
  6093. ggml_row_size(kv->type, n_embd_head_qk_nope + hparams.n_embd_head_v),
  6094. ggml_row_size(kv->type, n_head * (n_embd_head_qk_nope + hparams.n_embd_head_v)),
  6095. 0);
  6096. cb(k_nope, "k_nope", il);
  6097. // and {n_head * n_embd_head_v, n_tokens}
  6098. ggml_tensor * v_states = ggml_view_3d(ctx0, kv, hparams.n_embd_head_v, n_head, n_tokens,
  6099. ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)),
  6100. ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)*n_head),
  6101. ggml_row_size(kv->type, (n_embd_head_qk_nope)));
  6102. cb(v_states, "v_states", il);
  6103. v_states = ggml_cont(ctx0, v_states);
  6104. cb(v_states, "v_states", il);
  6105. v_states = ggml_view_2d(ctx0, v_states, hparams.n_embd_head_v * n_head, n_tokens,
  6106. ggml_row_size(kv->type, hparams.n_embd_head_v * n_head),
  6107. 0);
  6108. cb(v_states, "v_states", il);
  6109. q_pe = ggml_cont(ctx0, q_pe); // TODO: the CUDA backend used to not support non-cont. RoPE, investigate removing this
  6110. q_pe = ggml_rope_ext(
  6111. ctx0, q_pe, inp_pos, rope_factors,
  6112. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  6113. ext_factor, attn_factor, beta_fast, beta_slow
  6114. );
  6115. cb(q_pe, "q_pe", il);
  6116. // shared RoPE key
  6117. k_pe = ggml_cont(ctx0, k_pe); // TODO: the CUDA backend used to not support non-cont. RoPE, investigate removing this
  6118. k_pe = ggml_rope_ext(
  6119. ctx0, k_pe, inp_pos, rope_factors,
  6120. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  6121. ext_factor, attn_factor, beta_fast, beta_slow
  6122. );
  6123. cb(k_pe, "k_pe", il);
  6124. ggml_tensor * q_states = ggml_concat(ctx0, q_nope, q_pe, 0);
  6125. cb(q_states, "q_states", il);
  6126. ggml_tensor * k_states = ggml_concat(ctx0, k_nope, ggml_repeat(ctx0, k_pe, q_pe), 0);
  6127. cb(k_states, "k_states", il);
  6128. cur = build_attn(inp_attn, gf,
  6129. model.layers[il].wo, NULL,
  6130. q_states, k_states, v_states, nullptr, kq_scale, il);
  6131. }
  6132. if (il == n_layer - 1) {
  6133. // skip computing output for unused tokens
  6134. ggml_tensor * inp_out_ids = build_inp_out_ids();
  6135. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  6136. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  6137. }
  6138. // scale_res - scale the hidden states for residual connection
  6139. const float scale_res = scale_depth/sqrtf(float(n_layer));
  6140. cur = ggml_scale(ctx0, cur, scale_res);
  6141. cb(cur, "hidden_scaled", il);
  6142. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  6143. cb(ffn_inp, "ffn_inp", il);
  6144. // feed-forward network
  6145. {
  6146. cur = build_norm(ffn_inp,
  6147. model.layers[il].ffn_norm, NULL,
  6148. LLM_NORM_RMS, il);
  6149. cb(cur, "ffn_norm", il);
  6150. cur = build_ffn(cur,
  6151. model.layers[il].ffn_up, NULL, NULL,
  6152. model.layers[il].ffn_gate, NULL, NULL,
  6153. model.layers[il].ffn_down, NULL, NULL,
  6154. NULL,
  6155. LLM_FFN_SILU, LLM_FFN_PAR, il);
  6156. cb(cur, "ffn_out", il);
  6157. }
  6158. // scale the hidden states for residual connection
  6159. cur = ggml_scale(ctx0, cur, scale_res);
  6160. cb(cur, "hidden_scaled_ffn", il);
  6161. cur = ggml_add(ctx0, cur, ffn_inp);
  6162. cur = build_cvec(cur, il);
  6163. cb(cur, "l_out", il);
  6164. // input for next layer
  6165. inpL = cur;
  6166. }
  6167. cur = inpL;
  6168. cur = build_norm(cur,
  6169. model.output_norm, NULL,
  6170. LLM_NORM_RMS, -1);
  6171. cb(cur, "result_norm", -1);
  6172. res->t_embd = cur;
  6173. // lm_head scaling
  6174. const float scale_lmhead = float(n_embd_base)/float(n_embd);
  6175. cur = ggml_scale(ctx0, cur, scale_lmhead);
  6176. cb(cur, "lmhead_scaling", -1);
  6177. // lm_head
  6178. cur = build_lora_mm(model.output, cur);
  6179. cb(cur, "result_output", -1);
  6180. res->t_logits = cur;
  6181. ggml_build_forward_expand(gf, cur);
  6182. }
  6183. };
  6184. struct llm_build_gemma : public llm_graph_context {
  6185. llm_build_gemma(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
  6186. const int64_t n_embd_head = hparams.n_embd_head_v;
  6187. ggml_tensor * cur;
  6188. ggml_tensor * inpL;
  6189. inpL = build_inp_embd(model.tok_embd);
  6190. inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd));
  6191. cb(inpL, "inp_scaled", -1);
  6192. // inp_pos - contains the positions
  6193. ggml_tensor * inp_pos = build_inp_pos();
  6194. auto * inp_attn = build_attn_inp_kv_unified();
  6195. for (int il = 0; il < n_layer; ++il) {
  6196. // norm
  6197. cur = build_norm(inpL,
  6198. model.layers[il].attn_norm, NULL,
  6199. LLM_NORM_RMS, il);
  6200. cb(cur, "attn_norm", il);
  6201. // self-attention
  6202. {
  6203. // compute Q and K and RoPE them
  6204. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  6205. cb(Qcur, "Qcur", il);
  6206. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  6207. cb(Kcur, "Kcur", il);
  6208. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  6209. cb(Vcur, "Vcur", il);
  6210. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  6211. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  6212. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  6213. Qcur = ggml_rope_ext(
  6214. ctx0, Qcur, inp_pos, nullptr,
  6215. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  6216. ext_factor, attn_factor, beta_fast, beta_slow);
  6217. Kcur = ggml_rope_ext(
  6218. ctx0, Kcur, inp_pos, nullptr,
  6219. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  6220. ext_factor, attn_factor, beta_fast, beta_slow);
  6221. cb(Qcur, "Qcur", il);
  6222. cb(Kcur, "Kcur", il);
  6223. cb(Vcur, "Vcur", il);
  6224. Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head)));
  6225. cb(Qcur, "Qcur_scaled", il);
  6226. cur = build_attn(inp_attn, gf,
  6227. model.layers[il].wo, NULL,
  6228. Qcur, Kcur, Vcur, nullptr, 1.0f, il);
  6229. }
  6230. if (il == n_layer - 1) {
  6231. // skip computing output for unused tokens
  6232. ggml_tensor * inp_out_ids = build_inp_out_ids();
  6233. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  6234. inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
  6235. }
  6236. ggml_tensor * sa_out = ggml_add(ctx0, cur, inpL);
  6237. cb(sa_out, "sa_out", il);
  6238. cur = build_norm(sa_out,
  6239. model.layers[il].ffn_norm, NULL,
  6240. LLM_NORM_RMS, il);
  6241. cb(cur, "ffn_norm", il);
  6242. // feed-forward network
  6243. {
  6244. cur = build_ffn(cur,
  6245. model.layers[il].ffn_up, NULL, NULL,
  6246. model.layers[il].ffn_gate, NULL, NULL,
  6247. model.layers[il].ffn_down, NULL, NULL,
  6248. NULL,
  6249. LLM_FFN_GELU, LLM_FFN_PAR, il);
  6250. cb(cur, "ffn_out", il);
  6251. }
  6252. cur = ggml_add(ctx0, cur, sa_out);
  6253. cur = build_cvec(cur, il);
  6254. cb(cur, "l_out", il);
  6255. // input for next layer
  6256. inpL = cur;
  6257. }
  6258. cur = inpL;
  6259. cur = build_norm(cur,
  6260. model.output_norm, NULL,
  6261. LLM_NORM_RMS, -1);
  6262. cb(cur, "result_norm", -1);
  6263. res->t_embd = cur;
  6264. // lm_head
  6265. cur = build_lora_mm(model.output, cur);
  6266. cb(cur, "result_output", -1);
  6267. res->t_logits = cur;
  6268. ggml_build_forward_expand(gf, cur);
  6269. }
  6270. };
  6271. struct llm_build_gemma2 : public llm_graph_context {
  6272. llm_build_gemma2(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
  6273. const int64_t n_embd_head = hparams.n_embd_head_k;
  6274. ggml_tensor * cur;
  6275. ggml_tensor * inpL;
  6276. inpL = build_inp_embd(model.tok_embd);
  6277. inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd));
  6278. cb(inpL, "inp_scaled", -1);
  6279. // inp_pos - contains the positions
  6280. ggml_tensor * inp_pos = build_inp_pos();
  6281. auto * inp_attn = build_attn_inp_kv_unified();
  6282. for (int il = 0; il < n_layer; ++il) {
  6283. // norm
  6284. cur = build_norm(inpL,
  6285. model.layers[il].attn_norm, NULL,
  6286. LLM_NORM_RMS, il);
  6287. cb(cur, "attn_norm", il);
  6288. // self-attention
  6289. {
  6290. // compute Q and K and RoPE them
  6291. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  6292. cb(Qcur, "Qcur", il);
  6293. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  6294. cb(Kcur, "Kcur", il);
  6295. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  6296. cb(Vcur, "Vcur", il);
  6297. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  6298. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  6299. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  6300. Qcur = ggml_rope_ext(
  6301. ctx0, Qcur, inp_pos, nullptr,
  6302. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  6303. ext_factor, attn_factor, beta_fast, beta_slow);
  6304. Kcur = ggml_rope_ext(
  6305. ctx0, Kcur, inp_pos, nullptr,
  6306. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  6307. ext_factor, attn_factor, beta_fast, beta_slow);
  6308. cb(Qcur, "Qcur", il);
  6309. cb(Kcur, "Kcur", il);
  6310. cb(Vcur, "Vcur", il);
  6311. // ref: https://github.com/google/gemma_pytorch/commit/03e657582d17cb5a8617ebf333c1c16f3694670e
  6312. switch (model.type) {
  6313. case LLM_TYPE_2B:
  6314. case LLM_TYPE_9B:
  6315. case LLM_TYPE_27B: Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head))); break;
  6316. default: GGML_ABORT("fatal error");
  6317. };
  6318. cb(Qcur, "Qcur_scaled", il);
  6319. cur = build_attn(inp_attn, gf,
  6320. model.layers[il].wo, NULL,
  6321. Qcur, Kcur, Vcur, nullptr, 1.0f, il);
  6322. }
  6323. cur = build_norm(cur,
  6324. model.layers[il].attn_post_norm, NULL,
  6325. LLM_NORM_RMS, il);
  6326. cb(cur, "attn_post_norm", il);
  6327. if (il == n_layer - 1) {
  6328. // skip computing output for unused tokens
  6329. ggml_tensor * inp_out_ids = build_inp_out_ids();
  6330. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  6331. inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
  6332. }
  6333. ggml_tensor * sa_out = ggml_add(ctx0, cur, inpL);
  6334. cb(sa_out, "sa_out", il);
  6335. cur = build_norm(sa_out,
  6336. model.layers[il].ffn_norm, NULL,
  6337. LLM_NORM_RMS, il);
  6338. cb(cur, "ffn_norm", il);
  6339. // feed-forward network
  6340. {
  6341. cur = build_ffn(cur,
  6342. model.layers[il].ffn_up, NULL, NULL,
  6343. model.layers[il].ffn_gate, NULL, NULL,
  6344. model.layers[il].ffn_down, NULL, NULL,
  6345. NULL,
  6346. LLM_FFN_GELU, LLM_FFN_PAR, il);
  6347. cb(cur, "ffn_out", il);
  6348. }
  6349. cur = build_norm(cur,
  6350. model.layers[il].ffn_post_norm, NULL,
  6351. LLM_NORM_RMS, -1);
  6352. cb(cur, "ffn_post_norm", -1);
  6353. cur = ggml_add(ctx0, cur, sa_out);
  6354. cur = build_cvec(cur, il);
  6355. cb(cur, "l_out", il);
  6356. // input for next layer
  6357. inpL = cur;
  6358. }
  6359. cur = inpL;
  6360. cur = build_norm(cur,
  6361. model.output_norm, NULL,
  6362. LLM_NORM_RMS, -1);
  6363. cb(cur, "result_norm", -1);
  6364. res->t_embd = cur;
  6365. // lm_head
  6366. cur = build_lora_mm(model.output, cur);
  6367. // final logit soft-capping
  6368. cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_final_logit_softcapping);
  6369. cur = ggml_tanh(ctx0, cur);
  6370. cur = ggml_scale(ctx0, cur, hparams.f_final_logit_softcapping);
  6371. cb(cur, "result_output", -1);
  6372. res->t_logits = cur;
  6373. ggml_build_forward_expand(gf, cur);
  6374. }
  6375. };
  6376. struct llm_build_gemma3 : public llm_graph_context {
  6377. llm_build_gemma3(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
  6378. const int64_t n_embd_head = hparams.n_embd_head_k;
  6379. ggml_tensor * cur;
  6380. ggml_tensor * inpL;
  6381. inpL = build_inp_embd(model.tok_embd);
  6382. // important: do not normalize weights for raw embeddings input (i.e. encoded image emdeddings)
  6383. if (ubatch.token) {
  6384. inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd));
  6385. cb(inpL, "inp_scaled", -1);
  6386. }
  6387. // inp_pos - contains the positions
  6388. ggml_tensor * inp_pos = build_inp_pos();
  6389. // TODO: is causal == true correct? might need some changes
  6390. auto * inp_attn = build_attn_inp_kv_unified();
  6391. for (int il = 0; il < n_layer; ++il) {
  6392. const bool is_swa = hparams.is_swa(il);
  6393. const float freq_base_l = is_swa ? hparams.rope_freq_base_train_swa : cparams.rope_freq_base;
  6394. const float freq_scale_l = is_swa ? hparams.rope_freq_scale_train_swa : cparams.rope_freq_scale;
  6395. // norm
  6396. cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
  6397. cb(cur, "attn_norm", il);
  6398. // self-attention
  6399. {
  6400. // compute Q and K and RoPE them
  6401. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  6402. cb(Qcur, "Qcur", il);
  6403. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  6404. cb(Kcur, "Kcur", il);
  6405. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  6406. cb(Vcur, "Vcur", il);
  6407. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  6408. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  6409. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  6410. Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
  6411. cb(Qcur, "Qcur_normed", il);
  6412. Qcur = ggml_rope_ext(
  6413. ctx0, Qcur, inp_pos, nullptr,
  6414. n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
  6415. ext_factor, attn_factor, beta_fast, beta_slow);
  6416. Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
  6417. cb(Kcur, "Kcur_normed", il);
  6418. Kcur = ggml_rope_ext(
  6419. ctx0, Kcur, inp_pos, nullptr,
  6420. n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
  6421. ext_factor, attn_factor, beta_fast, beta_slow);
  6422. cb(Qcur, "Qcur", il);
  6423. cb(Kcur, "Kcur", il);
  6424. cb(Vcur, "Vcur", il);
  6425. cur = build_attn(inp_attn, gf,
  6426. model.layers[il].wo, NULL,
  6427. Qcur, Kcur, Vcur, nullptr, hparams.f_attention_scale, il);
  6428. }
  6429. cur = build_norm(cur,
  6430. model.layers[il].attn_post_norm, NULL,
  6431. LLM_NORM_RMS, il);
  6432. cb(cur, "attn_post_norm", il);
  6433. if (il == n_layer - 1) {
  6434. // skip computing output for unused tokens
  6435. ggml_tensor * inp_out_ids = build_inp_out_ids();
  6436. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  6437. inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
  6438. }
  6439. ggml_tensor * sa_out = ggml_add(ctx0, cur, inpL);
  6440. cb(sa_out, "sa_out", il);
  6441. cur = build_norm(sa_out,
  6442. model.layers[il].ffn_norm, NULL,
  6443. LLM_NORM_RMS, il);
  6444. cb(cur, "ffn_norm", il);
  6445. // feed-forward network
  6446. {
  6447. cur = build_ffn(cur,
  6448. model.layers[il].ffn_up, NULL, NULL,
  6449. model.layers[il].ffn_gate, NULL, NULL,
  6450. model.layers[il].ffn_down, NULL, NULL,
  6451. NULL,
  6452. LLM_FFN_GELU, LLM_FFN_PAR, il);
  6453. cb(cur, "ffn_out", il);
  6454. }
  6455. cur = build_norm(cur,
  6456. model.layers[il].ffn_post_norm, NULL,
  6457. LLM_NORM_RMS, -1);
  6458. cb(cur, "ffn_post_norm", -1);
  6459. cur = ggml_add(ctx0, cur, sa_out);
  6460. cur = build_cvec(cur, il);
  6461. cb(cur, "l_out", il);
  6462. // input for next layer
  6463. inpL = cur;
  6464. }
  6465. cur = inpL;
  6466. cur = build_norm(cur,
  6467. model.output_norm, NULL,
  6468. LLM_NORM_RMS, -1);
  6469. cb(cur, "result_norm", -1);
  6470. res->t_embd = cur;
  6471. // lm_head
  6472. cur = build_lora_mm(model.output, cur);
  6473. cb(cur, "result_output", -1);
  6474. res->t_logits = cur;
  6475. ggml_build_forward_expand(gf, cur);
  6476. }
  6477. };
  6478. // TODO: move up next to build_starcoder
  6479. struct llm_build_starcoder2 : public llm_graph_context {
  6480. llm_build_starcoder2(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
  6481. const int64_t n_embd_head = hparams.n_embd_head_v;
  6482. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  6483. GGML_ASSERT(n_embd_head == hparams.n_rot);
  6484. ggml_tensor * cur;
  6485. ggml_tensor * inpL;
  6486. inpL = build_inp_embd(model.tok_embd);
  6487. // inp_pos - contains the positions
  6488. ggml_tensor * inp_pos = build_inp_pos();
  6489. auto * inp_attn = build_attn_inp_kv_unified();
  6490. for (int il = 0; il < n_layer; ++il) {
  6491. ggml_tensor * inpSA = inpL;
  6492. // norm
  6493. cur = build_norm(inpL,
  6494. model.layers[il].attn_norm, model.layers[il].attn_norm_b,
  6495. LLM_NORM, il);
  6496. cb(cur, "attn_norm", il);
  6497. // self-attention
  6498. {
  6499. // compute Q and K and RoPE them
  6500. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  6501. cb(Qcur, "Qcur", il);
  6502. if (model.layers[il].bq) {
  6503. Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
  6504. cb(Qcur, "Qcur", il);
  6505. }
  6506. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  6507. cb(Kcur, "Kcur", il);
  6508. if (model.layers[il].bk) {
  6509. Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
  6510. cb(Kcur, "Kcur", il);
  6511. }
  6512. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  6513. cb(Vcur, "Vcur", il);
  6514. if (model.layers[il].bv) {
  6515. Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
  6516. cb(Vcur, "Vcur", il);
  6517. }
  6518. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  6519. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  6520. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  6521. Qcur = ggml_rope_ext(
  6522. ctx0, Qcur, inp_pos, nullptr,
  6523. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  6524. ext_factor, attn_factor, beta_fast, beta_slow
  6525. );
  6526. Kcur = ggml_rope_ext(
  6527. ctx0, Kcur, inp_pos, nullptr,
  6528. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  6529. ext_factor, attn_factor, beta_fast, beta_slow
  6530. );
  6531. cb(Qcur, "Qcur", il);
  6532. cb(Kcur, "Kcur", il);
  6533. cb(Vcur, "Vcur", il);
  6534. cur = build_attn(inp_attn, gf,
  6535. model.layers[il].wo, model.layers[il].bo,
  6536. Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  6537. }
  6538. if (il == n_layer - 1) {
  6539. // skip computing output for unused tokens
  6540. ggml_tensor * inp_out_ids = build_inp_out_ids();
  6541. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  6542. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  6543. }
  6544. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  6545. cb(ffn_inp, "ffn_inp", il);
  6546. // feed-forward network
  6547. cur = build_norm(ffn_inp,
  6548. model.layers[il].ffn_norm, model.layers[il].ffn_norm_b,
  6549. LLM_NORM, il);
  6550. cb(cur, "ffn_norm", il);
  6551. cur = build_ffn(cur,
  6552. model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
  6553. NULL, NULL, NULL,
  6554. model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
  6555. NULL,
  6556. LLM_FFN_GELU, LLM_FFN_SEQ, il);
  6557. cb(cur, "ffn_out", il);
  6558. cur = ggml_add(ctx0, cur, ffn_inp);
  6559. cur = build_cvec(cur, il);
  6560. cb(cur, "l_out", il);
  6561. // input for next layer
  6562. inpL = cur;
  6563. }
  6564. cur = inpL;
  6565. cur = build_norm(cur,
  6566. model.output_norm, model.output_norm_b,
  6567. LLM_NORM, -1);
  6568. cb(cur, "result_norm", -1);
  6569. res->t_embd = cur;
  6570. // lm_head
  6571. cur = build_lora_mm(model.output, cur);
  6572. cb(cur, "result_output", -1);
  6573. res->t_logits = cur;
  6574. ggml_build_forward_expand(gf, cur);
  6575. }
  6576. };
  6577. struct llm_build_mamba : public llm_graph_context {
  6578. const llama_model & model;
  6579. llm_build_mamba(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params), model(model) {
  6580. ggml_tensor * cur;
  6581. ggml_tensor * inpL;
  6582. // {n_embd, n_tokens}
  6583. inpL = build_inp_embd(model.tok_embd);
  6584. ggml_tensor * state_copy = build_inp_s_copy();
  6585. ggml_tensor * state_mask = build_inp_s_mask();
  6586. for (int il = 0; il < n_layer; ++il) {
  6587. // norm
  6588. cur = build_norm(inpL,
  6589. model.layers[il].attn_norm, NULL,
  6590. LLM_NORM_RMS, il);
  6591. cb(cur, "attn_norm", il);
  6592. //cur = build_mamba_layer(gf, cur, state_copy, state_mask, il);
  6593. cur = build_mamba_layer(gf, cur, state_copy, state_mask, ubatch, il);
  6594. if (il == n_layer - 1) {
  6595. // skip computing output for unused tokens
  6596. ggml_tensor * inp_out_ids = build_inp_out_ids();
  6597. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  6598. inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
  6599. }
  6600. // residual
  6601. cur = ggml_add(ctx0, cur, inpL);
  6602. cur = build_cvec(cur, il);
  6603. cb(cur, "l_out", il);
  6604. // input for next layer
  6605. inpL = cur;
  6606. }
  6607. // final rmsnorm
  6608. cur = build_norm(inpL,
  6609. model.output_norm, NULL,
  6610. LLM_NORM_RMS, -1);
  6611. cb(cur, "result_norm", -1);
  6612. res->t_embd = cur;
  6613. // lm_head
  6614. cur = build_lora_mm(model.output, cur);
  6615. cb(cur, "result_output", -1);
  6616. res->t_logits = cur;
  6617. ggml_build_forward_expand(gf, cur);
  6618. }
  6619. // TODO: split
  6620. ggml_tensor * build_mamba_layer(
  6621. ggml_cgraph * gf,
  6622. ggml_tensor * cur,
  6623. ggml_tensor * state_copy,
  6624. ggml_tensor * state_mask,
  6625. const llama_ubatch & ubatch,
  6626. int il) const {
  6627. const llama_kv_cache_unified * kv_self = static_cast<const llama_kv_cache_unified *>(memory);
  6628. const auto kv_head = kv_self->head;
  6629. const int64_t d_conv = hparams.ssm_d_conv;
  6630. const int64_t d_inner = hparams.ssm_d_inner;
  6631. const int64_t d_state = hparams.ssm_d_state;
  6632. const int64_t dt_rank = hparams.ssm_dt_rank;
  6633. const int64_t n_seqs = ubatch.n_seqs;
  6634. // Some variants of Mamba arch (e.g. FalconMamba do apply layer norm on B and Dt layers)
  6635. const bool ssm_dt_b_c_rms = hparams.ssm_dt_b_c_rms;
  6636. // Use the same RMS norm as the final layer norm
  6637. const float norm_rms_eps = hparams.f_norm_rms_eps;
  6638. const int64_t n_seq_tokens = ubatch.n_seq_tokens;
  6639. GGML_ASSERT(n_seqs != 0);
  6640. GGML_ASSERT(ubatch.equal_seqs);
  6641. GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs);
  6642. ggml_tensor * conv_states_all = kv_self->k_l[il];
  6643. ggml_tensor * ssm_states_all = kv_self->v_l[il];
  6644. // (ab)using the KV cache to store the states
  6645. ggml_tensor * conv = build_copy_mask_state(
  6646. gf, conv_states_all, state_copy, state_mask,
  6647. hparams.n_embd_k_s(), n_seqs);
  6648. conv = ggml_reshape_3d(ctx0, conv, d_conv - 1, d_inner, n_seqs);
  6649. ggml_tensor * ssm = build_copy_mask_state(
  6650. gf, ssm_states_all, state_copy, state_mask,
  6651. hparams.n_embd_v_s(), n_seqs);
  6652. ssm = ggml_reshape_3d(ctx0, ssm, d_state, d_inner, n_seqs);
  6653. // {n_embd, n_tokens} => {n_embd, n_seq_tokens, n_seqs}
  6654. cur = ggml_reshape_3d(ctx0, cur, cur->ne[0], n_seq_tokens, n_seqs);
  6655. // {n_embd, 2*d_inner} @ {n_embd, n_seq_tokens, n_seqs} => {2*d_inner, n_seq_tokens, n_seqs}
  6656. ggml_tensor * xz = build_lora_mm(model.layers[il].ssm_in, cur);
  6657. // split the above in two
  6658. // => {d_inner, n_seq_tokens, n_seqs}
  6659. ggml_tensor * x = ggml_view_3d(ctx0, xz, d_inner, xz->ne[1], xz->ne[2], xz->nb[1], xz->nb[2], 0);
  6660. ggml_tensor * z = ggml_view_3d(ctx0, xz, d_inner, xz->ne[1], xz->ne[2], xz->nb[1], xz->nb[2], d_inner*ggml_element_size(xz));
  6661. // conv
  6662. {
  6663. // => {d_conv - 1 + n_seq_tokens, d_inner, n_seqs}
  6664. ggml_tensor * conv_x = ggml_concat(ctx0, conv, ggml_transpose(ctx0, x), 0);
  6665. // copy last (d_conv - 1) columns back into the state cache
  6666. ggml_tensor * last_conv = ggml_view_3d(ctx0, conv_x, d_conv - 1, d_inner, n_seqs, conv_x->nb[1], conv_x->nb[2], n_seq_tokens*(conv_x->nb[0]));
  6667. ggml_build_forward_expand(gf,
  6668. ggml_cpy(ctx0, last_conv,
  6669. ggml_view_1d(ctx0, conv_states_all,
  6670. (d_conv - 1)*(d_inner)*(n_seqs),
  6671. kv_head*(d_conv - 1)*(d_inner)*ggml_element_size(conv_states_all))));
  6672. // 1D convolution
  6673. // The equivalent is to make a self-overlapping view of conv_x
  6674. // over d_conv columns at each stride in the 3rd dimension,
  6675. // then element-wise multiply that with the conv1d weight,
  6676. // then sum the elements of each row,
  6677. // (the last two steps are a dot product over rows (also doable with mul_mat))
  6678. // then permute away the ne[0] dimension,
  6679. // and then you're left with the resulting x tensor.
  6680. // For simultaneous sequences, all sequences need to have the same length.
  6681. x = ggml_ssm_conv(ctx0, conv_x, model.layers[il].ssm_conv1d);
  6682. // bias
  6683. x = ggml_add(ctx0, x, model.layers[il].ssm_conv1d_b);
  6684. x = ggml_silu(ctx0, x);
  6685. }
  6686. // ssm
  6687. {
  6688. // {d_inner, dt_rank + 2*d_state} @ {d_inner, n_seq_tokens, n_seqs} => {dt_rank + 2*d_state, n_seq_tokens, n_seqs}
  6689. ggml_tensor * x_db = build_lora_mm(model.layers[il].ssm_x, x);
  6690. // split
  6691. ggml_tensor * dt = ggml_view_3d(ctx0, x_db, dt_rank, n_seq_tokens, n_seqs, x_db->nb[1], x_db->nb[2], 0);
  6692. ggml_tensor * B = ggml_view_3d(ctx0, x_db, d_state, n_seq_tokens, n_seqs, x_db->nb[1], x_db->nb[2], ggml_element_size(x_db)*dt_rank);
  6693. ggml_tensor * C = ggml_view_3d(ctx0, x_db, d_state, n_seq_tokens, n_seqs, x_db->nb[1], x_db->nb[2], ggml_element_size(x_db)*(dt_rank+d_state));
  6694. // Some Mamba variants (e.g. FalconMamba) apply RMS norm in B, C & Dt layers
  6695. if (ssm_dt_b_c_rms) {
  6696. dt = ggml_rms_norm(ctx0, dt, norm_rms_eps);
  6697. B = ggml_rms_norm(ctx0, B, norm_rms_eps);
  6698. C = ggml_rms_norm(ctx0, C, norm_rms_eps);
  6699. }
  6700. // {dt_rank, d_inner} @ {dt_rank, n_seq_tokens, n_seqs} => {d_inner, n_seq_tokens, n_seqs}
  6701. dt = build_lora_mm(model.layers[il].ssm_dt, dt);
  6702. dt = ggml_add(ctx0, dt, model.layers[il].ssm_dt_b);
  6703. // Custom operator to optimize the parallel associative scan
  6704. // as described in the Annex D of the Mamba paper.
  6705. // => {d_inner, n_seq_tokens, n_seqs} and {d_state, d_inner, n_seqs}
  6706. ggml_tensor * y_ssm = ggml_ssm_scan(ctx0, ssm, x, dt, model.layers[il].ssm_a, B, C);
  6707. // store last states
  6708. ggml_build_forward_expand(gf,
  6709. ggml_cpy(ctx0,
  6710. ggml_view_1d(ctx0, y_ssm, d_state*d_inner*n_seqs, x->nb[3]),
  6711. ggml_view_1d(ctx0, ssm_states_all, d_state*d_inner*n_seqs, kv_head*d_state*d_inner*ggml_element_size(ssm_states_all))));
  6712. ggml_tensor * y = ggml_view_3d(ctx0, y_ssm, d_inner, n_seq_tokens, n_seqs, x->nb[1], x->nb[2], 0);
  6713. // TODO: skip computing output earlier for unused tokens
  6714. // {d_inner, n_seq_tokens, n_seqs} * {d_inner} => {d_inner, n_seq_tokens, n_seqs}
  6715. y = ggml_add(ctx0, y, ggml_mul(ctx0, x, model.layers[il].ssm_d));
  6716. y = ggml_mul(ctx0, y, ggml_silu(ctx0, ggml_cont(ctx0, z)));
  6717. // {d_inner, n_embd} @ {d_inner, n_seq_tokens, n_seqs} => {n_embd, n_seq_tokens, n_seqs}
  6718. cur = build_lora_mm(model.layers[il].ssm_out, y);
  6719. }
  6720. // {n_embd, n_seq_tokens, n_seqs} => {n_embd, n_tokens}
  6721. cur = ggml_reshape_2d(ctx0, cur, cur->ne[0], n_seq_tokens * n_seqs);
  6722. //cb(cur, "mamba_out", il);
  6723. return cur;
  6724. }
  6725. };
  6726. struct llm_build_command_r : public llm_graph_context {
  6727. llm_build_command_r(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
  6728. const int64_t n_embd_head = hparams.n_embd_head_v;
  6729. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  6730. const float f_logit_scale = hparams.f_logit_scale;
  6731. ggml_tensor * cur;
  6732. ggml_tensor * inpL;
  6733. inpL = build_inp_embd(model.tok_embd);
  6734. // inp_pos - contains the positions
  6735. ggml_tensor * inp_pos = build_inp_pos();
  6736. auto * inp_attn = build_attn_inp_kv_unified();
  6737. for (int il = 0; il < n_layer; ++il) {
  6738. // norm
  6739. cur = build_norm(inpL,
  6740. model.layers[il].attn_norm, NULL,
  6741. LLM_NORM, il);
  6742. cb(cur, "attn_norm", il);
  6743. ggml_tensor * ffn_inp = cur;
  6744. // self-attention
  6745. {
  6746. // compute Q and K and RoPE them
  6747. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  6748. cb(Qcur, "Qcur", il);
  6749. if (model.layers[il].bq) {
  6750. Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
  6751. cb(Qcur, "Qcur", il);
  6752. }
  6753. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  6754. cb(Kcur, "Kcur", il);
  6755. if (model.layers[il].bk) {
  6756. Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
  6757. cb(Kcur, "Kcur", il);
  6758. }
  6759. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  6760. cb(Vcur, "Vcur", il);
  6761. if (model.layers[il].bv) {
  6762. Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
  6763. cb(Vcur, "Vcur", il);
  6764. }
  6765. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  6766. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  6767. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  6768. if (model.layers[il].attn_q_norm) {
  6769. Qcur = build_norm(Qcur,
  6770. model.layers[il].attn_q_norm,
  6771. NULL,
  6772. LLM_NORM, il);
  6773. cb(Qcur, "Qcur", il);
  6774. }
  6775. Qcur = ggml_rope_ext(
  6776. ctx0, Qcur, inp_pos, nullptr,
  6777. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  6778. ext_factor, attn_factor, beta_fast, beta_slow
  6779. );
  6780. if (model.layers[il].attn_k_norm) {
  6781. Kcur = build_norm(Kcur,
  6782. model.layers[il].attn_k_norm,
  6783. NULL,
  6784. LLM_NORM, il);
  6785. cb(Kcur, "Kcur", il);
  6786. }
  6787. Kcur = ggml_rope_ext(
  6788. ctx0, Kcur, inp_pos, nullptr,
  6789. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  6790. ext_factor, attn_factor, beta_fast, beta_slow
  6791. );
  6792. cb(Qcur, "Qcur", il);
  6793. cb(Kcur, "Kcur", il);
  6794. cb(Vcur, "Vcur", il);
  6795. cur = build_attn(inp_attn, gf,
  6796. model.layers[il].wo, model.layers[il].bo,
  6797. Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  6798. }
  6799. if (il == n_layer - 1) {
  6800. // skip computing output for unused tokens
  6801. ggml_tensor * inp_out_ids = build_inp_out_ids();
  6802. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  6803. inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
  6804. ffn_inp = ggml_get_rows(ctx0, ffn_inp, inp_out_ids);
  6805. }
  6806. ggml_tensor * attn_out = cur;
  6807. // feed-forward network
  6808. {
  6809. cur = build_ffn(ffn_inp,
  6810. model.layers[il].ffn_up, NULL, NULL,
  6811. model.layers[il].ffn_gate, NULL, NULL,
  6812. model.layers[il].ffn_down, NULL, NULL,
  6813. NULL,
  6814. LLM_FFN_SILU, LLM_FFN_PAR, il);
  6815. cb(cur, "ffn_out", il);
  6816. }
  6817. // add together residual + FFN + self-attention
  6818. cur = ggml_add(ctx0, cur, inpL);
  6819. cur = ggml_add(ctx0, cur, attn_out);
  6820. cur = build_cvec(cur, il);
  6821. cb(cur, "l_out", il);
  6822. // input for next layer
  6823. inpL = cur;
  6824. }
  6825. cur = inpL;
  6826. cur = build_norm(cur,
  6827. model.output_norm, NULL,
  6828. LLM_NORM, -1);
  6829. cb(cur, "result_norm", -1);
  6830. res->t_embd = cur;
  6831. // lm_head
  6832. cur = build_lora_mm(model.output, cur);
  6833. if (f_logit_scale) {
  6834. cur = ggml_scale(ctx0, cur, f_logit_scale);
  6835. }
  6836. cb(cur, "result_output", -1);
  6837. res->t_logits = cur;
  6838. ggml_build_forward_expand(gf, cur);
  6839. }
  6840. };
  6841. struct llm_build_cohere2 : public llm_graph_context {
  6842. llm_build_cohere2(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
  6843. const int64_t n_embd_head = hparams.n_embd_head_v;
  6844. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  6845. const float f_logit_scale = hparams.f_logit_scale;
  6846. ggml_tensor * cur;
  6847. ggml_tensor * inpL;
  6848. inpL = build_inp_embd(model.tok_embd);
  6849. // inp_pos - contains the positions
  6850. ggml_tensor * inp_pos = build_inp_pos();
  6851. auto * inp_attn = build_attn_inp_kv_unified();
  6852. for (int il = 0; il < n_layer; ++il) {
  6853. const bool is_swa = hparams.is_swa(il);
  6854. // norm
  6855. cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM, il);
  6856. cb(cur, "attn_norm", il);
  6857. ggml_tensor * ffn_inp = cur;
  6858. // self-attention
  6859. {
  6860. // rope freq factors for 128k context
  6861. ggml_tensor * rope_factors = static_cast<const llama_kv_cache_unified *>(memory)->cbs.get_rope_factors(n_ctx_per_seq, il);
  6862. // compute Q and K and RoPE them
  6863. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  6864. cb(Qcur, "Qcur", il);
  6865. if (model.layers[il].bq) {
  6866. Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
  6867. cb(Qcur, "Qcur", il);
  6868. }
  6869. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  6870. cb(Kcur, "Kcur", il);
  6871. if (model.layers[il].bk) {
  6872. Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
  6873. cb(Kcur, "Kcur", il);
  6874. }
  6875. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  6876. cb(Vcur, "Vcur", il);
  6877. if (model.layers[il].bv) {
  6878. Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
  6879. cb(Vcur, "Vcur", il);
  6880. }
  6881. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  6882. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  6883. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  6884. if (is_swa) {
  6885. Qcur = ggml_rope_ext(
  6886. ctx0, Qcur, inp_pos, rope_factors,
  6887. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  6888. ext_factor, attn_factor, beta_fast, beta_slow
  6889. );
  6890. Kcur = ggml_rope_ext(
  6891. ctx0, Kcur, inp_pos, rope_factors,
  6892. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  6893. ext_factor, attn_factor, beta_fast, beta_slow
  6894. );
  6895. }
  6896. cb(Qcur, "Qcur", il);
  6897. cb(Kcur, "Kcur", il);
  6898. cb(Vcur, "Vcur", il);
  6899. cur = build_attn(inp_attn, gf,
  6900. model.layers[il].wo, model.layers[il].bo,
  6901. Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  6902. }
  6903. if (il == n_layer - 1) {
  6904. // skip computing output for unused tokens
  6905. ggml_tensor * inp_out_ids = build_inp_out_ids();
  6906. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  6907. inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
  6908. ffn_inp = ggml_get_rows(ctx0, ffn_inp, inp_out_ids);
  6909. }
  6910. ggml_tensor * attn_out = cur;
  6911. // feed-forward network
  6912. {
  6913. cur = build_ffn(ffn_inp, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate,
  6914. NULL, NULL, model.layers[il].ffn_down, NULL, NULL, NULL, LLM_FFN_SILU, LLM_FFN_PAR,
  6915. il);
  6916. cb(cur, "ffn_out", il);
  6917. }
  6918. // add together residual + FFN + self-attention
  6919. cur = ggml_add(ctx0, cur, inpL);
  6920. cur = ggml_add(ctx0, cur, attn_out);
  6921. cur = build_cvec(cur, il);
  6922. cb(cur, "l_out", il);
  6923. // input for next layer
  6924. inpL = cur;
  6925. }
  6926. cur = inpL;
  6927. cur = build_norm(cur, model.output_norm, NULL, LLM_NORM, -1);
  6928. cb(cur, "result_norm", -1);
  6929. res->t_embd = cur;
  6930. // lm_head
  6931. cur = build_lora_mm(model.output, cur);
  6932. if (f_logit_scale) {
  6933. cur = ggml_scale(ctx0, cur, f_logit_scale);
  6934. }
  6935. cb(cur, "result_output", -1);
  6936. res->t_logits = cur;
  6937. ggml_build_forward_expand(gf, cur);
  6938. }
  6939. };
  6940. // ref: https://allenai.org/olmo
  6941. // based on the original build_llama() function, changes:
  6942. // * non-parametric layer norm
  6943. // * clamp qkv
  6944. // * removed bias
  6945. // * removed MoE
  6946. struct llm_build_olmo : public llm_graph_context {
  6947. llm_build_olmo(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
  6948. const int64_t n_embd_head = hparams.n_embd_head_v;
  6949. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  6950. GGML_ASSERT(n_embd_head == hparams.n_rot);
  6951. ggml_tensor * cur;
  6952. ggml_tensor * inpL;
  6953. inpL = build_inp_embd(model.tok_embd);
  6954. // inp_pos - contains the positions
  6955. ggml_tensor * inp_pos = build_inp_pos();
  6956. auto * inp_attn = build_attn_inp_kv_unified();
  6957. for (int il = 0; il < n_layer; ++il) {
  6958. ggml_tensor * inpSA = inpL;
  6959. // norm
  6960. cur = build_norm(inpL,
  6961. NULL, NULL,
  6962. LLM_NORM, il);
  6963. cb(cur, "attn_norm", il);
  6964. // self-attention
  6965. {
  6966. // compute Q and K and RoPE them
  6967. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  6968. cb(Qcur, "Qcur", il);
  6969. if (hparams.f_clamp_kqv > 0.0f) {
  6970. Qcur = ggml_clamp(ctx0, Qcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
  6971. cb(Qcur, "Qcur", il);
  6972. }
  6973. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  6974. cb(Kcur, "Kcur", il);
  6975. if (hparams.f_clamp_kqv > 0.0f) {
  6976. Kcur = ggml_clamp(ctx0, Kcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
  6977. cb(Kcur, "Kcur", il);
  6978. }
  6979. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  6980. cb(Vcur, "Vcur", il);
  6981. if (hparams.f_clamp_kqv > 0.0f) {
  6982. Vcur = ggml_clamp(ctx0, Vcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
  6983. cb(Vcur, "Vcur", il);
  6984. }
  6985. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  6986. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  6987. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  6988. Qcur = ggml_rope_ext(
  6989. ctx0, Qcur, inp_pos, nullptr,
  6990. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  6991. ext_factor, attn_factor, beta_fast, beta_slow
  6992. );
  6993. Kcur = ggml_rope_ext(
  6994. ctx0, Kcur, inp_pos, nullptr,
  6995. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  6996. ext_factor, attn_factor, beta_fast, beta_slow
  6997. );
  6998. cb(Qcur, "Qcur", il);
  6999. cb(Kcur, "Kcur", il);
  7000. cb(Vcur, "Vcur", il);
  7001. cur = build_attn(inp_attn, gf,
  7002. model.layers[il].wo, nullptr,
  7003. Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  7004. }
  7005. if (il == n_layer - 1) {
  7006. // skip computing output for unused tokens
  7007. ggml_tensor * inp_out_ids = build_inp_out_ids();
  7008. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  7009. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  7010. }
  7011. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  7012. cb(ffn_inp, "ffn_inp", il);
  7013. // feed-forward network
  7014. cur = build_norm(ffn_inp,
  7015. NULL, NULL,
  7016. LLM_NORM, il);
  7017. cb(cur, "ffn_norm", il);
  7018. cur = build_ffn(cur,
  7019. model.layers[il].ffn_up, NULL, NULL,
  7020. model.layers[il].ffn_gate, NULL, NULL,
  7021. model.layers[il].ffn_down, NULL, NULL,
  7022. NULL,
  7023. LLM_FFN_SILU, LLM_FFN_PAR, il);
  7024. cb(cur, "ffn_out", il);
  7025. cur = ggml_add(ctx0, cur, ffn_inp);
  7026. cb(cur, "ffn_out", il);
  7027. cur = build_cvec(cur, il);
  7028. cb(cur, "l_out", il);
  7029. // input for next layer
  7030. inpL = cur;
  7031. }
  7032. cur = inpL;
  7033. cur = build_norm(cur,
  7034. NULL, NULL,
  7035. LLM_NORM, -1);
  7036. cb(cur, "result_norm", -1);
  7037. res->t_embd = cur;
  7038. // lm_head
  7039. cur = build_lora_mm(model.output, cur);
  7040. cb(cur, "result_output", -1);
  7041. res->t_logits = cur;
  7042. ggml_build_forward_expand(gf, cur);
  7043. }
  7044. };
  7045. struct llm_build_olmo2 : public llm_graph_context {
  7046. llm_build_olmo2(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
  7047. const int64_t n_embd_head = hparams.n_embd_head_v;
  7048. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  7049. GGML_ASSERT(n_embd_head == hparams.n_rot);
  7050. ggml_tensor * cur;
  7051. ggml_tensor * inpL;
  7052. inpL = build_inp_embd(model.tok_embd);
  7053. // inp_pos - contains the positions
  7054. ggml_tensor * inp_pos = build_inp_pos();
  7055. auto * inp_attn = build_attn_inp_kv_unified();
  7056. for (int il = 0; il < n_layer; ++il) {
  7057. ggml_tensor * inpSA = inpL;
  7058. cur = inpL;
  7059. // self_attention
  7060. {
  7061. // compute Q and K and RoPE them
  7062. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  7063. cb(Qcur, "Qcur", il);
  7064. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  7065. cb(Kcur, "Kcur", il);
  7066. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  7067. cb(Vcur, "Vcur", il);
  7068. Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL,
  7069. LLM_NORM_RMS, il);
  7070. cb(Qcur, "Qcur_normed", il);
  7071. Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL,
  7072. LLM_NORM_RMS, il);
  7073. cb(Kcur, "Kcur_normed", il);
  7074. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  7075. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  7076. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  7077. Qcur = ggml_rope_ext(
  7078. ctx0, Qcur, inp_pos, nullptr,
  7079. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  7080. ext_factor, attn_factor, beta_fast, beta_slow
  7081. );
  7082. Kcur = ggml_rope_ext(
  7083. ctx0, Kcur, inp_pos, nullptr,
  7084. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  7085. ext_factor, attn_factor, beta_fast, beta_slow
  7086. );
  7087. cb(Qcur, "Qcur", il);
  7088. cb(Kcur, "Kcur", il);
  7089. cb(Vcur, "Vcur", il);
  7090. cur = build_attn(inp_attn, gf,
  7091. model.layers[il].wo, NULL,
  7092. Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  7093. }
  7094. cur = build_norm(cur,
  7095. model.layers[il].attn_post_norm, NULL,
  7096. LLM_NORM_RMS, il);
  7097. cb(cur, "attn_post_norm", il);
  7098. if (il == n_layer - 1) {
  7099. // skip computing output for unused tokens
  7100. ggml_tensor * inp_out_ids = build_inp_out_ids();
  7101. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  7102. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  7103. }
  7104. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  7105. cb(ffn_inp, "ffn_inp", il);
  7106. // feed-forward network
  7107. cur = build_ffn(ffn_inp,
  7108. model.layers[il].ffn_up, NULL, NULL,
  7109. model.layers[il].ffn_gate, NULL, NULL,
  7110. model.layers[il].ffn_down, NULL, NULL,
  7111. NULL,
  7112. LLM_FFN_SILU, LLM_FFN_PAR, il);
  7113. cb(cur, "ffn_out", il);
  7114. cur = build_norm(cur,
  7115. model.layers[il].ffn_post_norm, NULL,
  7116. LLM_NORM_RMS, -1);
  7117. cb(cur, "ffn_post_norm", -1);
  7118. cur = ggml_add(ctx0, cur, ffn_inp);
  7119. cb(cur, "ffn_out", il);
  7120. cur = build_cvec(cur, il);
  7121. cb(cur, "l_out", il);
  7122. // input for next layer
  7123. inpL = cur;
  7124. }
  7125. cur = inpL;
  7126. cur = build_norm(cur,
  7127. model.output_norm, NULL,
  7128. LLM_NORM_RMS, -1);
  7129. cb(cur, "result_norm", -1);
  7130. res->t_embd = cur;
  7131. // lm_head
  7132. cur = build_lora_mm(model.output, cur);
  7133. cb(cur, "result_output", -1);
  7134. res->t_logits = cur;
  7135. ggml_build_forward_expand(gf, cur);
  7136. }
  7137. };
  7138. // based on the build_qwen2moe() function, changes:
  7139. // * removed shared experts
  7140. // * removed bias
  7141. // * added q, k norm
  7142. struct llm_build_olmoe : public llm_graph_context {
  7143. llm_build_olmoe(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
  7144. const int64_t n_embd_head = hparams.n_embd_head_v;
  7145. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  7146. GGML_ASSERT(n_embd_head == hparams.n_rot);
  7147. ggml_tensor * cur;
  7148. ggml_tensor * inpL;
  7149. inpL = build_inp_embd(model.tok_embd);
  7150. // inp_pos - contains the positions
  7151. ggml_tensor * inp_pos = build_inp_pos();
  7152. auto * inp_attn = build_attn_inp_kv_unified();
  7153. for (int il = 0; il < n_layer; ++il) {
  7154. ggml_tensor * inpSA = inpL;
  7155. // norm
  7156. cur = build_norm(inpL,
  7157. model.layers[il].attn_norm, NULL,
  7158. LLM_NORM_RMS, il);
  7159. cb(cur, "attn_norm", il);
  7160. // self_attention
  7161. {
  7162. // compute Q and K and RoPE them
  7163. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  7164. cb(Qcur, "Qcur", il);
  7165. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  7166. cb(Kcur, "Kcur", il);
  7167. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  7168. cb(Vcur, "Vcur", il);
  7169. Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL,
  7170. LLM_NORM_RMS, il);
  7171. cb(Qcur, "Qcur_normed", il);
  7172. Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL,
  7173. LLM_NORM_RMS, il);
  7174. cb(Kcur, "Kcur_normed", il);
  7175. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  7176. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  7177. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  7178. Qcur = ggml_rope_ext(
  7179. ctx0, Qcur, inp_pos, nullptr,
  7180. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  7181. ext_factor, attn_factor, beta_fast, beta_slow
  7182. );
  7183. Kcur = ggml_rope_ext(
  7184. ctx0, Kcur, inp_pos, nullptr,
  7185. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  7186. ext_factor, attn_factor, beta_fast, beta_slow
  7187. );
  7188. cb(Qcur, "Qcur", il);
  7189. cb(Kcur, "Kcur", il);
  7190. cb(Vcur, "Vcur", il);
  7191. cur = build_attn(inp_attn, gf,
  7192. model.layers[il].wo, NULL,
  7193. Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  7194. }
  7195. if (il == n_layer - 1) {
  7196. // skip computing output for unused tokens
  7197. ggml_tensor * inp_out_ids = build_inp_out_ids();
  7198. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  7199. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  7200. }
  7201. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  7202. cb(ffn_inp, "ffn_inp", il);
  7203. // MoE branch
  7204. cur = build_norm(ffn_inp,
  7205. model.layers[il].ffn_norm, NULL,
  7206. LLM_NORM_RMS, il);
  7207. cb(cur, "ffn_norm", il);
  7208. cur = build_moe_ffn(cur,
  7209. model.layers[il].ffn_gate_inp,
  7210. model.layers[il].ffn_up_exps,
  7211. model.layers[il].ffn_gate_exps,
  7212. model.layers[il].ffn_down_exps,
  7213. nullptr,
  7214. n_expert, n_expert_used,
  7215. LLM_FFN_SILU, false,
  7216. false, 0.0,
  7217. LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
  7218. il);
  7219. cb(cur, "ffn_moe_out", il);
  7220. cur = ggml_add(ctx0, cur, ffn_inp);
  7221. cur = build_cvec(cur, il);
  7222. cb(cur, "l_out", il);
  7223. // input for next layer
  7224. inpL = cur;
  7225. }
  7226. cur = inpL;
  7227. cur = build_norm(cur,
  7228. model.output_norm, NULL,
  7229. LLM_NORM_RMS, -1);
  7230. cb(cur, "result_norm", -1);
  7231. res->t_embd = cur;
  7232. // lm_head
  7233. cur = build_lora_mm(model.output, cur);
  7234. cb(cur, "result_output", -1);
  7235. res->t_logits = cur;
  7236. ggml_build_forward_expand(gf, cur);
  7237. }
  7238. };
  7239. struct llm_build_openelm : public llm_graph_context {
  7240. llm_build_openelm(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
  7241. const int64_t n_embd_head = hparams.n_embd_head_v;
  7242. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  7243. ggml_tensor * cur;
  7244. ggml_tensor * inpL;
  7245. inpL = build_inp_embd(model.tok_embd);
  7246. // inp_pos - contains the positions
  7247. ggml_tensor * inp_pos = build_inp_pos();
  7248. auto * inp_attn = build_attn_inp_kv_unified();
  7249. for (int il = 0; il < n_layer; ++il) {
  7250. const int64_t n_head = hparams.n_head(il);
  7251. const int64_t n_head_kv = hparams.n_head_kv(il);
  7252. const int64_t n_head_qkv = 2*n_head_kv + n_head;
  7253. cur = inpL;
  7254. ggml_tensor * residual = cur;
  7255. // norm
  7256. cur = build_norm(inpL,
  7257. model.layers[il].attn_norm, NULL,
  7258. LLM_NORM_RMS, il);
  7259. cb(cur, "attn_norm", il);
  7260. // self-attention
  7261. {
  7262. cur = build_lora_mm(model.layers[il].wqkv, cur);
  7263. cb(cur, "wqkv", il);
  7264. cur = ggml_reshape_3d(ctx0, cur, n_embd_head_k, n_head_qkv, n_tokens);
  7265. ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, cur->nb[1], cur->nb[2], 0));
  7266. cb(Qcur, "Qcur", il);
  7267. ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, cur->nb[1], cur->nb[2], cur->nb[1]*n_head));
  7268. cb(Kcur, "Kcur", il);
  7269. ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, cur->nb[1], cur->nb[2], cur->nb[1]*(n_head+n_head_kv)));
  7270. cb(Vcur, "Vcur", il);
  7271. Qcur = build_norm(Qcur,
  7272. model.layers[il].attn_q_norm, NULL,
  7273. LLM_NORM_RMS, il);
  7274. cb(Qcur, "Qcur", il);
  7275. Kcur = build_norm(Kcur,
  7276. model.layers[il].attn_k_norm, NULL,
  7277. LLM_NORM_RMS, il);
  7278. cb(Kcur, "Kcur", il);
  7279. Qcur = ggml_rope_ext(
  7280. ctx0, Qcur, inp_pos, NULL,
  7281. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  7282. ext_factor, attn_factor, beta_fast, beta_slow
  7283. );
  7284. Kcur = ggml_rope_ext(
  7285. ctx0, Kcur, inp_pos, NULL,
  7286. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  7287. ext_factor, attn_factor, beta_fast, beta_slow
  7288. );
  7289. cb(Qcur, "Qcur", il);
  7290. cb(Kcur, "Kcur", il);
  7291. cb(Qcur, "Vcur", il);
  7292. cur = build_attn(inp_attn, gf,
  7293. model.layers[il].wo, NULL,
  7294. Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  7295. }
  7296. if (il == n_layer - 1) {
  7297. // skip computing output for unused tokens
  7298. ggml_tensor * inp_out_ids = build_inp_out_ids();
  7299. residual = ggml_get_rows(ctx0, residual, inp_out_ids);
  7300. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  7301. }
  7302. ggml_tensor * ffn_inp = ggml_add(ctx0, residual, cur);
  7303. cb(ffn_inp, "ffn_inp", il);
  7304. // feed-forward network
  7305. {
  7306. cur = build_norm(ffn_inp,
  7307. model.layers[il].ffn_norm, NULL,
  7308. LLM_NORM_RMS, il);
  7309. cb(cur, "ffn_norm", il);
  7310. cur = build_ffn(cur,
  7311. model.layers[il].ffn_up, NULL, NULL,
  7312. model.layers[il].ffn_gate, NULL, NULL,
  7313. model.layers[il].ffn_down, NULL, NULL,
  7314. NULL,
  7315. LLM_FFN_SILU, LLM_FFN_PAR, il);
  7316. cb(cur, "ffn_out", il);
  7317. }
  7318. cur = ggml_add(ctx0, cur, ffn_inp);
  7319. cur = build_cvec(cur, il);
  7320. cb(cur, "l_out", il);
  7321. inpL = cur;
  7322. }
  7323. cur = inpL;
  7324. // norm
  7325. cur = build_norm(cur,
  7326. model.output_norm, NULL,
  7327. LLM_NORM_RMS, -1);
  7328. cb(cur, "result_norm", -1);
  7329. res->t_embd = cur;
  7330. cur = build_lora_mm(model.output, cur);
  7331. cb(cur, "result_output", -1);
  7332. res->t_logits = cur;
  7333. ggml_build_forward_expand(gf, cur);
  7334. }
  7335. };
  7336. struct llm_build_gptneox : public llm_graph_context {
  7337. llm_build_gptneox(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
  7338. const int64_t n_embd_head = hparams.n_embd_head_v;
  7339. const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
  7340. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  7341. ggml_tensor * cur;
  7342. ggml_tensor * inpL;
  7343. inpL = build_inp_embd(model.tok_embd);
  7344. // inp_pos - contains the positions
  7345. ggml_tensor * inp_pos = build_inp_pos();
  7346. auto * inp_attn = build_attn_inp_kv_unified();
  7347. for (int il = 0; il < n_layer; ++il) {
  7348. cur = build_norm(inpL,
  7349. model.layers[il].attn_norm,
  7350. model.layers[il].attn_norm_b,
  7351. LLM_NORM, il);
  7352. cb(cur, "attn_norm", il);
  7353. // self-attention
  7354. {
  7355. cur = build_lora_mm(model.layers[il].wqkv, cur);
  7356. cb(cur, "wqkv", il);
  7357. cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
  7358. cb(cur, "bqkv", il);
  7359. ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
  7360. ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
  7361. ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
  7362. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  7363. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  7364. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  7365. Qcur = ggml_rope_ext(
  7366. ctx0, Qcur, inp_pos, nullptr,
  7367. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  7368. ext_factor, attn_factor, beta_fast, beta_slow
  7369. );
  7370. Kcur = ggml_rope_ext(
  7371. ctx0, Kcur, inp_pos, nullptr,
  7372. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  7373. ext_factor, attn_factor, beta_fast, beta_slow
  7374. );
  7375. cb(Qcur, "Qcur", il);
  7376. cb(Kcur, "Kcur", il);
  7377. cb(Vcur, "Vcur", il);
  7378. cur = build_attn(inp_attn, gf,
  7379. model.layers[il].wo, model.layers[il].bo,
  7380. Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  7381. }
  7382. if (il == n_layer - 1) {
  7383. // skip computing output for unused tokens
  7384. ggml_tensor * inp_out_ids = build_inp_out_ids();
  7385. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  7386. inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
  7387. }
  7388. // ffn
  7389. if (hparams.use_par_res) {
  7390. // attention and ffn are computed in parallel
  7391. // x = x + attn(ln1(x)) + ffn(ln2(x))
  7392. ggml_tensor * attn_out = cur;
  7393. cur = build_norm(inpL,
  7394. model.layers[il].ffn_norm,
  7395. model.layers[il].ffn_norm_b,
  7396. LLM_NORM, il);
  7397. cb(cur, "ffn_norm", il);
  7398. cur = build_ffn(cur,
  7399. model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
  7400. NULL, NULL, NULL,
  7401. model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
  7402. NULL,
  7403. LLM_FFN_GELU, LLM_FFN_SEQ, il);
  7404. cb(cur, "ffn_out", il);
  7405. cur = ggml_add(ctx0, cur, inpL);
  7406. cb(cur, "ffn_out", il);
  7407. cur = ggml_add(ctx0, cur, attn_out);
  7408. cur = build_cvec(cur, il);
  7409. cb(cur, "l_out", il);
  7410. // input for next layer
  7411. inpL = cur;
  7412. } else {
  7413. // attention and ffn are computed sequentially
  7414. // x = x + attn(ln1(x))
  7415. // x = x + ffn(ln2(x))
  7416. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
  7417. cb(ffn_inp, "ffn_inp", il);
  7418. cur = build_norm(ffn_inp,
  7419. model.layers[il].ffn_norm,
  7420. model.layers[il].ffn_norm_b,
  7421. LLM_NORM, il);
  7422. cb(cur, "ffn_norm", il);
  7423. cur = build_ffn(cur,
  7424. model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
  7425. NULL, NULL, NULL,
  7426. model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
  7427. NULL,
  7428. LLM_FFN_GELU, LLM_FFN_SEQ, il);
  7429. cb(cur, "ffn_out", il);
  7430. cur = ggml_add(ctx0, cur, ffn_inp);
  7431. cur = build_cvec(cur, il);
  7432. cb(cur, "l_out", il);
  7433. // input for next layer
  7434. inpL = cur;
  7435. }
  7436. }
  7437. cur = build_norm(inpL,
  7438. model.output_norm,
  7439. model.output_norm_b,
  7440. LLM_NORM, -1);
  7441. cb(cur, "result_norm", -1);
  7442. res->t_embd = cur;
  7443. cur = build_lora_mm(model.output, cur);
  7444. cb(cur, "result_output", -1);
  7445. res->t_logits = cur;
  7446. ggml_build_forward_expand(gf, cur);
  7447. }
  7448. };
  7449. struct llm_build_arctic : public llm_graph_context {
  7450. llm_build_arctic(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
  7451. const int64_t n_embd_head = hparams.n_embd_head_v;
  7452. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  7453. GGML_ASSERT(n_embd_head == hparams.n_rot);
  7454. ggml_tensor * cur;
  7455. ggml_tensor * inpL;
  7456. inpL = build_inp_embd(model.tok_embd);
  7457. // inp_pos - contains the positions
  7458. ggml_tensor * inp_pos = build_inp_pos();
  7459. auto * inp_attn = build_attn_inp_kv_unified();
  7460. for (int il = 0; il < n_layer; ++il) {
  7461. ggml_tensor * inpSA = inpL;
  7462. // norm
  7463. cur = build_norm(inpL,
  7464. model.layers[il].attn_norm, NULL,
  7465. LLM_NORM_RMS, il);
  7466. cb(cur, "attn_norm", il);
  7467. // self-attention
  7468. {
  7469. // compute Q and K and RoPE them
  7470. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  7471. cb(Qcur, "Qcur", il);
  7472. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  7473. cb(Kcur, "Kcur", il);
  7474. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  7475. cb(Vcur, "Vcur", il);
  7476. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  7477. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  7478. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  7479. Qcur = ggml_rope_ext(
  7480. ctx0, Qcur, inp_pos, nullptr,
  7481. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  7482. ext_factor, attn_factor, beta_fast, beta_slow
  7483. );
  7484. Kcur = ggml_rope_ext(
  7485. ctx0, Kcur, inp_pos, nullptr,
  7486. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  7487. ext_factor, attn_factor, beta_fast, beta_slow
  7488. );
  7489. cb(Qcur, "Qcur", il);
  7490. cb(Kcur, "Kcur", il);
  7491. cb(Vcur, "Vcur", il);
  7492. cur = build_attn(inp_attn, gf,
  7493. model.layers[il].wo, NULL,
  7494. Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  7495. }
  7496. if (il == n_layer - 1) {
  7497. // skip computing output for unused tokens
  7498. ggml_tensor * inp_out_ids = build_inp_out_ids();
  7499. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  7500. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  7501. }
  7502. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  7503. cb(ffn_inp, "ffn_inp", il);
  7504. // feed-forward network
  7505. cur = build_norm(ffn_inp,
  7506. model.layers[il].ffn_norm, NULL,
  7507. LLM_NORM_RMS, il);
  7508. cb(cur, "ffn_norm", il);
  7509. cur = build_ffn(cur,
  7510. model.layers[il].ffn_up, NULL, NULL,
  7511. model.layers[il].ffn_gate, NULL, NULL,
  7512. model.layers[il].ffn_down, NULL, NULL,
  7513. NULL,
  7514. LLM_FFN_SILU, LLM_FFN_PAR, il);
  7515. cb(cur, "ffn_out", il);
  7516. ggml_tensor * ffn_out = ggml_add(ctx0, cur, ffn_inp);
  7517. cb(ffn_out, "ffn_out", il);
  7518. // MoE
  7519. cur = build_norm(inpSA,
  7520. model.layers[il].ffn_norm_exps, NULL,
  7521. LLM_NORM_RMS, il);
  7522. cb(cur, "ffn_norm_exps", il);
  7523. cur = build_moe_ffn(cur,
  7524. model.layers[il].ffn_gate_inp,
  7525. model.layers[il].ffn_up_exps,
  7526. model.layers[il].ffn_gate_exps,
  7527. model.layers[il].ffn_down_exps,
  7528. nullptr,
  7529. n_expert, n_expert_used,
  7530. LLM_FFN_SILU, true,
  7531. false, 0.0,
  7532. LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
  7533. il);
  7534. cb(cur, "ffn_moe_out", il);
  7535. cur = ggml_add(ctx0, cur, ffn_out);
  7536. cb(cur, "ffn_out", il);
  7537. cur = build_cvec(cur, il);
  7538. cb(cur, "l_out", il);
  7539. // input for next layer
  7540. inpL = cur;
  7541. }
  7542. cur = inpL;
  7543. cur = build_norm(cur,
  7544. model.output_norm, NULL,
  7545. LLM_NORM_RMS, -1);
  7546. cb(cur, "result_norm", -1);
  7547. res->t_embd = cur;
  7548. // lm_head
  7549. cur = build_lora_mm(model.output, cur);
  7550. cb(cur, "result_output", -1);
  7551. res->t_logits = cur;
  7552. ggml_build_forward_expand(gf, cur);
  7553. }
  7554. };
  7555. struct llm_build_deepseek : public llm_graph_context {
  7556. llm_build_deepseek(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
  7557. const int64_t n_embd_head = hparams.n_embd_head_v;
  7558. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  7559. GGML_ASSERT(n_embd_head == hparams.n_rot);
  7560. ggml_tensor * cur;
  7561. ggml_tensor * inpL;
  7562. inpL = build_inp_embd(model.tok_embd);
  7563. // inp_pos - contains the positions
  7564. ggml_tensor * inp_pos = build_inp_pos();
  7565. auto * inp_attn = build_attn_inp_kv_unified();
  7566. const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
  7567. for (int il = 0; il < n_layer; ++il) {
  7568. ggml_tensor * inpSA = inpL;
  7569. // norm
  7570. cur = build_norm(inpL,
  7571. model.layers[il].attn_norm, NULL,
  7572. LLM_NORM_RMS, il);
  7573. cb(cur, "attn_norm", il);
  7574. // self-attention
  7575. {
  7576. // rope freq factors for llama3; may return nullptr for llama2 and other models
  7577. ggml_tensor * rope_factors = static_cast<const llama_kv_cache_unified *>(memory)->cbs.get_rope_factors(n_ctx_per_seq, il);
  7578. // compute Q and K and RoPE them
  7579. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  7580. cb(Qcur, "Qcur", il);
  7581. if (model.layers[il].bq) {
  7582. Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
  7583. cb(Qcur, "Qcur", il);
  7584. }
  7585. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  7586. cb(Kcur, "Kcur", il);
  7587. if (model.layers[il].bk) {
  7588. Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
  7589. cb(Kcur, "Kcur", il);
  7590. }
  7591. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  7592. cb(Vcur, "Vcur", il);
  7593. if (model.layers[il].bv) {
  7594. Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
  7595. cb(Vcur, "Vcur", il);
  7596. }
  7597. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  7598. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  7599. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  7600. Qcur = ggml_rope_ext(
  7601. ctx0, Qcur, inp_pos, rope_factors,
  7602. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  7603. ext_factor, attn_factor, beta_fast, beta_slow
  7604. );
  7605. Kcur = ggml_rope_ext(
  7606. ctx0, Kcur, inp_pos, rope_factors,
  7607. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  7608. ext_factor, attn_factor, beta_fast, beta_slow
  7609. );
  7610. cb(Qcur, "Qcur", il);
  7611. cb(Kcur, "Kcur", il);
  7612. cb(Vcur, "Vcur", il);
  7613. cur = build_attn(inp_attn, gf,
  7614. model.layers[il].wo, model.layers[il].bo,
  7615. Qcur, Kcur, Vcur, nullptr, kq_scale, il);
  7616. }
  7617. if (il == n_layer - 1) {
  7618. // skip computing output for unused tokens
  7619. ggml_tensor * inp_out_ids = build_inp_out_ids();
  7620. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  7621. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  7622. }
  7623. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  7624. cb(ffn_inp, "ffn_inp", il);
  7625. cur = build_norm(ffn_inp,
  7626. model.layers[il].ffn_norm, NULL,
  7627. LLM_NORM_RMS, il);
  7628. cb(cur, "ffn_norm", il);
  7629. if ((uint32_t) il < hparams.n_layer_dense_lead) {
  7630. cur = build_ffn(cur,
  7631. model.layers[il].ffn_up, NULL, NULL,
  7632. model.layers[il].ffn_gate, NULL, NULL,
  7633. model.layers[il].ffn_down, NULL, NULL,
  7634. NULL,
  7635. LLM_FFN_SILU, LLM_FFN_PAR, il);
  7636. cb(cur, "ffn_out", il);
  7637. } else {
  7638. // MoE branch
  7639. ggml_tensor * moe_out =
  7640. build_moe_ffn(cur,
  7641. model.layers[il].ffn_gate_inp,
  7642. model.layers[il].ffn_up_exps,
  7643. model.layers[il].ffn_gate_exps,
  7644. model.layers[il].ffn_down_exps,
  7645. nullptr,
  7646. n_expert, n_expert_used,
  7647. LLM_FFN_SILU, false,
  7648. false, hparams.expert_weights_scale,
  7649. LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
  7650. il);
  7651. cb(moe_out, "ffn_moe_out", il);
  7652. // FFN shared expert
  7653. {
  7654. ggml_tensor * ffn_shexp = build_ffn(cur,
  7655. model.layers[il].ffn_up_shexp, NULL, NULL,
  7656. model.layers[il].ffn_gate_shexp, NULL, NULL,
  7657. model.layers[il].ffn_down_shexp, NULL, NULL,
  7658. NULL,
  7659. LLM_FFN_SILU, LLM_FFN_PAR, il);
  7660. cb(ffn_shexp, "ffn_shexp", il);
  7661. cur = ggml_add(ctx0, moe_out, ffn_shexp);
  7662. cb(cur, "ffn_out", il);
  7663. }
  7664. }
  7665. cur = ggml_add(ctx0, cur, ffn_inp);
  7666. cur = build_cvec(cur, il);
  7667. cb(cur, "l_out", il);
  7668. // input for next layer
  7669. inpL = cur;
  7670. }
  7671. cur = inpL;
  7672. cur = build_norm(cur,
  7673. model.output_norm, NULL,
  7674. LLM_NORM_RMS, -1);
  7675. cb(cur, "result_norm", -1);
  7676. res->t_embd = cur;
  7677. // lm_head
  7678. cur = build_lora_mm(model.output, cur);
  7679. cb(cur, "result_output", -1);
  7680. res->t_logits = cur;
  7681. ggml_build_forward_expand(gf, cur);
  7682. }
  7683. };
  7684. struct llm_build_deepseek2 : public llm_graph_context {
  7685. llm_build_deepseek2(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
  7686. bool is_lite = (hparams.n_layer == 27);
  7687. // We have to pre-scale kq_scale and attn_factor to make the YaRN RoPE work correctly.
  7688. // See https://github.com/ggerganov/llama.cpp/discussions/7416 for detailed explanation.
  7689. const float mscale = attn_factor * (1.0f + hparams.rope_yarn_log_mul * logf(1.0f / freq_scale));
  7690. const float kq_scale = 1.0f*mscale*mscale/sqrtf(float(hparams.n_embd_head_k));
  7691. const float attn_factor_scaled = 1.0f / (1.0f + 0.1f * logf(1.0f / freq_scale));
  7692. const uint32_t n_embd_head_qk_rope = hparams.n_rot;
  7693. const uint32_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
  7694. const uint32_t kv_lora_rank = hparams.n_lora_kv;
  7695. ggml_tensor * cur;
  7696. ggml_tensor * inpL;
  7697. // {n_embd, n_tokens}
  7698. inpL = build_inp_embd(model.tok_embd);
  7699. // inp_pos - contains the positions
  7700. ggml_tensor * inp_pos = build_inp_pos();
  7701. auto * inp_attn = build_attn_inp_kv_unified();
  7702. for (int il = 0; il < n_layer; ++il) {
  7703. ggml_tensor * inpSA = inpL;
  7704. // norm
  7705. cur = build_norm(inpL,
  7706. model.layers[il].attn_norm, NULL,
  7707. LLM_NORM_RMS, il);
  7708. cb(cur, "attn_norm", il);
  7709. // self_attention
  7710. {
  7711. ggml_tensor * q = NULL;
  7712. if (!is_lite) {
  7713. // {n_embd, q_lora_rank} * {n_embd, n_tokens} -> {q_lora_rank, n_tokens}
  7714. q = ggml_mul_mat(ctx0, model.layers[il].wq_a, cur);
  7715. cb(q, "q", il);
  7716. q = build_norm(q,
  7717. model.layers[il].attn_q_a_norm, NULL,
  7718. LLM_NORM_RMS, il);
  7719. cb(q, "q", il);
  7720. // {q_lora_rank, n_head * hparams.n_embd_head_k} * {q_lora_rank, n_tokens} -> {n_head * hparams.n_embd_head_k, n_tokens}
  7721. q = ggml_mul_mat(ctx0, model.layers[il].wq_b, q);
  7722. cb(q, "q", il);
  7723. } else {
  7724. q = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
  7725. cb(q, "q", il);
  7726. }
  7727. // split into {n_head * n_embd_head_qk_nope, n_tokens}
  7728. ggml_tensor * q_nope = ggml_view_3d(ctx0, q, n_embd_head_qk_nope, n_head, n_tokens,
  7729. ggml_row_size(q->type, hparams.n_embd_head_k),
  7730. ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
  7731. 0);
  7732. cb(q_nope, "q_nope", il);
  7733. // and {n_head * n_embd_head_qk_rope, n_tokens}
  7734. ggml_tensor * q_pe = ggml_view_3d(ctx0, q, n_embd_head_qk_rope, n_head, n_tokens,
  7735. ggml_row_size(q->type, hparams.n_embd_head_k),
  7736. ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
  7737. ggml_row_size(q->type, n_embd_head_qk_nope));
  7738. cb(q_pe, "q_pe", il);
  7739. // {n_embd, kv_lora_rank + n_embd_head_qk_rope} * {n_embd, n_tokens} -> {kv_lora_rank + n_embd_head_qk_rope, n_tokens}
  7740. ggml_tensor * kv_pe_compresseed = ggml_mul_mat(ctx0, model.layers[il].wkv_a_mqa, cur);
  7741. cb(kv_pe_compresseed, "kv_pe_compresseed", il);
  7742. // split into {kv_lora_rank, n_tokens}
  7743. ggml_tensor * kv_compressed = ggml_view_2d(ctx0, kv_pe_compresseed, kv_lora_rank, n_tokens,
  7744. kv_pe_compresseed->nb[1],
  7745. 0);
  7746. cb(kv_compressed, "kv_compressed", il);
  7747. // and {n_embd_head_qk_rope, n_tokens}
  7748. ggml_tensor * k_pe = ggml_view_3d(ctx0, kv_pe_compresseed, n_embd_head_qk_rope, 1, n_tokens,
  7749. kv_pe_compresseed->nb[1],
  7750. kv_pe_compresseed->nb[1],
  7751. ggml_row_size(kv_pe_compresseed->type, kv_lora_rank));
  7752. cb(k_pe, "k_pe", il);
  7753. // TODO: the CUDA backend used to not support non-cont. (RMS) norm, investigate removing ggml_cont
  7754. kv_compressed = ggml_cont(ctx0, kv_compressed);
  7755. kv_compressed = build_norm(kv_compressed,
  7756. model.layers[il].attn_kv_a_norm, NULL,
  7757. LLM_NORM_RMS, il);
  7758. cb(kv_compressed, "kv_compressed", il);
  7759. // {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)} * {kv_lora_rank, n_tokens} -> {n_head * (n_embd_head_qk_nope + n_embd_head_v), n_tokens}
  7760. ggml_tensor * kv = ggml_mul_mat(ctx0, model.layers[il].wkv_b, kv_compressed);
  7761. cb(kv, "kv", il);
  7762. // split into {n_head * n_embd_head_qk_nope, n_tokens}
  7763. ggml_tensor * k_nope = ggml_view_3d(ctx0, kv, n_embd_head_qk_nope, n_head, n_tokens,
  7764. ggml_row_size(kv->type, n_embd_head_qk_nope + hparams.n_embd_head_v),
  7765. ggml_row_size(kv->type, n_head * (n_embd_head_qk_nope + hparams.n_embd_head_v)),
  7766. 0);
  7767. cb(k_nope, "k_nope", il);
  7768. // and {n_head * n_embd_head_v, n_tokens}
  7769. ggml_tensor * v_states = ggml_view_3d(ctx0, kv, hparams.n_embd_head_v, n_head, n_tokens,
  7770. ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)),
  7771. ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)*n_head),
  7772. ggml_row_size(kv->type, (n_embd_head_qk_nope)));
  7773. cb(v_states, "v_states", il);
  7774. v_states = ggml_cont(ctx0, v_states);
  7775. cb(v_states, "v_states", il);
  7776. v_states = ggml_view_2d(ctx0, v_states, hparams.n_embd_head_v * n_head, n_tokens,
  7777. ggml_row_size(kv->type, hparams.n_embd_head_v * n_head),
  7778. 0);
  7779. cb(v_states, "v_states", il);
  7780. q_pe = ggml_cont(ctx0, q_pe); // TODO: the CUDA backend used to not support non-cont. RoPE, investigate removing this
  7781. q_pe = ggml_rope_ext(
  7782. ctx0, q_pe, inp_pos, nullptr,
  7783. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  7784. ext_factor, attn_factor_scaled, beta_fast, beta_slow
  7785. );
  7786. cb(q_pe, "q_pe", il);
  7787. // shared RoPE key
  7788. k_pe = ggml_cont(ctx0, k_pe); // TODO: the CUDA backend used to not support non-cont. RoPE, investigate removing this
  7789. k_pe = ggml_rope_ext(
  7790. ctx0, k_pe, inp_pos, nullptr,
  7791. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  7792. ext_factor, attn_factor_scaled, beta_fast, beta_slow
  7793. );
  7794. cb(k_pe, "k_pe", il);
  7795. ggml_tensor * q_states = ggml_concat(ctx0, q_nope, q_pe, 0);
  7796. cb(q_states, "q_states", il);
  7797. ggml_tensor * k_states = ggml_concat(ctx0, k_nope, ggml_repeat(ctx0, k_pe, q_pe), 0);
  7798. cb(k_states, "k_states", il);
  7799. cur = build_attn(inp_attn, gf,
  7800. model.layers[il].wo, NULL,
  7801. q_states, k_states, v_states, nullptr, kq_scale, il);
  7802. }
  7803. if (il == n_layer - 1) {
  7804. // skip computing output for unused tokens
  7805. ggml_tensor * inp_out_ids = build_inp_out_ids();
  7806. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  7807. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  7808. }
  7809. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  7810. cb(ffn_inp, "ffn_inp", il);
  7811. cur = build_norm(ffn_inp,
  7812. model.layers[il].ffn_norm, NULL,
  7813. LLM_NORM_RMS, il);
  7814. cb(cur, "ffn_norm", il);
  7815. if ((uint32_t) il < hparams.n_layer_dense_lead) {
  7816. cur = build_ffn(cur,
  7817. model.layers[il].ffn_up, NULL, NULL,
  7818. model.layers[il].ffn_gate, NULL, NULL,
  7819. model.layers[il].ffn_down, NULL, NULL,
  7820. NULL,
  7821. LLM_FFN_SILU, LLM_FFN_PAR, il);
  7822. cb(cur, "ffn_out", il);
  7823. } else {
  7824. // MoE branch
  7825. ggml_tensor * moe_out =
  7826. build_moe_ffn(cur,
  7827. model.layers[il].ffn_gate_inp,
  7828. model.layers[il].ffn_up_exps,
  7829. model.layers[il].ffn_gate_exps,
  7830. model.layers[il].ffn_down_exps,
  7831. model.layers[il].ffn_exp_probs_b,
  7832. n_expert, n_expert_used,
  7833. LLM_FFN_SILU, hparams.expert_weights_norm,
  7834. true, hparams.expert_weights_scale,
  7835. (llama_expert_gating_func_type) hparams.expert_gating_func,
  7836. il);
  7837. cb(moe_out, "ffn_moe_out", il);
  7838. // FFN shared expert
  7839. {
  7840. ggml_tensor * ffn_shexp = build_ffn(cur,
  7841. model.layers[il].ffn_up_shexp, NULL, NULL,
  7842. model.layers[il].ffn_gate_shexp, NULL, NULL,
  7843. model.layers[il].ffn_down_shexp, NULL, NULL,
  7844. NULL,
  7845. LLM_FFN_SILU, LLM_FFN_PAR, il);
  7846. cb(ffn_shexp, "ffn_shexp", il);
  7847. cur = ggml_add(ctx0, moe_out, ffn_shexp);
  7848. cb(cur, "ffn_out", il);
  7849. }
  7850. }
  7851. cur = ggml_add(ctx0, cur, ffn_inp);
  7852. cur = build_cvec(cur, il);
  7853. cb(cur, "l_out", il);
  7854. // input for next layer
  7855. inpL = cur;
  7856. }
  7857. cur = inpL;
  7858. cur = build_norm(cur,
  7859. model.output_norm, NULL,
  7860. LLM_NORM_RMS, -1);
  7861. cb(cur, "result_norm", -1);
  7862. res->t_embd = cur;
  7863. // lm_head
  7864. cur = ggml_mul_mat(ctx0, model.output, cur);
  7865. cb(cur, "result_output", -1);
  7866. res->t_logits = cur;
  7867. ggml_build_forward_expand(gf, cur);
  7868. }
  7869. };
  7870. struct llm_build_bitnet : public llm_graph_context {
  7871. llm_build_bitnet(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
  7872. const int64_t n_embd_head = hparams.n_embd_head_v;
  7873. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  7874. ggml_tensor * cur;
  7875. ggml_tensor * inpL;
  7876. inpL = build_inp_embd(model.tok_embd);
  7877. // inp_pos - contains the positions
  7878. ggml_tensor * inp_pos = build_inp_pos();
  7879. auto * inp_attn = build_attn_inp_kv_unified();
  7880. for (int il = 0; il < n_layer; ++il) {
  7881. ggml_tensor * inpSA = inpL;
  7882. cur = build_norm(inpL,
  7883. model.layers[il].attn_norm, NULL,
  7884. LLM_NORM_RMS, il);
  7885. cb(cur, "attn_norm", il);
  7886. // self-attention
  7887. {
  7888. // compute Q and K and RoPE them
  7889. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  7890. if (model.layers[il].wq_scale) {
  7891. Qcur = ggml_mul(ctx0, Qcur, model.layers[il].wq_scale);
  7892. }
  7893. cb(Qcur, "Qcur", il);
  7894. if (model.layers[il].bq) {
  7895. Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
  7896. cb(Qcur, "Qcur", il);
  7897. }
  7898. // B1.K
  7899. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  7900. if (model.layers[il].wk_scale) {
  7901. Kcur = ggml_mul(ctx0, Kcur, model.layers[il].wk_scale);
  7902. }
  7903. cb(Kcur, "Kcur", il);
  7904. if (model.layers[il].bk) {
  7905. Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
  7906. cb(Kcur, "Kcur", il);
  7907. }
  7908. // B1.V
  7909. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  7910. if (model.layers[il].wv_scale) {
  7911. Vcur = ggml_mul(ctx0, Vcur, model.layers[il].wv_scale);
  7912. }
  7913. cb(Vcur, "Vcur", il);
  7914. if (model.layers[il].bv) {
  7915. Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
  7916. cb(Vcur, "Vcur", il);
  7917. }
  7918. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  7919. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  7920. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  7921. Qcur = ggml_rope_ext(
  7922. ctx0, Qcur, inp_pos, nullptr,
  7923. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  7924. ext_factor, attn_factor, beta_fast, beta_slow
  7925. );
  7926. Kcur = ggml_rope_ext(
  7927. ctx0, Kcur, inp_pos, nullptr,
  7928. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  7929. ext_factor, attn_factor, beta_fast, beta_slow
  7930. );
  7931. cb(Qcur, "Qcur", il);
  7932. cb(Kcur, "Kcur", il);
  7933. cb(Vcur, "Vcur", il);
  7934. cur = build_attn(inp_attn, gf,
  7935. NULL, NULL,
  7936. Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  7937. cur = build_norm(cur,
  7938. model.layers[il].attn_sub_norm, NULL,
  7939. LLM_NORM_RMS, il);
  7940. cb(cur, "attn_sub_norm", il);
  7941. cur = build_lora_mm(model.layers[il].wo, cur);
  7942. if (model.layers[il].wo_scale) {
  7943. cur = ggml_mul(ctx0, cur, model.layers[il].wo_scale);
  7944. }
  7945. if (model.layers[il].bo) {
  7946. cur = ggml_add(ctx0, cur, model.layers[il].bo);
  7947. }
  7948. cb(cur, "attn_o_out", il);
  7949. }
  7950. if (il == n_layer - 1) {
  7951. // skip computing output for unused tokens
  7952. ggml_tensor * inp_out_ids = build_inp_out_ids();
  7953. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  7954. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  7955. }
  7956. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  7957. cb(ffn_inp, "ffn_inp", il);
  7958. // feed-forward forward
  7959. cur = build_norm(ffn_inp,
  7960. model.layers[il].ffn_norm, NULL,
  7961. LLM_NORM_RMS, il);
  7962. cb(cur, "ffn_norm", il);
  7963. cur = build_ffn(cur,
  7964. model.layers[il].ffn_up, NULL, model.layers[il].ffn_up_scale,
  7965. model.layers[il].ffn_gate, NULL, model.layers[il].ffn_gate_scale,
  7966. NULL, NULL, NULL,
  7967. NULL,
  7968. LLM_FFN_SILU, LLM_FFN_PAR, il);
  7969. cb(cur, "ffn_sub_out", il);
  7970. cur = build_norm(cur,
  7971. model.layers[il].ffn_sub_norm, NULL,
  7972. LLM_NORM_RMS, il);
  7973. cb(cur, "ffn_sub_norm", il);
  7974. cur = build_lora_mm(model.layers[il].ffn_down, cur);
  7975. if (model.layers[il].ffn_down_scale) {
  7976. cur = ggml_mul(ctx0, cur, model.layers[il].ffn_down_scale);
  7977. }
  7978. cb(cur, "ffn_down", il);
  7979. cur = ggml_add(ctx0, cur, ffn_inp);
  7980. cb(cur, "l_out", il);
  7981. // input for next layer
  7982. inpL = cur;
  7983. }
  7984. cur = inpL;
  7985. cur = build_norm(cur,
  7986. model.output_norm, NULL,
  7987. LLM_NORM_RMS, -1);
  7988. cb(cur, "result_norm", -1);
  7989. res->t_embd = cur;
  7990. // lm_head
  7991. // FIXME: do not use model.tok_embd directly, duplicate as model.output
  7992. cur = build_lora_mm(model.tok_embd, cur);
  7993. cb(cur, "result_output", -1);
  7994. res->t_logits = cur;
  7995. ggml_build_forward_expand(gf, cur);
  7996. }
  7997. };
  7998. struct llm_build_t5_enc : public llm_graph_context {
  7999. llm_build_t5_enc(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
  8000. const int64_t n_embd_head = hparams.n_embd_head_v;
  8001. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  8002. ggml_tensor * cur;
  8003. ggml_tensor * inpL;
  8004. inpL = build_inp_embd(model.tok_embd);
  8005. ggml_tensor * pos_bucket_enc = build_inp_pos_bucket_enc();
  8006. auto * inp_attn = build_attn_inp_no_cache();
  8007. for (int il = 0; il < n_layer; ++il) {
  8008. ggml_tensor * inpSA = inpL;
  8009. // norm
  8010. cur = build_norm(inpL,
  8011. model.layers[il].attn_norm_enc, NULL,
  8012. LLM_NORM_RMS, il);
  8013. cb(cur, "attn_norm", il);
  8014. // self-attention
  8015. {
  8016. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq_enc, cur);
  8017. cb(Qcur, "Qcur", il);
  8018. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk_enc, cur);
  8019. cb(Kcur, "Kcur", il);
  8020. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv_enc, cur);
  8021. cb(Vcur, "Vcur", il);
  8022. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  8023. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  8024. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  8025. ggml_tensor * attn_rel_b = model.layers[il].attn_rel_b_enc ? model.layers[il].attn_rel_b_enc : model.layers[0].attn_rel_b_enc;
  8026. ggml_tensor * kq_b = build_pos_bias(pos_bucket_enc, attn_rel_b);
  8027. cur = build_attn(inp_attn, gf,
  8028. model.layers[il].wo_enc, nullptr,
  8029. Qcur, Kcur, Vcur, kq_b, 1.0f, il);
  8030. cb(cur, "kqv_out", il);
  8031. }
  8032. if (il == n_layer - 1) {
  8033. // skip computing output for unused tokens
  8034. ggml_tensor * inp_out_ids = build_inp_out_ids();
  8035. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  8036. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  8037. }
  8038. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  8039. cb(ffn_inp, "ffn_inp", il);
  8040. // feed-forward network
  8041. {
  8042. cur = build_norm(ffn_inp,
  8043. model.layers[il].ffn_norm_enc, NULL,
  8044. LLM_NORM_RMS, il);
  8045. cb(cur, "ffn_norm", il);
  8046. // T5 uses relu, flan-T5 uses gelu-gated
  8047. cur = build_ffn(cur,
  8048. model.layers[il].ffn_up_enc, NULL, NULL,
  8049. model.layers[il].ffn_gate_enc, NULL, NULL,
  8050. model.layers[il].ffn_down_enc, NULL, NULL,
  8051. NULL,
  8052. model.layers[il].ffn_gate_enc ? LLM_FFN_GELU : LLM_FFN_RELU,
  8053. model.layers[il].ffn_gate_enc ? LLM_FFN_PAR : LLM_FFN_SEQ,
  8054. il);
  8055. cb(cur, "ffn_out", il);
  8056. }
  8057. cur = ggml_add(ctx0, cur, ffn_inp);
  8058. cb(cur, "ffn_out", il);
  8059. cur = build_cvec(cur, il);
  8060. cb(cur, "l_out", il);
  8061. // input for next layer
  8062. inpL = cur;
  8063. }
  8064. cur = inpL;
  8065. cb(cur, "result_embd", -1);
  8066. cur = build_norm(cur,
  8067. model.output_norm_enc, NULL,
  8068. LLM_NORM_RMS, -1);
  8069. cb(cur, "result_norm", -1);
  8070. res->t_embd = cur;
  8071. ggml_build_forward_expand(gf, cur);
  8072. }
  8073. };
  8074. struct llm_build_t5_dec : public llm_graph_context {
  8075. llm_build_t5_dec(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
  8076. const int64_t n_embd_head = hparams.n_embd_head_v;
  8077. //const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
  8078. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  8079. ggml_tensor * cur;
  8080. ggml_tensor * inpL;
  8081. inpL = build_inp_embd(model.tok_embd);
  8082. ggml_tensor * embd_enc = build_inp_cross_embd();
  8083. ggml_tensor * pos_bucket_dec = build_inp_pos_bucket_dec();
  8084. const int64_t n_outputs_enc = embd_enc->ne[1];
  8085. auto * inp_attn_self = build_attn_inp_kv_unified();
  8086. auto * inp_attn_cross = build_attn_inp_cross();
  8087. for (int il = 0; il < n_layer; ++il) {
  8088. ggml_tensor * inpSA = inpL;
  8089. // norm
  8090. cur = build_norm(inpL,
  8091. model.layers[il].attn_norm, NULL,
  8092. LLM_NORM_RMS, il);
  8093. cb(cur, "attn_norm", il);
  8094. // self-attention
  8095. {
  8096. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  8097. cb(Qcur, "Qcur", il);
  8098. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  8099. cb(Kcur, "Kcur", il);
  8100. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  8101. cb(Vcur, "Vcur", il);
  8102. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  8103. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  8104. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  8105. ggml_tensor * attn_rel_b = model.layers[il].attn_rel_b ? model.layers[il].attn_rel_b : model.layers[0].attn_rel_b;
  8106. ggml_tensor * kq_b = build_pos_bias(pos_bucket_dec, attn_rel_b);
  8107. cur = build_attn(inp_attn_self, gf,
  8108. model.layers[il].wo, model.layers[il].bo,
  8109. Qcur, Kcur, Vcur, kq_b, 1.0f, il);
  8110. cb(cur, "kqv_out", il);
  8111. }
  8112. cur = ggml_add(ctx0, cur, inpSA);
  8113. cb(cur, "cross_inp", il);
  8114. ggml_tensor * inpCA = cur;
  8115. // norm
  8116. cur = build_norm(cur,
  8117. model.layers[il].attn_norm_cross, NULL,
  8118. LLM_NORM_RMS, il);
  8119. cb(cur, "attn_norm_cross", il);
  8120. // cross-attention
  8121. {
  8122. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq_cross, cur);
  8123. cb(Qcur, "Qcur", il);
  8124. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk_cross, embd_enc);
  8125. cb(Kcur, "Kcur", il);
  8126. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv_cross, embd_enc);
  8127. cb(Vcur, "Vcur", il);
  8128. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  8129. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_outputs_enc);
  8130. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_outputs_enc);
  8131. cur = build_attn(inp_attn_cross, gf,
  8132. model.layers[il].wo_cross, nullptr,
  8133. Qcur, Kcur, Vcur, nullptr, 1.0f, il);
  8134. cb(cur, "kqv_out", il);
  8135. //ggml_tensor * q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
  8136. //ggml_tensor * k = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 0, 2, 1, 3));
  8137. //ggml_tensor * kq = ggml_mul_mat(ctx0, k, q);
  8138. //cb(kq, "kq", il);
  8139. //kq = ggml_soft_max_ext(ctx0, kq, KQ_mask_cross, 1.0f, hparams.f_max_alibi_bias);
  8140. //cb(kq, "kq_soft_max_ext", il);
  8141. //ggml_tensor * v = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd_gqa, n_outputs_enc)));
  8142. //cb(v, "v", il);
  8143. //ggml_tensor * kqv = ggml_mul_mat(ctx0, ggml_reshape_3d(ctx0, v, n_outputs_enc, n_embd_head, n_head_kv), kq);
  8144. //cb(kqv, "kqv", il);
  8145. //ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3);
  8146. //cb(kqv_merged, "kqv_merged", il);
  8147. //cur = ggml_cont_2d(ctx0, kqv_merged, n_embd_gqa, n_tokens);
  8148. //cb(cur, "kqv_merged_cont", il);
  8149. //ggml_build_forward_expand(gf, cur);
  8150. //cur = build_lora_mm(model.layers[il].wo_cross, cur);
  8151. //cb(cur, "kqv_out", il);
  8152. }
  8153. if (il == n_layer - 1) {
  8154. // skip computing output for unused tokens
  8155. ggml_tensor * inp_out_ids = build_inp_out_ids();
  8156. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  8157. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  8158. inpCA = ggml_get_rows(ctx0, inpCA, inp_out_ids);
  8159. }
  8160. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpCA);
  8161. cb(ffn_inp, "ffn_inp", il);
  8162. // feed-forward network
  8163. {
  8164. cur = build_norm(ffn_inp,
  8165. model.layers[il].ffn_norm, NULL,
  8166. LLM_NORM_RMS, il);
  8167. cb(cur, "ffn_norm", il);
  8168. // T5 uses relu, flan-T5 uses gelu-gated
  8169. cur = build_ffn(cur,
  8170. model.layers[il].ffn_up, NULL, NULL,
  8171. model.layers[il].ffn_gate, NULL, NULL,
  8172. model.layers[il].ffn_down, NULL, NULL,
  8173. NULL,
  8174. model.layers[il].ffn_gate_enc ? LLM_FFN_GELU : LLM_FFN_RELU,
  8175. model.layers[il].ffn_gate_enc ? LLM_FFN_PAR : LLM_FFN_SEQ,
  8176. il);
  8177. cb(cur, "ffn_out", il);
  8178. }
  8179. cur = ggml_add(ctx0, cur, ffn_inp);
  8180. cb(cur, "ffn_out", il);
  8181. cur = build_cvec(cur, il);
  8182. cb(cur, "l_out", il);
  8183. // input for next layer
  8184. inpL = cur;
  8185. }
  8186. cur = inpL;
  8187. cb(cur, "result_embd", -1);
  8188. cur = build_norm(cur,
  8189. model.output_norm, NULL,
  8190. LLM_NORM_RMS, -1);
  8191. cb(cur, "result_norm", -1);
  8192. res->t_embd = cur;
  8193. // lm_head
  8194. cur = build_lora_mm(model.output, cur);
  8195. cb(cur, "result_output", -1);
  8196. res->t_logits = cur;
  8197. ggml_build_forward_expand(gf, cur);
  8198. }
  8199. };
  8200. struct llm_build_jais : public llm_graph_context {
  8201. llm_build_jais(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
  8202. const int64_t n_embd_head = hparams.n_embd_head_v;
  8203. const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
  8204. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  8205. ggml_tensor * cur;
  8206. ggml_tensor * inpL;
  8207. inpL = build_inp_embd(model.tok_embd);
  8208. auto * inp_attn = build_attn_inp_kv_unified();
  8209. for (int il = 0; il < n_layer; ++il) {
  8210. cur = build_norm(inpL,
  8211. model.layers[il].attn_norm,
  8212. model.layers[il].attn_norm_b,
  8213. LLM_NORM, il);
  8214. cb(cur, "attn_norm", il);
  8215. // self-attention
  8216. {
  8217. cur = build_lora_mm(model.layers[il].wqkv, cur);
  8218. cb(cur, "wqkv", il);
  8219. cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
  8220. cb(cur, "bqkv", il);
  8221. ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*cur->nb[0]*(n_embd)));
  8222. ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*cur->nb[0]*(n_embd)));
  8223. ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*cur->nb[0]*(n_embd + n_embd_gqa)));
  8224. cb(Qcur, "Qcur", il);
  8225. cb(Kcur, "Kcur", il);
  8226. cb(Vcur, "Vcur", il);
  8227. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  8228. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  8229. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  8230. cur = build_attn(inp_attn, gf,
  8231. model.layers[il].wo, model.layers[il].bo,
  8232. Qcur, Kcur, Vcur, nullptr, 1.0f/float(n_embd_head), il);
  8233. }
  8234. if (il == n_layer - 1) {
  8235. // skip computing output for unused tokens
  8236. ggml_tensor * inp_out_ids = build_inp_out_ids();
  8237. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  8238. inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
  8239. }
  8240. // add the input
  8241. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
  8242. cb(ffn_inp, "ffn_inp", il);
  8243. // FF
  8244. {
  8245. cur = build_norm(ffn_inp,
  8246. model.layers[il].ffn_norm,
  8247. model.layers[il].ffn_norm_b,
  8248. LLM_NORM, il);
  8249. cb(cur, "ffn_norm", il);
  8250. cur = build_ffn(cur,
  8251. model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
  8252. model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
  8253. model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
  8254. NULL,
  8255. LLM_FFN_SILU, LLM_FFN_PAR, il);
  8256. cb(cur, "ffn_out", il);
  8257. }
  8258. inpL = ggml_add(ctx0, cur, ffn_inp);
  8259. cb(inpL, "l_out", il);
  8260. }
  8261. cur = build_norm(inpL,
  8262. model.output_norm,
  8263. model.output_norm_b,
  8264. LLM_NORM, -1);
  8265. cb(cur, "result_norm", -1);
  8266. res->t_embd = cur;
  8267. cur = build_lora_mm(model.output, cur);
  8268. cb(cur, "result_output", -1);
  8269. res->t_logits = cur;
  8270. ggml_build_forward_expand(gf, cur);
  8271. }
  8272. };
  8273. struct llm_build_chatglm : public llm_graph_context {
  8274. llm_build_chatglm(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
  8275. const int64_t n_embd_head = hparams.n_embd_head_v;
  8276. const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
  8277. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  8278. ggml_tensor * cur;
  8279. ggml_tensor * inpL;
  8280. inpL = build_inp_embd(model.tok_embd);
  8281. // inp_pos - contains the positions
  8282. ggml_tensor * inp_pos = build_inp_pos();
  8283. auto * inp_attn = build_attn_inp_kv_unified();
  8284. for (int il = 0; il < n_layer; ++il) {
  8285. ggml_tensor * inpSA = inpL;
  8286. cur = build_norm(inpL,
  8287. model.layers[il].attn_norm,
  8288. NULL,
  8289. LLM_NORM_RMS, il);
  8290. cb(cur, "attn_norm", il);
  8291. // self-attention
  8292. {
  8293. ggml_tensor * Qcur = nullptr;
  8294. ggml_tensor * Kcur = nullptr;
  8295. ggml_tensor * Vcur = nullptr;
  8296. if (model.layers[il].wqkv == nullptr) {
  8297. Qcur = build_lora_mm(model.layers[il].wq, cur);
  8298. if (model.layers[il].bq) {
  8299. Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
  8300. }
  8301. Kcur = build_lora_mm(model.layers[il].wk, cur);
  8302. if (model.layers[il].bk) {
  8303. Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
  8304. }
  8305. Vcur = build_lora_mm(model.layers[il].wv, cur);
  8306. if (model.layers[il].bv) {
  8307. Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
  8308. }
  8309. } else {
  8310. cur = build_lora_mm(model.layers[il].wqkv, cur);
  8311. cb(cur, "wqkv", il);
  8312. if (model.layers[il].bqkv) {
  8313. cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
  8314. cb(cur, "bqkv", il);
  8315. }
  8316. Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
  8317. Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
  8318. Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
  8319. }
  8320. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  8321. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  8322. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  8323. //printf("freq_base: %f freq_scale: %f ext_factor: %f attn_factor: %f\n", freq_base, freq_scale, ext_factor, attn_factor);
  8324. Qcur = ggml_rope_ext(
  8325. ctx0, Qcur, inp_pos, nullptr,
  8326. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  8327. ext_factor, attn_factor, beta_fast, beta_slow
  8328. );
  8329. Kcur = ggml_rope_ext(
  8330. ctx0, Kcur, inp_pos, nullptr,
  8331. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  8332. ext_factor, attn_factor, beta_fast, beta_slow
  8333. );
  8334. cb(Qcur, "Qcur", il);
  8335. cb(Kcur, "Kcur", il);
  8336. cb(Vcur, "Vcur", il);
  8337. cur = build_attn(inp_attn, gf,
  8338. model.layers[il].wo, NULL,
  8339. Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  8340. }
  8341. if (il == n_layer - 1) {
  8342. // skip computing output for unused tokens
  8343. ggml_tensor * inp_out_ids = build_inp_out_ids();
  8344. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  8345. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  8346. }
  8347. // Add the input
  8348. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  8349. cb(ffn_inp, "ffn_inp", il);
  8350. // FF
  8351. {
  8352. cur = build_norm(ffn_inp,
  8353. model.layers[il].ffn_norm,
  8354. NULL,
  8355. LLM_NORM_RMS, il);
  8356. cb(cur, "ffn_norm", il);
  8357. cur = build_ffn(cur,
  8358. model.layers[il].ffn_up, NULL, NULL,
  8359. NULL, NULL, NULL,
  8360. model.layers[il].ffn_down, NULL, NULL,
  8361. NULL,
  8362. LLM_FFN_SWIGLU, LLM_FFN_SEQ, il);
  8363. cb(cur, "ffn_out", il);
  8364. }
  8365. inpL = ggml_add(ctx0, cur, ffn_inp);
  8366. cb(inpL, "l_out", il);
  8367. }
  8368. cur = build_norm(inpL,
  8369. model.output_norm,
  8370. NULL,
  8371. LLM_NORM_RMS, -1);
  8372. cb(cur, "result_norm", -1);
  8373. res->t_embd = cur;
  8374. cur = build_lora_mm(model.output, cur);
  8375. cb(cur, "result_output", -1);
  8376. res->t_logits = cur;
  8377. ggml_build_forward_expand(gf, cur);
  8378. }
  8379. };
  8380. struct llm_build_nemotron : public llm_graph_context {
  8381. llm_build_nemotron(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
  8382. const int64_t n_embd_head = hparams.n_embd_head_v;
  8383. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  8384. //GGML_ASSERT(n_embd_head == hparams.n_rot);
  8385. ggml_tensor * cur;
  8386. ggml_tensor * inpL;
  8387. inpL = build_inp_embd(model.tok_embd);
  8388. // inp_pos - contains the positions
  8389. ggml_tensor * inp_pos = build_inp_pos();
  8390. auto * inp_attn = build_attn_inp_kv_unified();
  8391. for (int il = 0; il < n_layer; ++il) {
  8392. ggml_tensor * inpSA = inpL;
  8393. // norm
  8394. cur = build_norm(inpL,
  8395. model.layers[il].attn_norm,
  8396. model.layers[il].attn_norm_b,
  8397. LLM_NORM, il);
  8398. cb(cur, "attn_norm", il);
  8399. // self-attention
  8400. {
  8401. // compute Q and K and RoPE them
  8402. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  8403. cb(Qcur, "Qcur", il);
  8404. if (model.layers[il].bq) {
  8405. Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
  8406. cb(Qcur, "Qcur", il);
  8407. }
  8408. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  8409. cb(Kcur, "Kcur", il);
  8410. if (model.layers[il].bk) {
  8411. Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
  8412. cb(Kcur, "Kcur", il);
  8413. }
  8414. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  8415. cb(Vcur, "Vcur", il);
  8416. if (model.layers[il].bv) {
  8417. Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
  8418. cb(Vcur, "Vcur", il);
  8419. }
  8420. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  8421. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  8422. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  8423. Qcur = ggml_rope_ext(
  8424. ctx0, Qcur, inp_pos, nullptr,
  8425. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  8426. ext_factor, attn_factor, beta_fast, beta_slow
  8427. );
  8428. Kcur = ggml_rope_ext(
  8429. ctx0, Kcur, inp_pos, nullptr,
  8430. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  8431. ext_factor, attn_factor, beta_fast, beta_slow
  8432. );
  8433. cb(Qcur, "Qcur", il);
  8434. cb(Kcur, "Kcur", il);
  8435. cb(Vcur, "Vcur", il);
  8436. cur = build_attn(inp_attn, gf,
  8437. model.layers[il].wo, model.layers[il].bo,
  8438. Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  8439. }
  8440. if (il == n_layer - 1) {
  8441. // skip computing output for unused tokens
  8442. ggml_tensor * inp_out_ids = build_inp_out_ids();
  8443. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  8444. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  8445. }
  8446. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  8447. cb(ffn_inp, "ffn_inp", il);
  8448. // feed-forward network
  8449. cur = build_norm(ffn_inp,
  8450. model.layers[il].ffn_norm,
  8451. model.layers[il].ffn_norm_b,
  8452. LLM_NORM, il);
  8453. cb(cur, "ffn_norm", il);
  8454. cur = build_ffn(cur,
  8455. model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
  8456. NULL, NULL, NULL,
  8457. model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
  8458. NULL,
  8459. LLM_FFN_RELU_SQR, LLM_FFN_SEQ, il);
  8460. cur = ggml_add(ctx0, cur, ffn_inp);
  8461. cb(cur, "ffn_out", il);
  8462. cur = build_cvec(cur, il);
  8463. cb(cur, "l_out", il);
  8464. // input for next layer
  8465. inpL = cur;
  8466. }
  8467. cur = inpL;
  8468. cur = build_norm(cur,
  8469. model.output_norm, model.output_norm_b,
  8470. LLM_NORM, -1);
  8471. cb(cur, "result_norm", -1);
  8472. res->t_embd = cur;
  8473. // lm_head
  8474. cur = build_lora_mm(model.output, cur);
  8475. cb(cur, "result_output", -1);
  8476. res->t_logits = cur;
  8477. ggml_build_forward_expand(gf, cur);
  8478. }
  8479. };
  8480. struct llm_build_exaone : public llm_graph_context {
  8481. llm_build_exaone(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
  8482. const int64_t n_embd_head = hparams.n_embd_head_v;
  8483. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  8484. GGML_ASSERT(n_embd_head == hparams.n_rot);
  8485. ggml_tensor * cur;
  8486. ggml_tensor * inpL;
  8487. inpL = build_inp_embd(model.tok_embd);
  8488. // inp_pos - contains the positions
  8489. ggml_tensor * inp_pos = build_inp_pos();
  8490. auto * inp_attn = build_attn_inp_kv_unified();
  8491. for (int il = 0; il < n_layer; ++il) {
  8492. ggml_tensor * inpSA = inpL;
  8493. // norm
  8494. cur = build_norm(inpL,
  8495. model.layers[il].attn_norm, NULL,
  8496. LLM_NORM_RMS, il);
  8497. cb(cur, "attn_norm", il);
  8498. // self-attention
  8499. {
  8500. // rope freq factors for llama3; may return nullptr for llama2 and other models
  8501. ggml_tensor * rope_factors = static_cast<const llama_kv_cache_unified *>(memory)->cbs.get_rope_factors(n_ctx_per_seq, il);
  8502. // compute Q and K and RoPE them
  8503. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  8504. cb(Qcur, "Qcur", il);
  8505. if (model.layers[il].bq) {
  8506. Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
  8507. cb(Qcur, "Qcur", il);
  8508. }
  8509. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  8510. cb(Kcur, "Kcur", il);
  8511. if (model.layers[il].bk) {
  8512. Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
  8513. cb(Kcur, "Kcur", il);
  8514. }
  8515. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  8516. cb(Vcur, "Vcur", il);
  8517. if (model.layers[il].bv) {
  8518. Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
  8519. cb(Vcur, "Vcur", il);
  8520. }
  8521. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  8522. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  8523. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  8524. Qcur = ggml_rope_ext(
  8525. ctx0, Qcur, inp_pos, rope_factors,
  8526. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  8527. ext_factor, attn_factor, beta_fast, beta_slow
  8528. );
  8529. Kcur = ggml_rope_ext(
  8530. ctx0, Kcur, inp_pos, rope_factors,
  8531. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  8532. ext_factor, attn_factor, beta_fast, beta_slow
  8533. );
  8534. cb(Qcur, "Qcur", il);
  8535. cb(Kcur, "Kcur", il);
  8536. cb(Vcur, "Vcur", il);
  8537. cur = build_attn(inp_attn, gf,
  8538. model.layers[il].wo, model.layers[il].bo,
  8539. Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  8540. }
  8541. if (il == n_layer - 1) {
  8542. // skip computing output for unused tokens
  8543. ggml_tensor * inp_out_ids = build_inp_out_ids();
  8544. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  8545. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  8546. }
  8547. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  8548. cb(ffn_inp, "ffn_inp", il);
  8549. // feed-forward network
  8550. cur = build_norm(ffn_inp,
  8551. model.layers[il].ffn_norm, NULL,
  8552. LLM_NORM_RMS, il);
  8553. cb(cur, "ffn_norm", il);
  8554. cur = build_ffn(cur,
  8555. model.layers[il].ffn_up, NULL, NULL,
  8556. model.layers[il].ffn_gate, NULL, NULL,
  8557. model.layers[il].ffn_down, NULL, NULL,
  8558. NULL,
  8559. LLM_FFN_SILU, LLM_FFN_PAR, il);
  8560. cb(cur, "ffn_out", il);
  8561. cur = ggml_add(ctx0, cur, ffn_inp);
  8562. cb(cur, "ffn_out", il);
  8563. cur = build_cvec(cur, il);
  8564. cb(cur, "l_out", il);
  8565. // input for next layer
  8566. inpL = cur;
  8567. }
  8568. cur = inpL;
  8569. cur = build_norm(cur,
  8570. model.output_norm, NULL,
  8571. LLM_NORM_RMS, -1);
  8572. cb(cur, "result_norm", -1);
  8573. res->t_embd = cur;
  8574. // lm_head
  8575. cur = build_lora_mm(model.output, cur);
  8576. cb(cur, "result_output", -1);
  8577. res->t_logits = cur;
  8578. ggml_build_forward_expand(gf, cur);
  8579. }
  8580. };
  8581. struct llm_build_rwkv6_base : public llm_graph_context {
  8582. const llama_model & model;
  8583. llm_build_rwkv6_base(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params), model(model) {
  8584. }
  8585. ggml_tensor * build_rwkv6_channel_mix(
  8586. const llama_layer * layer,
  8587. ggml_tensor * cur,
  8588. ggml_tensor * x_prev,
  8589. llm_arch arch) const {
  8590. ggml_tensor * sx = ggml_sub(ctx0, x_prev, cur);
  8591. switch (arch) {
  8592. case LLM_ARCH_RWKV6:
  8593. {
  8594. ggml_tensor * xk = ggml_add(ctx0, ggml_mul(ctx0, sx, layer->channel_mix_lerp_k), cur);
  8595. ggml_tensor * xr = ggml_add(ctx0, ggml_mul(ctx0, sx, layer->channel_mix_lerp_r), cur);
  8596. ggml_tensor * r = ggml_sigmoid(ctx0, build_lora_mm(layer->channel_mix_receptance, xr));
  8597. ggml_tensor * k = ggml_sqr(
  8598. ctx0,
  8599. ggml_relu(
  8600. ctx0,
  8601. build_lora_mm(layer->channel_mix_key, xk)
  8602. )
  8603. );
  8604. cur = ggml_mul(ctx0, r, build_lora_mm(layer->channel_mix_value, k));
  8605. } break;
  8606. default:
  8607. GGML_ABORT("fatal error");
  8608. }
  8609. return cur;
  8610. }
  8611. ggml_tensor * build_rwkv6_time_mix(
  8612. ggml_cgraph * gf,
  8613. ggml_tensor * cur,
  8614. ggml_tensor * x_prev,
  8615. ggml_tensor * state_copy,
  8616. ggml_tensor * state_mask,
  8617. const llama_ubatch & ubatch,
  8618. int il) const {
  8619. const llama_kv_cache_unified * kv_self = static_cast<const llama_kv_cache_unified *>(memory);
  8620. const auto n_tokens = ubatch.n_tokens;
  8621. const auto n_seqs = ubatch.n_seqs;
  8622. const auto n_seq_tokens = ubatch.n_seq_tokens;
  8623. const auto n_embd = hparams.n_embd;
  8624. const auto head_size = hparams.wkv_head_size;
  8625. const auto n_head = n_embd / head_size;
  8626. const auto n_head_kv = hparams.n_head_kv(il);
  8627. const auto kv_head = kv_self->head;
  8628. const auto & layer = model.layers[il];
  8629. bool is_qrwkv = layer.time_mix_first == nullptr;
  8630. ggml_tensor * sx = ggml_sub(ctx0, x_prev, cur);
  8631. sx = ggml_reshape_2d(ctx0, sx, n_embd, n_tokens);
  8632. cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens);
  8633. ggml_tensor * xxx = ggml_add(ctx0, ggml_mul(ctx0, sx, layer.time_mix_lerp_x), cur);
  8634. xxx = ggml_reshape_4d(
  8635. ctx0,
  8636. ggml_tanh(
  8637. ctx0,
  8638. ggml_mul_mat(ctx0, layer.time_mix_w1, xxx)
  8639. ),
  8640. layer.time_mix_w1->ne[1] / 5, 1, 5, n_tokens
  8641. );
  8642. xxx = ggml_cont(ctx0, ggml_permute(ctx0, xxx, 0, 1, 3, 2));
  8643. xxx = ggml_mul_mat(
  8644. ctx0,
  8645. ggml_reshape_4d(
  8646. ctx0,
  8647. layer.time_mix_w2,
  8648. layer.time_mix_w2->ne[0], layer.time_mix_w2->ne[1], 1, 5
  8649. ),
  8650. xxx
  8651. );
  8652. ggml_tensor *xw, *xk, *xv, *xr, *xg;
  8653. if (layer.time_mix_lerp_fused) {
  8654. // fusing these weights makes some performance improvement
  8655. sx = ggml_reshape_3d(ctx0, sx, n_embd, 1, n_tokens);
  8656. cur = ggml_reshape_3d(ctx0, cur, n_embd, 1, n_tokens);
  8657. xxx = ggml_add(ctx0, ggml_mul(ctx0, ggml_add(ctx0, xxx, layer.time_mix_lerp_fused), sx), cur);
  8658. xw = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], 0);
  8659. xk = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * sizeof(float));
  8660. xv = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 2 * sizeof(float));
  8661. xr = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 3 * sizeof(float));
  8662. xg = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 4 * sizeof(float));
  8663. } else {
  8664. // for backward compatibility
  8665. xw = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], 0);
  8666. xk = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * sizeof(float));
  8667. xv = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 2 * sizeof(float));
  8668. xr = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 3 * sizeof(float));
  8669. xg = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 4 * sizeof(float));
  8670. xw = ggml_add(ctx0, ggml_mul(ctx0, ggml_add(ctx0, xw, layer.time_mix_lerp_w), sx), cur);
  8671. xk = ggml_add(ctx0, ggml_mul(ctx0, ggml_add(ctx0, xk, layer.time_mix_lerp_k), sx), cur);
  8672. xv = ggml_add(ctx0, ggml_mul(ctx0, ggml_add(ctx0, xv, layer.time_mix_lerp_v), sx), cur);
  8673. xr = ggml_add(ctx0, ggml_mul(ctx0, ggml_add(ctx0, xr, layer.time_mix_lerp_r), sx), cur);
  8674. xg = ggml_add(ctx0, ggml_mul(ctx0, ggml_add(ctx0, xg, layer.time_mix_lerp_g), sx), cur);
  8675. }
  8676. ggml_tensor * r = build_lora_mm(layer.time_mix_receptance, xr);
  8677. ggml_tensor * k = build_lora_mm(layer.time_mix_key, xk);
  8678. ggml_tensor * v = build_lora_mm(layer.time_mix_value, xv);
  8679. if (layer.time_mix_receptance_b) {
  8680. r = ggml_add(ctx0, r, layer.time_mix_receptance_b);
  8681. }
  8682. if (layer.time_mix_key_b) {
  8683. k = ggml_add(ctx0, k, layer.time_mix_key_b);
  8684. }
  8685. if (layer.time_mix_value_b) {
  8686. v = ggml_add(ctx0, v, layer.time_mix_value_b);
  8687. }
  8688. ggml_tensor * g = build_lora_mm(layer.time_mix_gate, xg);
  8689. if (is_qrwkv) {
  8690. g = ggml_sigmoid(ctx0, g);
  8691. } else {
  8692. g = ggml_silu(ctx0, g);
  8693. }
  8694. if (n_head_kv != 0 && n_head_kv != n_head) {
  8695. GGML_ASSERT(n_head % n_head_kv == 0);
  8696. k = ggml_reshape_4d(ctx0, k, head_size, 1, n_head_kv, n_tokens);
  8697. v = ggml_reshape_4d(ctx0, v, head_size, 1, n_head_kv, n_tokens);
  8698. ggml_tensor * tmp = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, head_size, n_head / n_head_kv, n_head_kv, n_tokens);
  8699. k = ggml_repeat(ctx0, k, tmp);
  8700. v = ggml_repeat(ctx0, v, tmp);
  8701. }
  8702. k = ggml_reshape_3d(ctx0, k, head_size, n_head, n_tokens);
  8703. v = ggml_reshape_3d(ctx0, v, head_size, n_head, n_tokens);
  8704. r = ggml_reshape_3d(ctx0, r, head_size, n_head, n_tokens);
  8705. ggml_tensor * w = ggml_mul_mat(
  8706. ctx0,
  8707. layer.time_mix_decay_w2,
  8708. ggml_tanh(
  8709. ctx0,
  8710. ggml_mul_mat(ctx0, layer.time_mix_decay_w1, xw)
  8711. )
  8712. );
  8713. w = ggml_add(ctx0, w, layer.time_mix_decay);
  8714. w = ggml_exp(ctx0, ggml_neg(ctx0, ggml_exp(ctx0, w)));
  8715. w = ggml_reshape_3d(ctx0, w, head_size, n_head, n_tokens);
  8716. if (is_qrwkv) {
  8717. // k = k * (1 - w)
  8718. k = ggml_sub(ctx0, k, ggml_mul(ctx0, k, w));
  8719. }
  8720. ggml_tensor * wkv_state = build_copy_mask_state(
  8721. gf, kv_self->v_l[il], state_copy, state_mask,
  8722. hparams.n_embd_v_s(), n_seqs);
  8723. ggml_tensor * wkv_output;
  8724. if (is_qrwkv) {
  8725. wkv_output = ggml_gated_linear_attn(ctx0, k, v, r, w, wkv_state, pow(head_size, -0.5f));
  8726. } else {
  8727. wkv_output = ggml_rwkv_wkv6(ctx0, k, v, r, layer.time_mix_first, w, wkv_state);
  8728. }
  8729. cur = ggml_view_1d(ctx0, wkv_output, n_embd * n_tokens, 0);
  8730. wkv_state = ggml_view_1d(ctx0, wkv_output, n_embd * head_size * n_seqs, n_embd * n_tokens * sizeof(float));
  8731. ggml_build_forward_expand(
  8732. gf,
  8733. ggml_cpy(
  8734. ctx0,
  8735. wkv_state,
  8736. ggml_view_1d(
  8737. ctx0,
  8738. kv_self->v_l[il],
  8739. hparams.n_embd_v_s() * n_seqs,
  8740. hparams.n_embd_v_s() * kv_head * ggml_element_size(kv_self->v_l[il])
  8741. )
  8742. )
  8743. );
  8744. if (!is_qrwkv) {
  8745. // group norm with head_count groups
  8746. cur = ggml_reshape_3d(ctx0, cur, n_embd / n_head, n_head, n_tokens);
  8747. cur = ggml_norm(ctx0, cur, 64e-5f);
  8748. // Convert back to regular vectors.
  8749. cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens);
  8750. cur = ggml_add(ctx0, ggml_mul(ctx0, cur, layer.time_mix_ln), layer.time_mix_ln_b);
  8751. } else {
  8752. cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens);
  8753. }
  8754. cur = ggml_mul(ctx0, cur, g);
  8755. cur = build_lora_mm(layer.time_mix_output, cur);
  8756. return ggml_reshape_3d(ctx0, cur, n_embd, n_seq_tokens, n_seqs);
  8757. }
  8758. };
  8759. struct llm_build_rwkv6 : public llm_build_rwkv6_base {
  8760. llm_build_rwkv6(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_build_rwkv6_base(model, params) {
  8761. GGML_ASSERT(hparams.token_shift_count == 2);
  8762. ggml_tensor * cur;
  8763. ggml_tensor * inpL;
  8764. inpL = build_inp_embd(model.tok_embd);
  8765. inpL = build_norm(inpL, model.tok_norm, model.tok_norm_b, LLM_NORM, -1);
  8766. ggml_tensor * state_copy = build_inp_s_copy();
  8767. ggml_tensor * state_mask = build_inp_s_mask();
  8768. const auto n_embd = hparams.n_embd;
  8769. const auto n_seq_tokens = ubatch.n_seq_tokens;
  8770. const auto n_seqs = ubatch.n_seqs;
  8771. for (int il = 0; il < n_layer; ++il) {
  8772. const llama_layer * layer = &model.layers[il];
  8773. inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs);
  8774. ggml_tensor * token_shift = build_rwkv_token_shift_load(
  8775. gf, state_copy, state_mask, ubatch, il
  8776. );
  8777. ggml_tensor * att_shift = ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], 0);
  8778. ggml_tensor * ffn_shift = ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], n_embd * ggml_element_size(token_shift));
  8779. ggml_tensor * att_norm = build_norm(inpL, layer->attn_norm, layer->attn_norm_b, LLM_NORM, il);
  8780. cb(att_norm, "attn_norm", il);
  8781. ggml_tensor * x_prev = ggml_concat(
  8782. ctx0,
  8783. att_shift,
  8784. ggml_view_3d(ctx0, att_norm, n_embd, n_seq_tokens - 1, n_seqs, att_norm->nb[1], att_norm->nb[2], 0),
  8785. 1
  8786. );
  8787. cur = build_rwkv6_time_mix(gf, att_norm, x_prev, state_copy, state_mask, ubatch, il);
  8788. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
  8789. cb(ffn_inp, "ffn_inp", il);
  8790. ggml_tensor * ffn_norm = build_norm(ffn_inp, layer->attn_norm_2, layer->attn_norm_2_b, LLM_NORM, il);
  8791. cb(ffn_norm, "ffn_norm", il);
  8792. x_prev = ggml_concat(
  8793. ctx0,
  8794. ffn_shift,
  8795. ggml_view_3d(ctx0, ffn_norm, n_embd, n_seq_tokens - 1, n_seqs, ffn_norm->nb[1], ffn_norm->nb[2], 0),
  8796. 1
  8797. );
  8798. token_shift = ggml_concat(ctx0,
  8799. ggml_view_3d(ctx0, att_norm, n_embd, 1, n_seqs, att_norm->nb[1], att_norm->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(att_norm)),
  8800. ggml_view_3d(ctx0, ffn_norm, n_embd, 1, n_seqs, ffn_norm->nb[1], ffn_norm->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(ffn_norm)),
  8801. 1
  8802. );
  8803. ggml_build_forward_expand(gf, build_rwkv_token_shift_store(token_shift, ubatch, il));
  8804. if (il == n_layer - 1) {
  8805. // skip computing output for unused tokens
  8806. struct ggml_tensor * inp_out_ids = build_inp_out_ids();
  8807. ffn_inp = ggml_get_rows(ctx0, ggml_reshape_2d(ctx0, ffn_inp, n_embd, n_tokens), inp_out_ids);
  8808. ffn_norm = ggml_get_rows(ctx0, ggml_reshape_2d(ctx0, ffn_norm, n_embd, n_tokens), inp_out_ids);
  8809. x_prev = ggml_get_rows(ctx0, ggml_reshape_2d(ctx0, x_prev, n_embd, n_tokens), inp_out_ids);
  8810. cur = ggml_get_rows(ctx0, ggml_reshape_2d(ctx0, cur, n_embd, n_tokens), inp_out_ids);
  8811. }
  8812. cur = build_rwkv6_channel_mix(layer, ffn_norm, x_prev, LLM_ARCH_RWKV6);
  8813. cur = ggml_add(ctx0, cur, ffn_inp);
  8814. if (hparams.rescale_every_n_layers != 0 && (il + 1) % hparams.rescale_every_n_layers == 0) {
  8815. cur = ggml_scale(ctx0, cur, 0.5F);
  8816. }
  8817. cur = build_cvec(cur, il);
  8818. cb(cur, "l_out", il);
  8819. // input for next layer
  8820. inpL = cur;
  8821. }
  8822. cur = inpL;
  8823. cur = build_norm(cur, model.output_norm, model.output_norm_b, LLM_NORM, -1);
  8824. cb(cur, "result_norm", -1);
  8825. res->t_embd = cur;
  8826. cur = build_lora_mm(model.output, cur);
  8827. cb(cur, "result_output", -1);
  8828. res->t_logits = cur;
  8829. ggml_build_forward_expand(gf, cur);
  8830. }
  8831. };
  8832. // ref: https://huggingface.co/recursal/QRWKV6-32B-Instruct-Preview-v0.1/blob/main/modeling_rwkv6qwen2.py
  8833. struct llm_build_rwkv6qwen2 : public llm_build_rwkv6_base {
  8834. llm_build_rwkv6qwen2(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_build_rwkv6_base(model, params) {
  8835. GGML_ASSERT(n_embd == hparams.n_embd_k_s());
  8836. ggml_tensor * cur;
  8837. ggml_tensor * inpL;
  8838. inpL = build_inp_embd(model.tok_embd);
  8839. ggml_tensor * state_copy = build_inp_s_copy();
  8840. ggml_tensor * state_mask = build_inp_s_mask();
  8841. const auto n_embd = hparams.n_embd;
  8842. const auto n_seq_tokens = ubatch.n_seq_tokens;
  8843. const auto n_seqs = ubatch.n_seqs;
  8844. for (int il = 0; il < n_layer; ++il) {
  8845. const llama_layer * layer = &model.layers[il];
  8846. inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs);
  8847. ggml_tensor * token_shift = build_rwkv_token_shift_load(
  8848. gf, state_copy, state_mask, ubatch, il
  8849. );
  8850. ggml_tensor * att_norm = build_norm(inpL, layer->attn_norm, layer->attn_norm_b, LLM_NORM_RMS, il);
  8851. cb(att_norm, "attn_norm", il);
  8852. ggml_tensor * x_prev = ggml_concat(
  8853. ctx0,
  8854. token_shift,
  8855. ggml_view_3d(ctx0, att_norm, n_embd, n_seq_tokens - 1, n_seqs, att_norm->nb[1], att_norm->nb[2], 0),
  8856. 1
  8857. );
  8858. cur = build_rwkv6_time_mix(gf, att_norm, x_prev, state_copy, state_mask, ubatch, il);
  8859. token_shift = ggml_view_3d(ctx0, att_norm, n_embd, 1, n_seqs, att_norm->nb[1], att_norm->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(att_norm));
  8860. ggml_build_forward_expand(gf, build_rwkv_token_shift_store(token_shift, ubatch, il));
  8861. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
  8862. cb(ffn_inp, "ffn_inp", il);
  8863. if (il == n_layer - 1) {
  8864. // skip computing output for unused tokens
  8865. struct ggml_tensor * inp_out_ids = build_inp_out_ids();
  8866. cur = ggml_get_rows(ctx0, ggml_reshape_2d(ctx0, cur, n_embd, n_tokens), inp_out_ids);
  8867. ffn_inp = ggml_get_rows(ctx0, ggml_reshape_2d(ctx0, ffn_inp, n_embd, n_tokens), inp_out_ids);
  8868. }
  8869. // feed-forward network
  8870. cur = build_norm(ffn_inp,
  8871. model.layers[il].ffn_norm, NULL,
  8872. LLM_NORM_RMS, il);
  8873. cb(cur, "ffn_norm", il);
  8874. cur = build_ffn(cur,
  8875. model.layers[il].ffn_up, NULL, NULL,
  8876. model.layers[il].ffn_gate, NULL, NULL,
  8877. model.layers[il].ffn_down, NULL, NULL,
  8878. NULL,
  8879. LLM_FFN_SILU, LLM_FFN_PAR, il);
  8880. cb(cur, "ffn_out", il);
  8881. cur = ggml_add(ctx0, cur, ffn_inp);
  8882. cur = build_cvec(cur, il);
  8883. cb(cur, "l_out", il);
  8884. // input for next layer
  8885. inpL = cur;
  8886. }
  8887. cur = inpL;
  8888. cur = build_norm(cur, model.output_norm, model.output_norm_b, LLM_NORM_RMS, -1);
  8889. cb(cur, "result_norm", -1);
  8890. res->t_embd = cur;
  8891. cur = build_lora_mm(model.output, cur);
  8892. cb(cur, "result_output", -1);
  8893. res->t_logits = cur;
  8894. ggml_build_forward_expand(gf, cur);
  8895. }
  8896. };
  8897. struct llm_build_rwkv7_base : public llm_graph_context {
  8898. const llama_model & model;
  8899. llm_build_rwkv7_base(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params), model(model) {
  8900. }
  8901. ggml_tensor * build_rwkv7_channel_mix(
  8902. const llama_layer * layer,
  8903. ggml_tensor * cur,
  8904. ggml_tensor * x_prev,
  8905. llm_arch arch) const {
  8906. ggml_tensor * sx = ggml_sub(ctx0, x_prev, cur);
  8907. switch (arch) {
  8908. case LLM_ARCH_RWKV7:
  8909. {
  8910. ggml_tensor * xk = ggml_add(ctx0, ggml_mul(ctx0, sx, layer->channel_mix_lerp_k), cur);
  8911. ggml_tensor * k = ggml_sqr(
  8912. ctx0,
  8913. ggml_relu(
  8914. ctx0,
  8915. build_lora_mm(layer->channel_mix_key, xk)
  8916. )
  8917. );
  8918. cur = build_lora_mm(layer->channel_mix_value, k);
  8919. } break;
  8920. default:
  8921. GGML_ABORT("fatal error");
  8922. }
  8923. return cur;
  8924. }
  8925. ggml_tensor * build_rwkv7_time_mix(
  8926. ggml_cgraph * gf,
  8927. ggml_tensor * cur,
  8928. ggml_tensor * x_prev,
  8929. ggml_tensor * state_copy,
  8930. ggml_tensor * state_mask,
  8931. ggml_tensor *& first_layer_value,
  8932. const llama_ubatch & ubatch,
  8933. int il) const {
  8934. const llama_kv_cache_unified * kv_self = static_cast<const llama_kv_cache_unified *>(memory);
  8935. const auto n_tokens = ubatch.n_tokens;
  8936. const auto n_seqs = ubatch.n_seqs;
  8937. const auto n_embd = hparams.n_embd;
  8938. const auto head_size = hparams.wkv_head_size;
  8939. const auto head_count = n_embd / head_size;
  8940. const auto n_seq_tokens = ubatch.n_seq_tokens;
  8941. const auto kv_head = kv_self->head;
  8942. const auto & layer = model.layers[il];
  8943. bool has_gating = layer.time_mix_g1 && layer.time_mix_g2;
  8944. ggml_tensor * sx = ggml_sub(ctx0, x_prev, cur);
  8945. ggml_tensor * dummy = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_embd, n_seq_tokens, n_seqs, has_gating ? 6 : 5);
  8946. sx = ggml_repeat(ctx0, sx, dummy);
  8947. ggml_tensor * xxx = ggml_add(ctx0, ggml_mul(ctx0, sx, layer.time_mix_lerp_fused), cur);
  8948. ggml_tensor * xr = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], 0);
  8949. ggml_tensor * xw = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * sizeof(float));
  8950. ggml_tensor * xk = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 2 * sizeof(float));
  8951. ggml_tensor * xv = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 3 * sizeof(float));
  8952. ggml_tensor * xa = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 4 * sizeof(float));
  8953. ggml_tensor * xg = has_gating ? ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 5 * sizeof(float)) : nullptr;
  8954. ggml_tensor * r = build_lora_mm(layer.time_mix_receptance, xr);
  8955. ggml_tensor * w = ggml_add(
  8956. ctx0,
  8957. ggml_mul_mat(ctx0, layer.time_mix_w2, ggml_tanh(ctx0, ggml_mul_mat(ctx0, layer.time_mix_w1, xw))),
  8958. layer.time_mix_w0
  8959. );
  8960. w = ggml_exp(ctx0, ggml_scale(ctx0, ggml_sigmoid(ctx0, w), -0.606531));
  8961. ggml_tensor * k = build_lora_mm(layer.time_mix_key, xk);
  8962. ggml_tensor * v = build_lora_mm(layer.time_mix_value, xv);
  8963. if (first_layer_value == nullptr) {
  8964. first_layer_value = v;
  8965. } else {
  8966. // Add the first layer value as a residual connection.
  8967. v = ggml_add(ctx0, v,
  8968. ggml_mul(ctx0,
  8969. ggml_sub(ctx0, first_layer_value, v),
  8970. ggml_sigmoid(ctx0, ggml_add(ctx0,
  8971. ggml_mul_mat(ctx0, layer.time_mix_v2, ggml_mul_mat(ctx0, layer.time_mix_v1, xv)),
  8972. layer.time_mix_v0
  8973. )
  8974. )
  8975. )
  8976. );
  8977. }
  8978. ggml_tensor * g = nullptr;
  8979. if (layer.time_mix_g1 && layer.time_mix_g2) {
  8980. g = ggml_mul_mat(ctx0, layer.time_mix_g2, ggml_sigmoid(ctx0, ggml_mul_mat(ctx0, layer.time_mix_g1, xg)));
  8981. }
  8982. ggml_tensor * a = ggml_sigmoid(ctx0,
  8983. ggml_add(
  8984. ctx0,
  8985. ggml_mul_mat(ctx0, layer.time_mix_a2, ggml_mul_mat(ctx0, layer.time_mix_a1, xa)),
  8986. layer.time_mix_a0
  8987. )
  8988. );
  8989. ggml_tensor * kk = ggml_reshape_3d(ctx0, ggml_mul(ctx0, k, layer.time_mix_k_k), head_size, head_count, n_tokens);
  8990. kk = ggml_l2_norm(ctx0, kk, 1e-12);
  8991. ggml_tensor * ka = ggml_mul(ctx0, k, layer.time_mix_k_a);
  8992. k = ggml_add(ctx0, k, ggml_sub(ctx0, ggml_mul(ctx0, a, ka), ka));
  8993. r = ggml_reshape_3d(ctx0, r, head_size, head_count, n_tokens);
  8994. w = ggml_reshape_3d(ctx0, w, head_size, head_count, n_tokens);
  8995. k = ggml_reshape_3d(ctx0, k, head_size, head_count, n_tokens);
  8996. v = ggml_reshape_3d(ctx0, v, head_size, head_count, n_tokens);
  8997. a = ggml_reshape_3d(ctx0, a, head_size, head_count, n_tokens);
  8998. ggml_tensor * wkv_state = build_copy_mask_state(
  8999. gf, kv_self->v_l[il], state_copy, state_mask,
  9000. hparams.n_embd_v_s(), n_seqs);
  9001. ggml_tensor * wkv_output = ggml_rwkv_wkv7(ctx0, r, w, k, v, ggml_neg(ctx0, kk), ggml_mul(ctx0, kk, a), wkv_state);
  9002. cur = ggml_view_1d(ctx0, wkv_output, n_embd * n_tokens, 0);
  9003. wkv_state = ggml_view_1d(ctx0, wkv_output, n_embd * head_size * n_seqs, n_embd * n_tokens * sizeof(float));
  9004. ggml_build_forward_expand(
  9005. gf,
  9006. ggml_cpy(
  9007. ctx0,
  9008. wkv_state,
  9009. ggml_view_1d(
  9010. ctx0,
  9011. kv_self->v_l[il],
  9012. hparams.n_embd_v_s() * n_seqs,
  9013. hparams.n_embd_v_s() * kv_head * ggml_element_size(kv_self->v_l[il])
  9014. )
  9015. )
  9016. );
  9017. if (layer.time_mix_ln && layer.time_mix_ln_b) {
  9018. // group norm with head_count groups
  9019. cur = ggml_reshape_3d(ctx0, cur, n_embd / head_count, head_count, n_tokens);
  9020. cur = ggml_norm(ctx0, cur, 64e-5f);
  9021. // Convert back to regular vectors.
  9022. cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens);
  9023. cur = ggml_add(ctx0, ggml_mul(ctx0, cur, layer.time_mix_ln), layer.time_mix_ln_b);
  9024. } else {
  9025. cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens);
  9026. }
  9027. ggml_tensor * rk = ggml_sum_rows(ctx0,
  9028. ggml_mul(ctx0, ggml_mul(ctx0, k, r), ggml_reshape_2d(ctx0, layer.time_mix_r_k, head_size, head_count)));
  9029. cur = ggml_add(ctx0, cur, ggml_reshape_2d(ctx0, ggml_mul(ctx0, v, rk), n_embd, n_tokens));
  9030. if (has_gating) {
  9031. cur = ggml_mul(ctx0, cur, g);
  9032. }
  9033. cur = build_lora_mm(layer.time_mix_output, cur);
  9034. return ggml_reshape_3d(ctx0, cur, n_embd, n_seq_tokens, n_seqs);
  9035. }
  9036. };
  9037. struct llm_build_rwkv7 : public llm_build_rwkv7_base {
  9038. llm_build_rwkv7(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_build_rwkv7_base(model, params) {
  9039. GGML_ASSERT(hparams.token_shift_count == 2);
  9040. ggml_tensor * cur;
  9041. ggml_tensor * inpL;
  9042. ggml_tensor * v_first = nullptr;
  9043. inpL = build_inp_embd(model.tok_embd);
  9044. inpL = build_norm(inpL, model.tok_norm, model.tok_norm_b, LLM_NORM, -1);
  9045. ggml_tensor * state_copy = build_inp_s_copy();
  9046. ggml_tensor * state_mask = build_inp_s_mask();
  9047. const auto n_embd = hparams.n_embd;
  9048. const auto n_seq_tokens = ubatch.n_seq_tokens;
  9049. const auto n_seqs = ubatch.n_seqs;
  9050. for (int il = 0; il < n_layer; ++il) {
  9051. const llama_layer * layer = &model.layers[il];
  9052. inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs);
  9053. ggml_tensor * token_shift = build_rwkv_token_shift_load(
  9054. gf, state_copy, state_mask, ubatch, il
  9055. );
  9056. ggml_tensor * att_shift = ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], 0);
  9057. ggml_tensor * ffn_shift = ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], n_embd * ggml_element_size(token_shift));
  9058. ggml_tensor * att_norm = build_norm(inpL, layer->attn_norm, layer->attn_norm_b, LLM_NORM, il);
  9059. cb(att_norm, "attn_norm", il);
  9060. ggml_tensor * x_prev = ggml_concat(
  9061. ctx0,
  9062. att_shift,
  9063. ggml_view_3d(ctx0, att_norm, n_embd, n_seq_tokens - 1, n_seqs, att_norm->nb[1], att_norm->nb[2], 0),
  9064. 1
  9065. );
  9066. cur = build_rwkv7_time_mix(gf, att_norm, x_prev, state_copy, state_mask, v_first, ubatch, il);
  9067. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
  9068. cb(ffn_inp, "ffn_inp", il);
  9069. ggml_tensor * ffn_norm = build_norm(ffn_inp, layer->attn_norm_2, layer->attn_norm_2_b, LLM_NORM, il);
  9070. cb(ffn_norm, "ffn_norm", il);
  9071. x_prev = ggml_concat(
  9072. ctx0,
  9073. ffn_shift,
  9074. ggml_view_3d(ctx0, ffn_norm, n_embd, n_seq_tokens - 1, n_seqs, ffn_norm->nb[1], ffn_norm->nb[2], 0),
  9075. 1
  9076. );
  9077. token_shift = ggml_concat(ctx0,
  9078. ggml_view_3d(ctx0, att_norm, n_embd, 1, n_seqs, att_norm->nb[1], att_norm->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(att_norm)),
  9079. ggml_view_3d(ctx0, ffn_norm, n_embd, 1, n_seqs, ffn_norm->nb[1], ffn_norm->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(ffn_norm)),
  9080. 1
  9081. );
  9082. ggml_build_forward_expand(gf, build_rwkv_token_shift_store(token_shift, ubatch, il));
  9083. if (il == n_layer - 1) {
  9084. // skip computing output for unused tokens
  9085. struct ggml_tensor * inp_out_ids = build_inp_out_ids();
  9086. ffn_inp = ggml_get_rows(ctx0, ggml_reshape_2d(ctx0, ffn_inp, n_embd, n_tokens), inp_out_ids);
  9087. ffn_norm = ggml_get_rows(ctx0, ggml_reshape_2d(ctx0, ffn_norm, n_embd, n_tokens), inp_out_ids);
  9088. x_prev = ggml_get_rows(ctx0, ggml_reshape_2d(ctx0, x_prev, n_embd, n_tokens), inp_out_ids);
  9089. }
  9090. cur = build_rwkv7_channel_mix(layer, ffn_norm, x_prev, LLM_ARCH_RWKV7);
  9091. cur = ggml_add(ctx0, cur, ffn_inp);
  9092. cur = build_cvec(cur, il);
  9093. cb(cur, "l_out", il);
  9094. // input for next layer
  9095. inpL = cur;
  9096. }
  9097. cur = inpL;
  9098. cur = build_norm(cur, model.output_norm, model.output_norm_b, LLM_NORM, -1);
  9099. cb(cur, "result_norm", -1);
  9100. res->t_embd = cur;
  9101. cur = build_lora_mm(model.output, cur);
  9102. cb(cur, "result_output", -1);
  9103. res->t_logits = cur;
  9104. ggml_build_forward_expand(gf, cur);
  9105. }
  9106. };
  9107. struct llm_build_arwkv7 : public llm_build_rwkv7_base {
  9108. llm_build_arwkv7(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_build_rwkv7_base(model, params) {
  9109. GGML_ASSERT(n_embd == hparams.n_embd_k_s());
  9110. ggml_tensor * cur;
  9111. ggml_tensor * inpL;
  9112. ggml_tensor * v_first = nullptr;
  9113. inpL = build_inp_embd(model.tok_embd);
  9114. ggml_tensor * state_copy = build_inp_s_copy();
  9115. ggml_tensor * state_mask = build_inp_s_mask();
  9116. const auto n_embd = hparams.n_embd;
  9117. const auto n_seq_tokens = ubatch.n_seq_tokens;
  9118. const auto n_seqs = ubatch.n_seqs;
  9119. for (int il = 0; il < n_layer; ++il) {
  9120. const llama_layer * layer = &model.layers[il];
  9121. inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs);
  9122. ggml_tensor * token_shift = build_rwkv_token_shift_load(
  9123. gf, state_copy, state_mask, ubatch, il
  9124. );
  9125. ggml_tensor * att_norm = build_norm(inpL, layer->attn_norm, layer->attn_norm_b, LLM_NORM_RMS, il);
  9126. cb(att_norm, "attn_norm", il);
  9127. ggml_tensor * x_prev = ggml_concat(
  9128. ctx0,
  9129. token_shift,
  9130. ggml_view_3d(ctx0, att_norm, n_embd, n_seq_tokens - 1, n_seqs, att_norm->nb[1], att_norm->nb[2], 0),
  9131. 1
  9132. );
  9133. cur = build_rwkv7_time_mix(gf, att_norm, x_prev, state_copy, state_mask, v_first, ubatch, il);
  9134. token_shift = ggml_view_3d(ctx0, att_norm, n_embd, 1, n_seqs, att_norm->nb[1], att_norm->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(att_norm));
  9135. ggml_build_forward_expand(gf, build_rwkv_token_shift_store(token_shift, ubatch, il));
  9136. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
  9137. cb(ffn_inp, "ffn_inp", il);
  9138. if (il == n_layer - 1) {
  9139. // skip computing output for unused tokens
  9140. struct ggml_tensor * inp_out_ids = build_inp_out_ids();
  9141. cur = ggml_get_rows(ctx0, ggml_reshape_2d(ctx0, cur, n_embd, n_tokens), inp_out_ids);
  9142. ffn_inp = ggml_get_rows(ctx0, ggml_reshape_2d(ctx0, ffn_inp, n_embd, n_tokens), inp_out_ids);
  9143. }
  9144. // feed-forward network
  9145. cur = build_norm(ffn_inp,
  9146. model.layers[il].ffn_norm, NULL,
  9147. LLM_NORM_RMS, il);
  9148. cb(cur, "ffn_norm", il);
  9149. cur = build_ffn(cur,
  9150. model.layers[il].ffn_up, NULL, NULL,
  9151. model.layers[il].ffn_gate, NULL, NULL,
  9152. model.layers[il].ffn_down, NULL, NULL,
  9153. NULL,
  9154. LLM_FFN_SILU, LLM_FFN_PAR, il);
  9155. cb(cur, "ffn_out", il);
  9156. cur = ggml_add(ctx0, cur, ffn_inp);
  9157. cur = build_cvec(cur, il);
  9158. cb(cur, "l_out", il);
  9159. // input for next layer
  9160. inpL = cur;
  9161. }
  9162. cur = inpL;
  9163. cur = build_norm(cur, model.output_norm, model.output_norm_b, LLM_NORM_RMS, -1);
  9164. cb(cur, "result_norm", -1);
  9165. res->t_embd = cur;
  9166. cur = build_lora_mm(model.output, cur);
  9167. cb(cur, "result_output", -1);
  9168. res->t_logits = cur;
  9169. ggml_build_forward_expand(gf, cur);
  9170. }
  9171. };
  9172. // ref: https://github.com/facebookresearch/chameleon
  9173. // based on the original build_llama() function, changes:
  9174. // * qk-norm
  9175. // * swin-norm
  9176. // * removed bias
  9177. // * removed MoE
  9178. struct llm_build_chameleon : public llm_graph_context {
  9179. llm_build_chameleon(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
  9180. const int64_t n_embd_head = hparams.n_embd_head_v;
  9181. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  9182. GGML_ASSERT(n_embd_head == hparams.n_rot);
  9183. ggml_tensor * cur;
  9184. ggml_tensor * inpL;
  9185. inpL = build_inp_embd(model.tok_embd);
  9186. // inp_pos - contains the positions
  9187. ggml_tensor * inp_pos = build_inp_pos();
  9188. auto * inp_attn = build_attn_inp_kv_unified();
  9189. for (int il = 0; il < n_layer; ++il) {
  9190. ggml_tensor * inpSA = inpL;
  9191. // norm
  9192. if (hparams.swin_norm) {
  9193. cur = inpL;
  9194. } else {
  9195. cur = build_norm(inpL,
  9196. model.layers[il].attn_norm, NULL,
  9197. LLM_NORM_RMS, il);
  9198. cb(cur, "attn_norm", il);
  9199. }
  9200. // self-attention
  9201. {
  9202. // compute Q and K and RoPE them
  9203. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  9204. cb(Qcur, "Qcur", il);
  9205. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  9206. cb(Kcur, "Kcur", il);
  9207. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  9208. cb(Vcur, "Vcur", il);
  9209. if (model.layers[il].attn_q_norm) {
  9210. Qcur = ggml_view_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens,
  9211. ggml_element_size(Qcur) * n_embd_head,
  9212. ggml_element_size(Qcur) * n_embd_head * n_head,
  9213. 0);
  9214. cb(Qcur, "Qcur", il);
  9215. Qcur = build_norm(Qcur,
  9216. model.layers[il].attn_q_norm,
  9217. model.layers[il].attn_q_norm_b,
  9218. LLM_NORM, il);
  9219. cb(Qcur, "Qcur", il);
  9220. }
  9221. if (model.layers[il].attn_k_norm) {
  9222. Kcur = ggml_view_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens,
  9223. ggml_element_size(Kcur) * n_embd_head,
  9224. ggml_element_size(Kcur) * n_embd_head * n_head_kv,
  9225. 0);
  9226. cb(Kcur, "Kcur", il);
  9227. Kcur = build_norm(Kcur,
  9228. model.layers[il].attn_k_norm,
  9229. model.layers[il].attn_k_norm_b,
  9230. LLM_NORM, il);
  9231. cb(Kcur, "Kcur", il);
  9232. }
  9233. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  9234. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  9235. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  9236. Qcur = ggml_rope_ext(
  9237. ctx0, Qcur, inp_pos, nullptr,
  9238. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  9239. ext_factor, attn_factor, beta_fast, beta_slow
  9240. );
  9241. Kcur = ggml_rope_ext(
  9242. ctx0, Kcur, inp_pos, nullptr,
  9243. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  9244. ext_factor, attn_factor, beta_fast, beta_slow
  9245. );
  9246. cb(Qcur, "Qcur", il);
  9247. cb(Kcur, "Kcur", il);
  9248. cb(Vcur, "Vcur", il);
  9249. cur = build_attn(inp_attn, gf,
  9250. model.layers[il].wo, nullptr,
  9251. Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  9252. if (hparams.swin_norm) {
  9253. cur = build_norm(cur,
  9254. model.layers[il].attn_norm, NULL,
  9255. LLM_NORM_RMS, il);
  9256. }
  9257. }
  9258. if (il == n_layer - 1) {
  9259. // skip computing output for unused tokens
  9260. ggml_tensor * inp_out_ids = build_inp_out_ids();
  9261. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  9262. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  9263. }
  9264. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  9265. cb(ffn_inp, "ffn_inp", il);
  9266. // feed-forward network
  9267. if (!hparams.swin_norm) {
  9268. cur = build_norm(ffn_inp,
  9269. model.layers[il].ffn_norm, NULL,
  9270. LLM_NORM_RMS, il);
  9271. cb(cur, "ffn_norm", il);
  9272. }
  9273. cur = build_ffn(cur,
  9274. model.layers[il].ffn_up, NULL, NULL,
  9275. model.layers[il].ffn_gate, NULL, NULL,
  9276. model.layers[il].ffn_down, NULL, NULL,
  9277. NULL,
  9278. LLM_FFN_SILU, LLM_FFN_PAR, il);
  9279. cb(cur, "ffn_out", il);
  9280. if (hparams.swin_norm) {
  9281. cur = build_norm(cur,
  9282. model.layers[il].ffn_norm, NULL,
  9283. LLM_NORM_RMS, il);
  9284. cb(cur, "ffn_norm", il);
  9285. }
  9286. cur = ggml_add(ctx0, cur, ffn_inp);
  9287. cb(cur, "ffn_out", il);
  9288. cur = build_cvec(cur, il);
  9289. cb(cur, "l_out", il);
  9290. // input for next layer
  9291. inpL = cur;
  9292. }
  9293. cur = inpL;
  9294. cur = build_norm(cur,
  9295. model.output_norm, NULL,
  9296. LLM_NORM_RMS, -1);
  9297. cb(cur, "result_norm", -1);
  9298. res->t_embd = cur;
  9299. // lm_head
  9300. cur = build_lora_mm(model.output, cur);
  9301. cb(cur, "result_output_with_img_logits", -1);
  9302. // TODO: this suppresses the output of image tokens, which is required to enable text-only outputs.
  9303. // Needs to be removed once image outputs are supported.
  9304. int img_token_end_idx = 8196;
  9305. int img_token_start_idx = 4;
  9306. int num_img_tokens = img_token_end_idx - img_token_start_idx;
  9307. // creates 1d tensor of size num_img_tokens and values -FLT_MAX,
  9308. // which ensures that text token values are always at least larger than image token values
  9309. ggml_tensor * img_logits = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, num_img_tokens);
  9310. img_logits = ggml_clamp(ctx0, img_logits, -FLT_MAX, -FLT_MAX);
  9311. cb(img_logits, "img_logits", -1);
  9312. cur = ggml_set_1d(ctx0, cur, img_logits, ggml_element_size(cur) * img_token_start_idx);
  9313. cb(cur, "result_output", -1);
  9314. res->t_logits = cur;
  9315. ggml_build_forward_expand(gf, cur);
  9316. }
  9317. };
  9318. struct llm_build_wavtokenizer_dec : public llm_graph_context {
  9319. llm_build_wavtokenizer_dec(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
  9320. ggml_tensor * cur;
  9321. ggml_tensor * inpL;
  9322. inpL = build_inp_embd(model.tok_embd);
  9323. cur = ggml_cont(ctx0, ggml_transpose(ctx0, inpL));
  9324. cur = ggml_conv_1d_ph(ctx0, model.conv1d, cur, 1, 1);
  9325. cur = ggml_add(ctx0, cur, model.conv1d_b);
  9326. // posnet
  9327. for (uint32_t il = 0; il < hparams.posnet.n_layer; ++il) {
  9328. const auto & layer = model.layers[il].posnet;
  9329. inpL = cur;
  9330. switch (il) {
  9331. case 0:
  9332. case 1:
  9333. case 3:
  9334. case 4:
  9335. {
  9336. cur = build_norm(cur,
  9337. layer.norm1,
  9338. layer.norm1_b,
  9339. LLM_NORM_GROUP, 0);
  9340. cur = ggml_mul(ctx0, ggml_sigmoid(ctx0, cur), cur);
  9341. cur = ggml_conv_1d_ph(ctx0, layer.conv1, cur, 1, 1);
  9342. cur = ggml_add(ctx0, cur, layer.conv1_b);
  9343. cur = build_norm(cur,
  9344. layer.norm2,
  9345. layer.norm2_b,
  9346. LLM_NORM_GROUP, 0);
  9347. cur = ggml_mul(ctx0, ggml_sigmoid(ctx0, cur), cur);
  9348. cur = ggml_conv_1d_ph(ctx0, layer.conv2, cur, 1, 1);
  9349. cur = ggml_add(ctx0, cur, layer.conv2_b);
  9350. cur = ggml_add(ctx0, cur, inpL);
  9351. } break;
  9352. case 2:
  9353. {
  9354. cur = build_norm(cur,
  9355. layer.attn_norm,
  9356. layer.attn_norm_b,
  9357. LLM_NORM_GROUP, 0);
  9358. ggml_tensor * q;
  9359. ggml_tensor * k;
  9360. ggml_tensor * v;
  9361. q = ggml_conv_1d_ph(ctx0, layer.attn_q, cur, 1, 1);
  9362. k = ggml_conv_1d_ph(ctx0, layer.attn_k, cur, 1, 1);
  9363. v = ggml_conv_1d_ph(ctx0, layer.attn_v, cur, 1, 1);
  9364. q = ggml_add(ctx0, q, layer.attn_q_b);
  9365. k = ggml_add(ctx0, k, layer.attn_k_b);
  9366. v = ggml_add(ctx0, v, layer.attn_v_b);
  9367. q = ggml_cont(ctx0, ggml_transpose(ctx0, q));
  9368. k = ggml_cont(ctx0, ggml_transpose(ctx0, k));
  9369. ggml_tensor * kq = ggml_mul_mat(ctx0, k, q);
  9370. kq = ggml_soft_max_ext(ctx0, kq, nullptr, 1.0f/sqrtf(float(hparams.posnet.n_embd)), 0.0f);
  9371. cur = ggml_mul_mat(ctx0, kq, v);
  9372. cur = ggml_conv_1d_ph(ctx0, layer.attn_o, cur, 1, 1);
  9373. cur = ggml_add(ctx0, cur, layer.attn_o_b);
  9374. cur = ggml_add(ctx0, cur, inpL);
  9375. } break;
  9376. case 5:
  9377. {
  9378. cur = build_norm(cur,
  9379. layer.norm,
  9380. layer.norm_b,
  9381. LLM_NORM_GROUP, 0);
  9382. } break;
  9383. default: GGML_ABORT("unknown posnet layer");
  9384. };
  9385. }
  9386. cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur));
  9387. cur = build_norm(cur,
  9388. model.tok_norm,
  9389. model.tok_norm_b,
  9390. LLM_NORM, -1);
  9391. cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur));
  9392. inpL = cur;
  9393. // convnext
  9394. for (uint32_t il = 0; il < hparams.convnext.n_layer; ++il) {
  9395. const auto & layer = model.layers[il].convnext;
  9396. cur = inpL;
  9397. cur = ggml_conv_1d_dw_ph(ctx0, layer.dw, cur, 1, 1);
  9398. cur = ggml_add(ctx0, cur, layer.dw_b);
  9399. cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur));
  9400. cur = build_norm(cur,
  9401. layer.norm,
  9402. layer.norm_b,
  9403. LLM_NORM, -1);
  9404. cur = build_ffn(cur,
  9405. layer.pw1, layer.pw1_b, NULL,
  9406. NULL, NULL, NULL,
  9407. layer.pw2, layer.pw2_b, NULL,
  9408. NULL,
  9409. LLM_FFN_GELU, LLM_FFN_SEQ, il);
  9410. cur = ggml_mul(ctx0, cur, layer.gamma);
  9411. cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur));
  9412. inpL = ggml_add(ctx0, cur, inpL);
  9413. }
  9414. cur = inpL;
  9415. cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur));
  9416. cur = build_norm(cur,
  9417. model.output_norm,
  9418. model.output_norm_b,
  9419. LLM_NORM, -1);
  9420. // lm_head
  9421. cur = build_lora_mm(model.output, cur);
  9422. cur = ggml_add(ctx0, cur, model.output_b);
  9423. cb(cur, "result_embd", -1);
  9424. res->t_embd = cur;
  9425. ggml_build_forward_expand(gf, cur);
  9426. }
  9427. };
  9428. struct llm_build_plm : public llm_graph_context {
  9429. llm_build_plm(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
  9430. const float kq_scale = 1.0f/sqrtf(float(hparams.n_embd_head_k));
  9431. const uint32_t n_embd_head_qk_rope = hparams.n_rot;
  9432. const uint32_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
  9433. const uint32_t kv_lora_rank = hparams.n_lora_kv;
  9434. ggml_tensor * cur;
  9435. ggml_tensor * inpL;
  9436. // {n_embd, n_tokens}
  9437. inpL = build_inp_embd(model.tok_embd);
  9438. // inp_pos - contains the positions
  9439. ggml_tensor * inp_pos = build_inp_pos();
  9440. auto * inp_attn = build_attn_inp_kv_unified();
  9441. for (int il = 0; il < n_layer; ++il) {
  9442. ggml_tensor * inpSA = inpL;
  9443. // norm
  9444. cur = build_norm(inpL,
  9445. model.layers[il].attn_norm, NULL,
  9446. LLM_NORM_RMS, il);
  9447. cb(cur, "attn_norm", il);
  9448. // self_attention
  9449. {
  9450. ggml_tensor * q = NULL;
  9451. q = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
  9452. cb(q, "q", il);
  9453. // split into {n_head * n_embd_head_qk_nope, n_tokens}
  9454. ggml_tensor * q_nope = ggml_view_3d(ctx0, q, n_embd_head_qk_nope, n_head, n_tokens,
  9455. ggml_row_size(q->type, hparams.n_embd_head_k),
  9456. ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
  9457. 0);
  9458. cb(q_nope, "q_nope", il);
  9459. // and {n_head * n_embd_head_qk_rope, n_tokens}
  9460. ggml_tensor * q_pe = ggml_view_3d(ctx0, q, n_embd_head_qk_rope, n_head, n_tokens,
  9461. ggml_row_size(q->type, hparams.n_embd_head_k),
  9462. ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
  9463. ggml_row_size(q->type, n_embd_head_qk_nope));
  9464. cb(q_pe, "q_pe", il);
  9465. // {n_embd, kv_lora_rank + n_embd_head_qk_rope} * {n_embd, n_tokens} -> {kv_lora_rank + n_embd_head_qk_rope, n_tokens}
  9466. ggml_tensor * kv_pe_compresseed = ggml_mul_mat(ctx0, model.layers[il].wkv_a_mqa, cur);
  9467. cb(kv_pe_compresseed, "kv_pe_compresseed", il);
  9468. // split into {kv_lora_rank, n_tokens}
  9469. ggml_tensor * kv_compressed = ggml_view_2d(ctx0, kv_pe_compresseed, kv_lora_rank, n_tokens,
  9470. kv_pe_compresseed->nb[1],
  9471. 0);
  9472. cb(kv_compressed, "kv_compressed", il);
  9473. // and {n_embd_head_qk_rope, n_tokens}
  9474. ggml_tensor * k_pe = ggml_view_3d(ctx0, kv_pe_compresseed, n_embd_head_qk_rope, 1, n_tokens,
  9475. kv_pe_compresseed->nb[1],
  9476. kv_pe_compresseed->nb[1],
  9477. ggml_row_size(kv_pe_compresseed->type, kv_lora_rank));
  9478. cb(k_pe, "k_pe", il);
  9479. kv_compressed = build_norm(kv_compressed,
  9480. model.layers[il].attn_kv_a_norm, NULL,
  9481. LLM_NORM_RMS, il);
  9482. cb(kv_compressed, "kv_compressed", il);
  9483. // {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)} * {kv_lora_rank, n_tokens} -> {n_head * (n_embd_head_qk_nope + n_embd_head_v), n_tokens}
  9484. ggml_tensor * kv = ggml_mul_mat(ctx0, model.layers[il].wkv_b, kv_compressed);
  9485. cb(kv, "kv", il);
  9486. // split into {n_head * n_embd_head_qk_nope, n_tokens}
  9487. ggml_tensor * k_nope = ggml_view_3d(ctx0, kv, n_embd_head_qk_nope, n_head, n_tokens,
  9488. ggml_row_size(kv->type, n_embd_head_qk_nope + hparams.n_embd_head_v),
  9489. ggml_row_size(kv->type, n_head * (n_embd_head_qk_nope + hparams.n_embd_head_v)),
  9490. 0);
  9491. cb(k_nope, "k_nope", il);
  9492. // and {n_head * n_embd_head_v, n_tokens}
  9493. ggml_tensor * v_states = ggml_view_3d(ctx0, kv, hparams.n_embd_head_v, n_head, n_tokens,
  9494. ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)),
  9495. ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)*n_head),
  9496. ggml_row_size(kv->type, (n_embd_head_qk_nope)));
  9497. cb(v_states, "v_states", il);
  9498. v_states = ggml_cont(ctx0, v_states);
  9499. cb(v_states, "v_states", il);
  9500. v_states = ggml_view_2d(ctx0, v_states, hparams.n_embd_head_v * n_head, n_tokens,
  9501. ggml_row_size(kv->type, hparams.n_embd_head_v * n_head),
  9502. 0);
  9503. cb(v_states, "v_states", il);
  9504. q_pe = ggml_rope_ext(
  9505. ctx0, q_pe, inp_pos, nullptr,
  9506. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  9507. ext_factor, attn_factor, beta_fast, beta_slow
  9508. );
  9509. cb(q_pe, "q_pe", il);
  9510. // shared RoPE key
  9511. k_pe = ggml_rope_ext(
  9512. ctx0, k_pe, inp_pos, nullptr,
  9513. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  9514. ext_factor, attn_factor, beta_fast, beta_slow
  9515. );
  9516. cb(k_pe, "k_pe", il);
  9517. ggml_tensor * q_states = ggml_concat(ctx0, q_nope, q_pe, 0);
  9518. cb(q_states, "q_states", il);
  9519. ggml_tensor * k_states = ggml_concat(ctx0, k_nope, ggml_repeat(ctx0, k_pe, q_pe), 0);
  9520. cb(k_states, "k_states", il);
  9521. cur = build_attn(inp_attn, gf,
  9522. model.layers[il].wo, NULL,
  9523. q_states, k_states, v_states, nullptr, kq_scale, il);
  9524. }
  9525. if (il == n_layer - 1) {
  9526. // skip computing output for unused tokens
  9527. ggml_tensor * inp_out_ids = build_inp_out_ids();
  9528. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  9529. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  9530. }
  9531. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  9532. cb(ffn_inp, "ffn_inp", il);
  9533. cur = build_norm(ffn_inp,
  9534. model.layers[il].ffn_norm, NULL,
  9535. LLM_NORM_RMS, il);
  9536. cb(cur, "ffn_norm", il);
  9537. cur = build_ffn(cur,
  9538. model.layers[il].ffn_up, NULL, NULL,
  9539. NULL, NULL, NULL,
  9540. model.layers[il].ffn_down, NULL, NULL,
  9541. NULL,
  9542. LLM_FFN_RELU_SQR, LLM_FFN_SEQ, il);
  9543. cb(cur, "ffn_out", il);
  9544. cur = ggml_add(ctx0, cur, ffn_inp);
  9545. cur = build_cvec(cur, il);
  9546. cb(cur, "l_out", il);
  9547. // input for next layer
  9548. inpL = cur;
  9549. }
  9550. cur = inpL;
  9551. cur = build_norm(cur,
  9552. model.output_norm, NULL,
  9553. LLM_NORM_RMS, -1);
  9554. cb(cur, "result_norm", -1);
  9555. res->t_embd = cur;
  9556. cur = build_lora_mm(model.output, cur);
  9557. cb(cur, "result_output", -1);
  9558. res->t_logits = cur;
  9559. ggml_build_forward_expand(gf, cur);
  9560. }
  9561. };
  9562. struct llm_build_bailingmoe : public llm_graph_context {
  9563. llm_build_bailingmoe(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
  9564. ggml_tensor * cur;
  9565. ggml_tensor * inpL;
  9566. inpL = build_inp_embd(model.tok_embd);
  9567. // inp_pos - contains the positions
  9568. ggml_tensor * inp_pos = build_inp_pos();
  9569. auto * inp_attn = build_attn_inp_kv_unified();
  9570. for (int il = 0; il < n_layer; ++il) {
  9571. ggml_tensor * inpSA = inpL;
  9572. // norm
  9573. cur = build_norm(inpL,
  9574. model.layers[il].attn_norm, NULL,
  9575. LLM_NORM_RMS, il);
  9576. cb(cur, "attn_norm", il);
  9577. // self-attention
  9578. {
  9579. // rope freq factors for llama3; may return nullptr for llama2 and other models
  9580. ggml_tensor * rope_factors = static_cast<const llama_kv_cache_unified *>(memory)->cbs.get_rope_factors(n_ctx_per_seq, il);
  9581. // compute Q and K and RoPE them
  9582. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  9583. cb(Qcur, "Qcur", il);
  9584. if (model.layers[il].bq) {
  9585. Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
  9586. cb(Qcur, "Qcur", il);
  9587. }
  9588. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  9589. cb(Kcur, "Kcur", il);
  9590. if (model.layers[il].bk) {
  9591. Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
  9592. cb(Kcur, "Kcur", il);
  9593. }
  9594. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  9595. cb(Vcur, "Vcur", il);
  9596. if (model.layers[il].bv) {
  9597. Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
  9598. cb(Vcur, "Vcur", il);
  9599. }
  9600. Qcur = ggml_reshape_3d(ctx0, Qcur, n_rot, n_head, n_tokens);
  9601. Kcur = ggml_reshape_3d(ctx0, Kcur, n_rot, n_head_kv, n_tokens);
  9602. Vcur = ggml_reshape_3d(ctx0, Vcur, n_rot, n_head_kv, n_tokens);
  9603. Qcur = ggml_rope_ext(
  9604. ctx0, Qcur, inp_pos, rope_factors,
  9605. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  9606. ext_factor, attn_factor, beta_fast, beta_slow
  9607. );
  9608. Kcur = ggml_rope_ext(
  9609. ctx0, Kcur, inp_pos, rope_factors,
  9610. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  9611. ext_factor, attn_factor, beta_fast, beta_slow
  9612. );
  9613. cb(Qcur, "Qcur", il);
  9614. cb(Kcur, "Kcur", il);
  9615. cb(Vcur, "Vcur", il);
  9616. cur = build_attn(inp_attn, gf,
  9617. model.layers[il].wo, model.layers[il].bo,
  9618. Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_rot)), il);
  9619. }
  9620. if (il == n_layer - 1) {
  9621. // skip computing output for unused tokens
  9622. ggml_tensor * inp_out_ids = build_inp_out_ids();
  9623. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  9624. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  9625. }
  9626. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  9627. cb(ffn_inp, "ffn_inp", il);
  9628. cur = build_norm(ffn_inp,
  9629. model.layers[il].ffn_norm, NULL,
  9630. LLM_NORM_RMS, il);
  9631. cb(cur, "ffn_norm", il);
  9632. ggml_tensor * moe_out =
  9633. build_moe_ffn(cur,
  9634. model.layers[il].ffn_gate_inp,
  9635. model.layers[il].ffn_up_exps,
  9636. model.layers[il].ffn_gate_exps,
  9637. model.layers[il].ffn_down_exps,
  9638. nullptr,
  9639. n_expert, n_expert_used,
  9640. LLM_FFN_SILU, hparams.expert_weights_norm,
  9641. false, hparams.expert_weights_scale,
  9642. LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
  9643. il);
  9644. cb(moe_out, "ffn_moe_out", il);
  9645. // FFN shared expert
  9646. {
  9647. ggml_tensor * ffn_shexp = build_ffn(cur,
  9648. model.layers[il].ffn_up_shexp, NULL, NULL,
  9649. model.layers[il].ffn_gate_shexp, NULL, NULL,
  9650. model.layers[il].ffn_down_shexp, NULL, NULL,
  9651. NULL,
  9652. LLM_FFN_SILU, LLM_FFN_PAR, il);
  9653. cb(ffn_shexp, "ffn_shexp", il);
  9654. cur = ggml_add(ctx0, moe_out, ffn_shexp);
  9655. cb(cur, "ffn_out", il);
  9656. }
  9657. cur = ggml_add(ctx0, cur, ffn_inp);
  9658. cur = build_cvec(cur, il);
  9659. cb(cur, "l_out", il);
  9660. // input for next layer
  9661. inpL = cur;
  9662. }
  9663. cur = inpL;
  9664. cur = build_norm(cur,
  9665. model.output_norm, NULL,
  9666. LLM_NORM_RMS, -1);
  9667. cb(cur, "result_norm", -1);
  9668. res->t_embd = cur;
  9669. // lm_head
  9670. cur = build_lora_mm(model.output, cur);
  9671. cb(cur, "result_output", -1);
  9672. res->t_logits = cur;
  9673. ggml_build_forward_expand(gf, cur);
  9674. }
  9675. };
  9676. llama_memory_i * llama_model::create_memory() const {
  9677. llama_memory_i * res;
  9678. switch (arch) {
  9679. case LLM_ARCH_MAMBA:
  9680. case LLM_ARCH_RWKV6:
  9681. case LLM_ARCH_RWKV6QWEN2:
  9682. case LLM_ARCH_RWKV7:
  9683. case LLM_ARCH_ARWKV7:
  9684. {
  9685. res = new llama_kv_cache_unified(hparams, {
  9686. /*.get_rope_factors =*/ nullptr
  9687. });
  9688. } break;
  9689. default:
  9690. {
  9691. res = new llama_kv_cache_unified(hparams, {
  9692. /*.get_rope_factors =*/ [this](uint32_t n_ctx_per_seq, int il) {
  9693. // choose long/short freq factors based on the context size
  9694. if (layers[il].rope_freqs != nullptr) {
  9695. return layers[il].rope_freqs;
  9696. }
  9697. if (n_ctx_per_seq > hparams.n_ctx_orig_yarn) {
  9698. return layers[il].rope_long;
  9699. }
  9700. return layers[il].rope_short;
  9701. }
  9702. });
  9703. }
  9704. }
  9705. return res;
  9706. }
  9707. llm_graph_result_ptr llama_model::build_graph(
  9708. const llm_graph_params & params,
  9709. ggml_cgraph * gf,
  9710. llm_graph_type type) const {
  9711. std::unique_ptr<llm_graph_context> llm;
  9712. switch (arch) {
  9713. case LLM_ARCH_LLAMA:
  9714. case LLM_ARCH_LLAMA4:
  9715. case LLM_ARCH_MINICPM:
  9716. case LLM_ARCH_GRANITE:
  9717. case LLM_ARCH_GRANITE_MOE:
  9718. {
  9719. llm = std::make_unique<llm_build_llama>(*this, params, gf);
  9720. } break;
  9721. case LLM_ARCH_DECI:
  9722. {
  9723. llm = std::make_unique<llm_build_deci>(*this, params, gf);
  9724. } break;
  9725. case LLM_ARCH_BAICHUAN:
  9726. {
  9727. llm = std::make_unique<llm_build_baichuan>(*this, params, gf);
  9728. } break;
  9729. case LLM_ARCH_FALCON:
  9730. {
  9731. llm = std::make_unique<llm_build_falcon>(*this, params, gf);
  9732. } break;
  9733. case LLM_ARCH_GROK:
  9734. {
  9735. llm = std::make_unique<llm_build_grok>(*this, params, gf);
  9736. } break;
  9737. case LLM_ARCH_STARCODER:
  9738. {
  9739. llm = std::make_unique<llm_build_starcoder>(*this, params, gf);
  9740. } break;
  9741. case LLM_ARCH_REFACT:
  9742. {
  9743. llm = std::make_unique<llm_build_refact>(*this, params, gf);
  9744. } break;
  9745. case LLM_ARCH_BERT:
  9746. case LLM_ARCH_JINA_BERT_V2:
  9747. case LLM_ARCH_NOMIC_BERT:
  9748. {
  9749. llm = std::make_unique<llm_build_bert>(*this, params, gf);
  9750. } break;
  9751. case LLM_ARCH_BLOOM:
  9752. {
  9753. llm = std::make_unique<llm_build_bloom>(*this, params, gf);
  9754. } break;
  9755. case LLM_ARCH_MPT:
  9756. {
  9757. llm = std::make_unique<llm_build_mpt>(*this, params, gf);
  9758. } break;
  9759. case LLM_ARCH_STABLELM:
  9760. {
  9761. llm = std::make_unique<llm_build_stablelm>(*this, params, gf);
  9762. } break;
  9763. case LLM_ARCH_QWEN:
  9764. {
  9765. llm = std::make_unique<llm_build_qwen>(*this, params, gf);
  9766. } break;
  9767. case LLM_ARCH_QWEN2:
  9768. {
  9769. llm = std::make_unique<llm_build_qwen2>(*this, params, gf);
  9770. } break;
  9771. case LLM_ARCH_QWEN2VL:
  9772. {
  9773. llm = std::make_unique<llm_build_qwen2vl>(*this, params, gf);
  9774. } break;
  9775. case LLM_ARCH_QWEN2MOE:
  9776. {
  9777. llm = std::make_unique<llm_build_qwen2moe>(*this, params, gf);
  9778. } break;
  9779. case LLM_ARCH_PHI2:
  9780. {
  9781. llm = std::make_unique<llm_build_phi2>(*this, params, gf);
  9782. } break;
  9783. case LLM_ARCH_PHI3:
  9784. case LLM_ARCH_PHIMOE:
  9785. {
  9786. llm = std::make_unique<llm_build_phi3>(*this, params, gf);
  9787. } break;
  9788. case LLM_ARCH_PLAMO:
  9789. {
  9790. llm = std::make_unique<llm_build_plamo>(*this, params, gf);
  9791. } break;
  9792. case LLM_ARCH_GPT2:
  9793. {
  9794. llm = std::make_unique<llm_build_gpt2>(*this, params, gf);
  9795. } break;
  9796. case LLM_ARCH_CODESHELL:
  9797. {
  9798. llm = std::make_unique<llm_build_codeshell>(*this, params, gf);
  9799. } break;
  9800. case LLM_ARCH_ORION:
  9801. {
  9802. llm = std::make_unique<llm_build_orion>(*this, params, gf);
  9803. } break;
  9804. case LLM_ARCH_INTERNLM2:
  9805. {
  9806. llm = std::make_unique<llm_build_internlm2>(*this, params, gf);
  9807. } break;
  9808. case LLM_ARCH_MINICPM3:
  9809. {
  9810. llm = std::make_unique<llm_build_minicpm3>(*this, params, gf);
  9811. } break;
  9812. case LLM_ARCH_GEMMA:
  9813. {
  9814. llm = std::make_unique<llm_build_gemma>(*this, params, gf);
  9815. } break;
  9816. case LLM_ARCH_GEMMA2:
  9817. {
  9818. llm = std::make_unique<llm_build_gemma2>(*this, params, gf);
  9819. } break;
  9820. case LLM_ARCH_GEMMA3:
  9821. {
  9822. llm = std::make_unique<llm_build_gemma3>(*this, params, gf);
  9823. } break;
  9824. case LLM_ARCH_STARCODER2:
  9825. {
  9826. llm = std::make_unique<llm_build_starcoder2>(*this, params, gf);
  9827. } break;
  9828. case LLM_ARCH_MAMBA:
  9829. {
  9830. llm = std::make_unique<llm_build_mamba>(*this, params, gf);
  9831. } break;
  9832. case LLM_ARCH_XVERSE:
  9833. {
  9834. llm = std::make_unique<llm_build_xverse>(*this, params, gf);
  9835. } break;
  9836. case LLM_ARCH_COMMAND_R:
  9837. {
  9838. llm = std::make_unique<llm_build_command_r>(*this, params, gf);
  9839. } break;
  9840. case LLM_ARCH_COHERE2:
  9841. {
  9842. llm = std::make_unique<llm_build_cohere2>(*this, params, gf);
  9843. } break;
  9844. case LLM_ARCH_DBRX:
  9845. {
  9846. llm = std::make_unique<llm_build_dbrx>(*this, params, gf);
  9847. } break;
  9848. case LLM_ARCH_OLMO:
  9849. {
  9850. llm = std::make_unique<llm_build_olmo>(*this, params, gf);
  9851. } break;
  9852. case LLM_ARCH_OLMO2:
  9853. {
  9854. llm = std::make_unique<llm_build_olmo2>(*this, params, gf);
  9855. } break;
  9856. case LLM_ARCH_OLMOE:
  9857. {
  9858. llm = std::make_unique<llm_build_olmoe>(*this, params, gf);
  9859. } break;
  9860. case LLM_ARCH_OPENELM:
  9861. {
  9862. llm = std::make_unique<llm_build_openelm>(*this, params, gf);
  9863. } break;
  9864. case LLM_ARCH_GPTNEOX:
  9865. {
  9866. llm = std::make_unique<llm_build_gptneox>(*this, params, gf);
  9867. } break;
  9868. case LLM_ARCH_ARCTIC:
  9869. {
  9870. llm = std::make_unique<llm_build_arctic>(*this, params, gf);
  9871. } break;
  9872. case LLM_ARCH_DEEPSEEK:
  9873. {
  9874. llm = std::make_unique<llm_build_deepseek>(*this, params, gf);
  9875. } break;
  9876. case LLM_ARCH_DEEPSEEK2:
  9877. {
  9878. llm = std::make_unique<llm_build_deepseek2>(*this, params, gf);
  9879. } break;
  9880. case LLM_ARCH_CHATGLM:
  9881. {
  9882. llm = std::make_unique<llm_build_chatglm>(*this, params, gf);
  9883. } break;
  9884. case LLM_ARCH_BITNET:
  9885. {
  9886. llm = std::make_unique<llm_build_bitnet>(*this, params, gf);
  9887. } break;
  9888. case LLM_ARCH_T5:
  9889. {
  9890. switch (type) {
  9891. case LLM_GRAPH_TYPE_ENCODER:
  9892. llm = std::make_unique<llm_build_t5_enc>(*this, params, gf);
  9893. break;
  9894. case LLM_GRAPH_TYPE_DEFAULT:
  9895. case LLM_GRAPH_TYPE_DECODER:
  9896. llm = std::make_unique<llm_build_t5_dec>(*this, params, gf);
  9897. break;
  9898. default:
  9899. GGML_ABORT("invalid graph type");
  9900. };
  9901. } break;
  9902. case LLM_ARCH_T5ENCODER:
  9903. {
  9904. llm = std::make_unique<llm_build_t5_enc>(*this, params, gf);
  9905. }
  9906. break;
  9907. case LLM_ARCH_JAIS:
  9908. {
  9909. llm = std::make_unique<llm_build_jais>(*this, params, gf);
  9910. } break;
  9911. case LLM_ARCH_NEMOTRON:
  9912. {
  9913. llm = std::make_unique<llm_build_nemotron>(*this, params, gf);
  9914. } break;
  9915. case LLM_ARCH_EXAONE:
  9916. {
  9917. llm = std::make_unique<llm_build_exaone>(*this, params, gf);
  9918. } break;
  9919. case LLM_ARCH_RWKV6:
  9920. {
  9921. llm = std::make_unique<llm_build_rwkv6>(*this, params, gf);
  9922. } break;
  9923. case LLM_ARCH_RWKV6QWEN2:
  9924. {
  9925. llm = std::make_unique<llm_build_rwkv6qwen2>(*this, params, gf);
  9926. } break;
  9927. case LLM_ARCH_RWKV7:
  9928. {
  9929. llm = std::make_unique<llm_build_rwkv7>(*this, params, gf);
  9930. } break;
  9931. case LLM_ARCH_ARWKV7:
  9932. {
  9933. llm = std::make_unique<llm_build_arwkv7>(*this, params, gf);
  9934. } break;
  9935. case LLM_ARCH_CHAMELEON:
  9936. {
  9937. llm = std::make_unique<llm_build_chameleon>(*this, params, gf);
  9938. } break;
  9939. case LLM_ARCH_WAVTOKENIZER_DEC:
  9940. {
  9941. llm = std::make_unique<llm_build_wavtokenizer_dec>(*this, params, gf);
  9942. } break;
  9943. case LLM_ARCH_PLM:
  9944. {
  9945. llm = std::make_unique<llm_build_plm>(*this, params, gf);
  9946. } break;
  9947. case LLM_ARCH_BAILINGMOE:
  9948. {
  9949. llm = std::make_unique<llm_build_bailingmoe>(*this, params, gf);
  9950. } break;
  9951. default:
  9952. GGML_ABORT("fatal error");
  9953. }
  9954. // add on pooling layer
  9955. llm->build_pooling(gf, cls, cls_b, cls_out, cls_out_b);
  9956. return std::move(llm->res);
  9957. }
  9958. //
  9959. // interface implementation
  9960. //
  9961. llama_model_params llama_model_default_params() {
  9962. llama_model_params result = {
  9963. /*.devices =*/ nullptr,
  9964. /*.tensor_buft_overrides =*/ nullptr,
  9965. /*.n_gpu_layers =*/ 0,
  9966. /*.split_mode =*/ LLAMA_SPLIT_MODE_LAYER,
  9967. /*.main_gpu =*/ 0,
  9968. /*.tensor_split =*/ nullptr,
  9969. /*.progress_callback =*/ nullptr,
  9970. /*.progress_callback_user_data =*/ nullptr,
  9971. /*.kv_overrides =*/ nullptr,
  9972. /*.vocab_only =*/ false,
  9973. /*.use_mmap =*/ true,
  9974. /*.use_mlock =*/ false,
  9975. /*.check_tensors =*/ false,
  9976. };
  9977. #ifdef GGML_USE_METAL
  9978. // note: we usually have plenty of VRAM, so by default offload all layers to the GPU
  9979. result.n_gpu_layers = 999;
  9980. #endif
  9981. return result;
  9982. }
  9983. const llama_vocab * llama_model_get_vocab(const llama_model * model) {
  9984. return &model->vocab;
  9985. }
  9986. void llama_free_model(llama_model * model) {
  9987. llama_model_free(model);
  9988. }
  9989. void llama_model_free(llama_model * model) {
  9990. delete model;
  9991. }
  9992. int32_t llama_model_n_ctx_train(const llama_model * model) {
  9993. return model->hparams.n_ctx_train;
  9994. }
  9995. int32_t llama_model_n_embd(const llama_model * model) {
  9996. return model->hparams.n_embd;
  9997. }
  9998. int32_t llama_model_n_layer(const llama_model * model) {
  9999. return model->hparams.n_layer;
  10000. }
  10001. int32_t llama_model_n_head(const llama_model * model) {
  10002. return model->hparams.n_head();
  10003. }
  10004. int32_t llama_model_n_head_kv(const llama_model * model) {
  10005. return model->hparams.n_head_kv();
  10006. }
  10007. // deprecated
  10008. int32_t llama_n_ctx_train(const llama_model * model) {
  10009. return llama_model_n_ctx_train(model);
  10010. }
  10011. // deprecated
  10012. int32_t llama_n_embd(const llama_model * model) {
  10013. return llama_model_n_embd(model);
  10014. }
  10015. // deprecated
  10016. int32_t llama_n_layer(const llama_model * model) {
  10017. return llama_model_n_layer(model);
  10018. }
  10019. // deprecated
  10020. int32_t llama_n_head(const llama_model * model) {
  10021. return llama_model_n_head(model);
  10022. }
  10023. llama_rope_type llama_model_rope_type(const llama_model * model) {
  10024. switch (model->arch) {
  10025. // these models do not use RoPE
  10026. case LLM_ARCH_GPT2:
  10027. case LLM_ARCH_GPTJ:
  10028. case LLM_ARCH_MPT:
  10029. case LLM_ARCH_REFACT:
  10030. case LLM_ARCH_BLOOM:
  10031. case LLM_ARCH_MAMBA:
  10032. case LLM_ARCH_JINA_BERT_V2:
  10033. case LLM_ARCH_T5:
  10034. case LLM_ARCH_T5ENCODER:
  10035. case LLM_ARCH_JAIS:
  10036. case LLM_ARCH_RWKV6:
  10037. case LLM_ARCH_RWKV6QWEN2:
  10038. case LLM_ARCH_RWKV7:
  10039. case LLM_ARCH_ARWKV7:
  10040. case LLM_ARCH_WAVTOKENIZER_DEC:
  10041. return LLAMA_ROPE_TYPE_NONE;
  10042. // use what we call a normal RoPE, operating on pairs of consecutive head values
  10043. case LLM_ARCH_LLAMA:
  10044. case LLM_ARCH_LLAMA4:
  10045. case LLM_ARCH_DECI:
  10046. case LLM_ARCH_BAICHUAN:
  10047. case LLM_ARCH_STARCODER:
  10048. case LLM_ARCH_PLAMO:
  10049. case LLM_ARCH_ORION:
  10050. case LLM_ARCH_INTERNLM2:
  10051. case LLM_ARCH_MINICPM:
  10052. case LLM_ARCH_XVERSE:
  10053. case LLM_ARCH_COMMAND_R:
  10054. case LLM_ARCH_COHERE2:
  10055. case LLM_ARCH_OLMO:
  10056. case LLM_ARCH_ARCTIC:
  10057. case LLM_ARCH_DEEPSEEK:
  10058. case LLM_ARCH_DEEPSEEK2:
  10059. case LLM_ARCH_PLM:
  10060. case LLM_ARCH_CHATGLM:
  10061. case LLM_ARCH_GRANITE:
  10062. case LLM_ARCH_GRANITE_MOE:
  10063. case LLM_ARCH_CHAMELEON:
  10064. case LLM_ARCH_BAILINGMOE:
  10065. return LLAMA_ROPE_TYPE_NORM;
  10066. // the pairs of head values are offset by n_rot/2
  10067. case LLM_ARCH_FALCON:
  10068. case LLM_ARCH_GROK:
  10069. case LLM_ARCH_DBRX:
  10070. case LLM_ARCH_BERT:
  10071. case LLM_ARCH_NOMIC_BERT:
  10072. case LLM_ARCH_STABLELM:
  10073. case LLM_ARCH_BITNET:
  10074. case LLM_ARCH_QWEN:
  10075. case LLM_ARCH_QWEN2:
  10076. case LLM_ARCH_QWEN2MOE:
  10077. case LLM_ARCH_OLMO2:
  10078. case LLM_ARCH_OLMOE:
  10079. case LLM_ARCH_PHI2:
  10080. case LLM_ARCH_PHI3:
  10081. case LLM_ARCH_PHIMOE:
  10082. case LLM_ARCH_GEMMA:
  10083. case LLM_ARCH_GEMMA2:
  10084. case LLM_ARCH_GEMMA3:
  10085. case LLM_ARCH_STARCODER2:
  10086. case LLM_ARCH_OPENELM:
  10087. case LLM_ARCH_GPTNEOX:
  10088. case LLM_ARCH_CODESHELL:
  10089. case LLM_ARCH_NEMOTRON:
  10090. case LLM_ARCH_EXAONE:
  10091. case LLM_ARCH_MINICPM3:
  10092. return LLAMA_ROPE_TYPE_NEOX;
  10093. case LLM_ARCH_QWEN2VL:
  10094. return LLAMA_ROPE_TYPE_MROPE;
  10095. // all model arches should be listed explicitly here
  10096. case LLM_ARCH_UNKNOWN:
  10097. GGML_ABORT("unknown architecture");
  10098. }
  10099. return LLAMA_ROPE_TYPE_NONE;
  10100. }
  10101. float llama_model_rope_freq_scale_train(const llama_model * model) {
  10102. return model->hparams.rope_freq_scale_train;
  10103. }
  10104. int32_t llama_model_meta_val_str(const llama_model * model, const char * key, char * buf, size_t buf_size) {
  10105. const auto & it = model->gguf_kv.find(key);
  10106. if (it == model->gguf_kv.end()) {
  10107. if (buf_size > 0) {
  10108. buf[0] = '\0';
  10109. }
  10110. return -1;
  10111. }
  10112. return snprintf(buf, buf_size, "%s", it->second.c_str());
  10113. }
  10114. int32_t llama_model_meta_count(const llama_model * model) {
  10115. return (int)model->gguf_kv.size();
  10116. }
  10117. int32_t llama_model_meta_key_by_index(const llama_model * model, int i, char * buf, size_t buf_size) {
  10118. if (i < 0 || i >= (int)model->gguf_kv.size()) {
  10119. if (buf_size > 0) {
  10120. buf[0] = '\0';
  10121. }
  10122. return -1;
  10123. }
  10124. auto it = model->gguf_kv.begin();
  10125. std::advance(it, i);
  10126. return snprintf(buf, buf_size, "%s", it->first.c_str());
  10127. }
  10128. int32_t llama_model_meta_val_str_by_index(const llama_model * model, int32_t i, char * buf, size_t buf_size) {
  10129. if (i < 0 || i >= (int)model->gguf_kv.size()) {
  10130. if (buf_size > 0) {
  10131. buf[0] = '\0';
  10132. }
  10133. return -1;
  10134. }
  10135. auto it = model->gguf_kv.begin();
  10136. std::advance(it, i);
  10137. return snprintf(buf, buf_size, "%s", it->second.c_str());
  10138. }
  10139. int32_t llama_model_desc(const llama_model * model, char * buf, size_t buf_size) {
  10140. return snprintf(buf, buf_size, "%s", model->desc().c_str());
  10141. }
  10142. uint64_t llama_model_size(const llama_model * model) {
  10143. return model->size();
  10144. }
  10145. const char * llama_model_chat_template(const llama_model * model, const char * name) {
  10146. const auto key = name ? LLM_KV(model->arch, name)(LLM_KV_TOKENIZER_CHAT_TEMPLATE_N)
  10147. : LLM_KV(model->arch)(LLM_KV_TOKENIZER_CHAT_TEMPLATE);
  10148. const auto & it = model->gguf_kv.find(key);
  10149. if (it == model->gguf_kv.end()) {
  10150. return nullptr;
  10151. }
  10152. return it->second.c_str();
  10153. }
  10154. uint64_t llama_model_n_params(const llama_model * model) {
  10155. return model->n_elements();
  10156. }
  10157. bool llama_model_has_encoder(const llama_model * model) {
  10158. switch (model->arch) {
  10159. case LLM_ARCH_T5: return true;
  10160. case LLM_ARCH_T5ENCODER: return true;
  10161. default: return false;
  10162. }
  10163. }
  10164. bool llama_model_has_decoder(const llama_model * model) {
  10165. switch (model->arch) {
  10166. case LLM_ARCH_T5ENCODER: return false;
  10167. default: return true;
  10168. }
  10169. }
  10170. llama_token llama_model_decoder_start_token(const llama_model * model) {
  10171. return model->hparams.dec_start_token_id;
  10172. }
  10173. bool llama_model_is_recurrent(const llama_model * model) {
  10174. switch (model->arch) {
  10175. case LLM_ARCH_MAMBA: return true;
  10176. case LLM_ARCH_RWKV6: return true;
  10177. case LLM_ARCH_RWKV6QWEN2: return true;
  10178. case LLM_ARCH_RWKV7: return true;
  10179. case LLM_ARCH_ARWKV7: return true;
  10180. default: return false;
  10181. }
  10182. }
  10183. const std::vector<std::pair<std::string, ggml_tensor *>> & llama_internal_get_tensor_map(const llama_model * model) {
  10184. return model->tensors_by_name;
  10185. }