llama-model.cpp (767 KB, 14,308 lines)

430914310143111431214313143141431514316143171431814319143201432114322143231432414325143261432714328143291433014331143321433314334143351433614337143381433914340143411434214343143441434514346143471434814349143501435114352143531435414355143561435714358143591436014361143621436314364143651436614367143681436914370143711437214373143741437514376143771437814379143801438114382143831438414385143861438714388143891439014391143921439314394143951439614397143981439914400144011440214403144041440514406144071440814409144101441114412144131441414415144161441714418144191442014421144221442314424144251442614427144281442914430144311443214433144341443514436144371443814439144401444114442144431444414445144461444714448144491445014451144521445314454144551445614457144581445914460144611446214463144641446514466144671446814469144701447114472144731447414475144761447714478144791448014481144821448314484144851448614487144881448914490144911449214493144941449514496144971449814499145001450114502145031450414505145061450714508145091451014511145121451314514145151451614517145181451914520145211452214523145241452514526145271452814529145301453114532145331453414535145361453714538145391454014541145421454314544145451454614547145481454914550145511455214553145541455514556145571455814559145601456114562145631456414565145661456714568145691457014571145721457314574145751457614577145781457914580145811458214583145841458514586145871458814589145901459114592145931459414595145961459714598145991460014601146021460314604146051460614607146081460914610146111461214613146141461514616146171461814619146201462114622146231462414625146261462714628146291463014631146321463314634146351463614637146381463914640146411464214643146441464514646146471464814649146501465114652146531465414655146561465714658146591466014661146621466314664146651466614667146681466914670146711467214673146741467514676146771467814679146801468114682146831468414685146861468714688146891469014691146921469314694146951469614697146981469914700147011470214703147041470514706147071470814709147101471114712147131471414715147161471714718147191472014721147221472314724147251472614727147281472914730147311473214733147341473514736147371473814739147401474114742147431474414745147461474714748147491475014751147521475314754147551475614757147581475914760147611476214763147641476514766147671476814769147701477114772147731477414775147761477714778147791478014781147821478314784147851478614787147881478914790147911479214793147941479514796147971479814799148001480114802148031480414805148061480714808148091481014811148121481314814148151481614817148181481914820148211482214823148241482514826148271482814829148301483114832148331483414835148361483714838148391484014841148421484314844148451484614847148481484914850148511485214853148541485514856148571485814859148601486114862148631486414865148661486714868148691487014871148721487314874148751487614877148781487914880148811488214883148841488514886148871488814889148901489114892148931489414895148961489714898148991490014901149021490314904149051490614907149081490914910149111491214913149141491514916149171491814919149201492114922149231492414925149261492714928149291493014931149321493314934149351493614937149381493914940149411494214943149441494514946149471494814949149501495114952149531495414955149561495714958149591496014961149621496314964149651496614967149681496914970149711497214973149741497514976149771497814979149801498114982149831498414985149861498714988149891499014991149921499314994149951499614997149981499915000150011500215003150041500515006150071500815009150101501115012150131501415015150161501715018150191
502015021150221502315024150251502615027150281502915030150311503215033150341503515036150371503815039150401504115042150431504415045150461504715048150491505015051150521505315054150551505615057150581505915060150611506215063150641506515066150671506815069150701507115072150731507415075150761507715078150791508015081150821508315084150851508615087150881508915090150911509215093150941509515096150971509815099151001510115102151031510415105151061510715108151091511015111151121511315114151151511615117151181511915120151211512215123151241512515126151271512815129151301513115132151331513415135151361513715138151391514015141151421514315144151451514615147151481514915150151511515215153151541515515156151571515815159151601516115162151631516415165151661516715168151691517015171151721517315174151751517615177151781517915180151811518215183151841518515186151871518815189151901519115192151931519415195151961519715198151991520015201152021520315204152051520615207152081520915210152111521215213152141521515216152171521815219152201522115222152231522415225152261522715228152291523015231152321523315234152351523615237152381523915240152411524215243152441524515246152471524815249152501525115252152531525415255152561525715258152591526015261152621526315264152651526615267152681526915270152711527215273152741527515276152771527815279152801528115282152831528415285152861528715288152891529015291152921529315294152951529615297152981529915300153011530215303153041530515306153071530815309153101531115312153131531415315153161531715318153191532015321153221532315324153251532615327153281532915330153311533215333153341533515336153371533815339153401534115342153431534415345153461534715348153491535015351153521535315354153551535615357153581535915360153611536215363153641536515366153671536815369153701537115372153731537415375153761537715378153791538015381153821538315384153851538615387153881538915390153911539215393153941539515396153971539815399154001540115402154031540415405154061540715408154091541015411154121541315414154151541615417154181541915420154211542215423154241542515426154271542815429154301543115432154331543415435154361543715438154391544015441154421544315444154451544615447154481544915450154511545215453154541545515456154571545815459154601546115462154631546415465154661546715468154691547015471154721547315474154751547615477154781547915480154811548215483154841548515486154871548815489154901549115492154931549415495154961549715498154991550015501155021550315504155051550615507155081550915510155111551215513155141551515516155171551815519155201552115522155231552415525155261552715528155291553015531155321553315534155351553615537155381553915540155411554215543155441554515546155471554815549155501555115552155531555415555155561555715558155591556015561155621556315564155651556615567155681556915570155711557215573155741557515576155771557815579155801558115582155831558415585155861558715588155891559015591155921559315594155951559615597155981559915600156011560215603156041560515606156071560815609156101561115612156131561415615156161561715618156191562015621156221562315624156251562615627156281562915630156311563215633156341563515636156371563815639156401564115642156431564415645156461564715648156491565015651156521565315654156551565615657156581565915660156611566215663156641566515666156671566815669156701567115672156731567415675156761567715678156791568015681156821568315684156851568615687156881568915690156911569215693156941569515696156971569815699157001570115702157031570415705157061570715708157091571015711157121571315714157151571615717157181571915720157211572215723157241572515726157271572815729157301
573115732157331573415735157361573715738157391574015741157421574315744157451574615747157481574915750157511575215753157541575515756157571575815759157601576115762157631576415765157661576715768157691577015771157721577315774157751577615777157781577915780157811578215783157841578515786157871578815789157901579115792157931579415795157961579715798157991580015801158021580315804158051580615807158081580915810158111581215813158141581515816158171581815819158201582115822158231582415825158261582715828158291583015831158321583315834158351583615837158381583915840158411584215843158441584515846158471584815849158501585115852158531585415855158561585715858158591586015861158621586315864158651586615867158681586915870158711587215873158741587515876158771587815879158801588115882158831588415885158861588715888158891589015891158921589315894158951589615897158981589915900159011590215903159041590515906159071590815909159101591115912159131591415915159161591715918159191592015921159221592315924159251592615927159281592915930159311593215933159341593515936159371593815939159401594115942159431594415945159461594715948159491595015951159521595315954159551595615957159581595915960159611596215963159641596515966159671596815969159701597115972159731597415975159761597715978159791598015981159821598315984159851598615987159881598915990159911599215993159941599515996159971599815999160001600116002160031600416005160061600716008160091601016011160121601316014160151601616017160181601916020160211602216023160241602516026160271602816029160301603116032160331603416035160361603716038160391604016041160421604316044160451604616047160481604916050160511605216053160541605516056160571605816059160601606116062160631606416065160661606716068160691607016071160721607316074160751607616077160781607916080160811608216083160841608516086160871608816089160901609116092160931609416095160961609716098160991610016101161021610316104161051610616107161081610916110161111611216113161141611516116161171611816119161201612116122161231612416125161261612716128161291613016131161321613316134161351613616137161381613916140161411614216143161441614516146161471614816149161501615116152161531615416155161561615716158161591616016161161621616316164161651616616167161681616916170161711617216173161741617516176161771617816179161801618116182161831618416185161861618716188161891619016191161921619316194161951619616197161981619916200162011620216203162041620516206162071620816209162101621116212162131621416215162161621716218162191622016221162221622316224162251622616227162281622916230162311623216233162341623516236162371623816239162401624116242162431624416245162461624716248162491625016251162521625316254162551625616257162581625916260162611626216263162641626516266162671626816269162701627116272162731627416275162761627716278162791628016281162821628316284162851628616287162881628916290162911629216293162941629516296162971629816299163001630116302163031630416305163061630716308163091631016311163121631316314163151631616317163181631916320163211632216323163241632516326163271632816329163301633116332163331633416335163361633716338163391634016341163421634316344163451634616347163481634916350163511635216353163541635516356163571635816359163601636116362163631636416365163661636716368163691637016371163721637316374163751637616377163781637916380163811638216383163841638516386163871638816389163901639116392163931639416395163961639716398163991640016401164021640316404164051640616407164081640916410164111641216413164141641516416164171641816419164201642116422164231642416425164261642716428164291643016431164321643316434164351643616437164381643916440164411
644216443164441644516446164471644816449164501645116452164531645416455164561645716458164591646016461164621646316464164651646616467164681646916470164711647216473164741647516476164771647816479164801648116482164831648416485164861648716488164891649016491164921649316494164951649616497164981649916500165011650216503165041650516506165071650816509165101651116512165131651416515165161651716518165191652016521165221652316524165251652616527165281652916530165311653216533165341653516536165371653816539165401654116542165431654416545165461654716548165491655016551165521655316554165551655616557165581655916560165611656216563165641656516566165671656816569165701657116572165731657416575165761657716578165791658016581165821658316584165851658616587165881658916590165911659216593165941659516596165971659816599166001660116602166031660416605166061660716608166091661016611166121661316614166151661616617166181661916620166211662216623166241662516626166271662816629166301663116632166331663416635166361663716638166391664016641166421664316644166451664616647166481664916650166511665216653166541665516656166571665816659166601666116662166631666416665166661666716668166691667016671166721667316674166751667616677166781667916680166811668216683166841668516686166871668816689166901669116692166931669416695166961669716698166991670016701167021670316704167051670616707167081670916710167111671216713167141671516716167171671816719167201672116722167231672416725167261672716728167291673016731167321673316734167351673616737167381673916740167411674216743167441674516746167471674816749167501675116752167531675416755167561675716758167591676016761167621676316764167651676616767167681676916770167711677216773167741677516776167771677816779167801678116782167831678416785167861678716788167891679016791167921679316794167951679616797167981679916800168011680216803168041680516806168071680816809168101681116812168131681416815168161681716818168191682016821168221682316824168251682616827168281682916830168311683216833168341683516836168371683816839168401684116842168431684416845168461684716848168491685016851168521685316854168551685616857168581685916860168611686216863168641686516866168671686816869168701687116872168731687416875168761687716878168791688016881168821688316884168851688616887168881688916890168911689216893168941689516896168971689816899169001690116902169031690416905169061690716908169091691016911169121691316914169151691616917169181691916920169211692216923169241692516926169271692816929169301693116932169331693416935169361693716938169391694016941169421694316944169451694616947169481694916950169511695216953169541695516956169571695816959169601696116962169631696416965169661696716968169691697016971169721697316974169751697616977169781697916980169811698216983169841698516986169871698816989169901699116992169931699416995169961699716998169991700017001170021700317004170051700617007170081700917010170111701217013170141701517016170171701817019170201702117022170231702417025170261702717028170291703017031170321703317034170351703617037170381703917040170411704217043170441704517046170471704817049170501705117052170531705417055170561705717058170591706017061170621706317064170651706617067170681706917070170711707217073170741707517076170771707817079170801708117082170831708417085170861708717088170891709017091170921709317094170951709617097170981709917100171011710217103171041710517106171071710817109171101711117112171131711417115171161711717118171191712017121171221712317124171251712617127171281712917130171311713217133171341713517136171371713817139171401714117142171431714417145171461714717148171491715017151171521
71531715417155171561715717158171591716017161171621716317164171651716617167171681716917170171711717217173171741717517176171771717817179171801718117182171831718417185171861718717188171891719017191171921719317194171951719617197171981719917200172011720217203172041720517206172071720817209172101721117212172131721417215172161721717218172191722017221172221722317224172251722617227172281722917230172311723217233172341723517236172371723817239172401724117242172431724417245172461724717248172491725017251172521725317254172551725617257172581725917260172611726217263172641726517266172671726817269172701727117272172731727417275172761727717278172791728017281172821728317284172851728617287172881728917290172911729217293172941729517296172971729817299173001730117302173031730417305173061730717308173091731017311173121731317314173151731617317173181731917320173211732217323173241732517326173271732817329173301733117332173331733417335173361733717338173391734017341173421734317344173451734617347173481734917350173511735217353173541735517356173571735817359173601736117362173631736417365173661736717368173691737017371173721737317374173751737617377173781737917380173811738217383173841738517386173871738817389173901739117392173931739417395173961739717398173991740017401174021740317404174051740617407174081740917410174111741217413174141741517416174171741817419174201742117422174231742417425174261742717428174291743017431174321743317434174351743617437174381743917440174411744217443174441744517446174471744817449174501745117452174531745417455174561745717458174591746017461174621746317464174651746617467174681746917470174711747217473174741747517476174771747817479174801748117482174831748417485174861748717488174891749017491174921749317494174951749617497174981749917500175011750217503175041750517506175071750817509175101751117512175131751417515175161751717518175191752017521175221752317524175251752617527175281752917530175311753217533175341753517536175371753817539175401754117542175431754417545175461754717548175491755017551175521755317554175551755617557175581755917560175611756217563175641756517566175671756817569175701757117572175731757417575175761757717578175791758017581175821758317584175851758617587175881758917590175911759217593175941759517596175971759817599176001760117602176031760417605176061760717608176091761017611176121761317614176151761617617176181761917620176211762217623176241762517626176271762817629176301763117632176331763417635176361763717638176391764017641176421764317644176451764617647176481764917650176511765217653176541765517656176571765817659176601766117662176631766417665176661766717668176691767017671176721767317674176751767617677176781767917680176811768217683176841768517686176871768817689176901769117692176931769417695176961769717698176991770017701177021770317704177051770617707177081770917710177111771217713177141771517716177171771817719177201772117722177231772417725177261772717728177291773017731177321773317734177351773617737177381773917740177411774217743177441774517746177471774817749177501775117752177531775417755177561775717758177591776017761
#include "llama-model.h"

#include "llama-impl.h"
#include "llama-mmap.h"
#include "llama-batch.h"
#include "llama-cparams.h"
#include "llama-model-loader.h"

#include "llama-kv-cache-unified.h"
#include "llama-kv-cache-unified-iswa.h"
#include "llama-memory-hybrid.h"
#include "llama-memory-recurrent.h"

#include "ggml-cpp.h"

#include <algorithm>
#include <cassert>
#include <cmath>
#include <cfloat>
#include <cstring>
#include <functional>
#include <map>
#include <regex>
#include <sstream>
#include <stdexcept>
const char * llm_type_name(llm_type type) {
    switch (type) {
        case LLM_TYPE_14M: return "14M";
        case LLM_TYPE_17M: return "17M";
        case LLM_TYPE_22M: return "22M";
        case LLM_TYPE_33M: return "33M";
        case LLM_TYPE_60M: return "60M";
        case LLM_TYPE_70M: return "70M";
        case LLM_TYPE_80M: return "80M";
        case LLM_TYPE_109M: return "109M";
        case LLM_TYPE_137M: return "137M";
        case LLM_TYPE_160M: return "160M";
        case LLM_TYPE_190M: return "190M";
        case LLM_TYPE_220M: return "220M";
        case LLM_TYPE_250M: return "250M";
        case LLM_TYPE_256M: return "256M";
        case LLM_TYPE_270M: return "270M";
        case LLM_TYPE_335M: return "335M";
        case LLM_TYPE_350M: return "350M";
        case LLM_TYPE_410M: return "410M";
        case LLM_TYPE_450M: return "450M";
        case LLM_TYPE_475M: return "475M";
        case LLM_TYPE_700M: return "700M";
        case LLM_TYPE_770M: return "770M";
        case LLM_TYPE_780M: return "780M";
        case LLM_TYPE_0_3B: return "0.3B";
        case LLM_TYPE_0_5B: return "0.5B";
        case LLM_TYPE_0_6B: return "0.6B";
        case LLM_TYPE_1B: return "1B";
        case LLM_TYPE_1_2B: return "1.2B";
        case LLM_TYPE_1_3B: return "1.3B";
        case LLM_TYPE_1_4B: return "1.4B";
        case LLM_TYPE_1_5B: return "1.5B";
        case LLM_TYPE_1_6B: return "1.6B";
        case LLM_TYPE_1_7B: return "1.7B";
        case LLM_TYPE_1_8B: return "1.8B";
        case LLM_TYPE_2B: return "2B";
        case LLM_TYPE_2_8B: return "2.8B";
        case LLM_TYPE_2_9B: return "2.9B";
        case LLM_TYPE_3B: return "3B";
        case LLM_TYPE_4B: return "4B";
        case LLM_TYPE_6B: return "6B";
        case LLM_TYPE_6_9B: return "6.9B";
        case LLM_TYPE_7B: return "7B";
        case LLM_TYPE_8B: return "8B";
        case LLM_TYPE_9B: return "9B";
        case LLM_TYPE_11B: return "11B";
        case LLM_TYPE_12B: return "12B";
        case LLM_TYPE_13B: return "13B";
        case LLM_TYPE_14B: return "14B";
        case LLM_TYPE_15B: return "15B";
        case LLM_TYPE_16B: return "16B";
        case LLM_TYPE_20B: return "20B";
        case LLM_TYPE_27B: return "27B";
        case LLM_TYPE_30B: return "30B";
        case LLM_TYPE_32B: return "32B";
        case LLM_TYPE_34B: return "34B";
        case LLM_TYPE_35B: return "35B";
        case LLM_TYPE_40B: return "40B";
        case LLM_TYPE_65B: return "65B";
        case LLM_TYPE_70B: return "70B";
        case LLM_TYPE_142B: return "142B";
        case LLM_TYPE_236B: return "236B";
        case LLM_TYPE_290B: return "290B";
        case LLM_TYPE_314B: return "314B";
        case LLM_TYPE_405B: return "405B";
        case LLM_TYPE_671B: return "671B";
        case LLM_TYPE_SMALL: return "0.1B";
        case LLM_TYPE_MEDIUM: return "0.4B";
        case LLM_TYPE_LARGE: return "0.8B";
        case LLM_TYPE_XL: return "1.5B";
        case LLM_TYPE_A1_7B: return "A1.7B";
        case LLM_TYPE_A2_7B: return "A2.7B";
        case LLM_TYPE_8x7B: return "8x7B";
        case LLM_TYPE_8x22B: return "8x22B";
        case LLM_TYPE_16x12B: return "16x12B";
        case LLM_TYPE_16x3_8B: return "16x3.8B";
        case LLM_TYPE_10B_128x3_66B: return "10B+128x3.66B";
        case LLM_TYPE_57B_A14B: return "57B.A14B";
        case LLM_TYPE_17B_16E: return "17Bx16E (Scout)";
        case LLM_TYPE_17B_128E: return "17Bx128E (Maverick)";
        case LLM_TYPE_A13B: return "A13B";
        case LLM_TYPE_21B_A3B: return "21B.A3B";
        case LLM_TYPE_30B_A3B: return "30B.A3B";
        case LLM_TYPE_235B_A22B: return "235B.A22B";
        case LLM_TYPE_300B_A47B: return "300B.A47B";
        case LLM_TYPE_E2B: return "E2B";
        case LLM_TYPE_E4B: return "E4B";
        default: return "?B";
    }
}

static const char * llama_expert_gating_func_name(llama_expert_gating_func_type type) {
    switch (type) {
        case LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX: return "softmax";
        case LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID: return "sigmoid";
        default: return "unknown";
    }
}

static const std::map<llama_rope_scaling_type, const char *> LLAMA_ROPE_SCALING_TYPES = {
    { LLAMA_ROPE_SCALING_TYPE_NONE, "none" },
    { LLAMA_ROPE_SCALING_TYPE_LINEAR, "linear" },
    { LLAMA_ROPE_SCALING_TYPE_YARN, "yarn" },
    { LLAMA_ROPE_SCALING_TYPE_LONGROPE, "longrope" },
};

std::string llama_rope_scaling_type_name(llama_rope_scaling_type rope_scaling_type) {
    return LLAMA_ROPE_SCALING_TYPES.at(rope_scaling_type);
}

static llama_rope_scaling_type llama_rope_scaling_type_from_string(const std::string & name) {
    for (const auto & kv : LLAMA_ROPE_SCALING_TYPES) {
        if (kv.second == name) {
            return (llama_rope_scaling_type) kv.first;
        }
    }
    return LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
}
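// --- illustrative example (added commentary, not part of the original llama.cpp source) ---
// The two helpers above form a round-trip between the enum and its GGUF string form;
// unknown strings map to LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED, which load_hparams()
// later rejects with a GGML_ASSERT. A minimal sketch of the expected behavior:
//
//   std::string s = llama_rope_scaling_type_name(LLAMA_ROPE_SCALING_TYPE_YARN); // "yarn"
//   llama_rope_scaling_type t = llama_rope_scaling_type_from_string(s);         // LLAMA_ROPE_SCALING_TYPE_YARN
//   llama_rope_scaling_type u = llama_rope_scaling_type_from_string("bogus");   // LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED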
// checks if the weight tensor can be used with the specified buffer type and device
static bool weight_buft_supported(const llama_hparams & hparams, ggml_tensor * w, ggml_op op, ggml_backend_buffer_type_t buft, ggml_backend_dev_t dev) {
    GGML_ASSERT(w != nullptr);

    if (op == GGML_OP_NONE) {
        return true;
    }

    ggml_init_params params = {
        /*.mem_size   =*/ ggml_tensor_overhead()*8,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ true,
    };
    ggml_context_ptr ctx_ptr { ggml_init(params) };
    if (!ctx_ptr) {
        throw std::runtime_error(format("failed to create ggml context"));
    }
    ggml_context * ctx = ctx_ptr.get();

    ggml_tensor * op_tensor = nullptr;

    switch (op) {
        case GGML_OP_GET_ROWS:
            {
                ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 512);
                op_tensor = ggml_get_rows(ctx, w, b);
            } break;
        case GGML_OP_MUL_MAT:
            {
                ggml_tensor * b = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], 512, w->ne[2], w->ne[3]);
                op_tensor = ggml_mul_mat(ctx, w, b);
            } break;
        case GGML_OP_MUL_MAT_ID:
            {
                int n_expert_used = hparams.n_expert_used;
                ggml_tensor * b = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, w->ne[0], n_expert_used, 512);
                ggml_tensor * ids = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, n_expert_used, 512);
                op_tensor = ggml_mul_mat_id(ctx, w, b, ids);
            } break;
        case GGML_OP_ADD:
            {
                ggml_tensor * a = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], w->ne[1], w->ne[2], w->ne[3]);
                op_tensor = ggml_add(ctx, a, w);
            } break;
        case GGML_OP_MUL:
            {
                ggml_tensor * a = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], w->ne[1], w->ne[2], w->ne[3]);
                op_tensor = ggml_mul(ctx, a, w);
            } break;
        case GGML_OP_DIV:
            {
                ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, w->ne[0]);
                op_tensor = ggml_div(ctx, a, w);
            } break;
        case GGML_OP_ROPE:
            {
                int n_embd_head = hparams.n_embd_head_v;
                int n_head = hparams.n_head();
                ggml_tensor * a = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_embd_head, n_head, 512);
                ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 512);
                op_tensor = ggml_rope_ext(
                    ctx, a, b, w,
                    0, 0, 0, 0, 0,
                    0, 0, 0, 0
                );
            } break;
        case GGML_OP_SSM_CONV:
            {
                const int64_t n_seq_tokens = 512;
                const int64_t n_seqs = 3;
                ggml_tensor * conv_x = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, w->ne[0] - 1 + n_seq_tokens, w->ne[1], n_seqs);
                op_tensor = ggml_ssm_conv(ctx, conv_x, w);
            } break;
        case GGML_OP_SSM_SCAN:
            {
                // w is ssm_a, which is used to distinguish Mamba-1 and Mamba-2
                const int64_t d_state = w->ne[0] == 1 ? hparams.ssm_d_state : w->ne[0];
                const int64_t n_head = w->ne[1];
                const int64_t head_dim = hparams.ssm_d_inner / n_head;
                const int64_t n_group = hparams.ssm_n_group ? hparams.ssm_n_group : 1;
                const int64_t n_seq_tokens = 512;
                const int64_t n_seqs = 3;
                ggml_tensor * s = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, d_state, head_dim, n_head, n_seqs);
                ggml_tensor * x = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, head_dim, n_head, n_seq_tokens, n_seqs);
                ggml_tensor * dt = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_head, n_seq_tokens, n_seqs);
                ggml_tensor * B = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, d_state, n_group, n_seq_tokens, n_seqs);
                ggml_tensor * C = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, d_state, n_group, n_seq_tokens, n_seqs);
                ggml_tensor * ids = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_seqs);
                op_tensor = ggml_ssm_scan(ctx, s, x, dt, w, B, C, ids);
            } break;
        case GGML_OP_RWKV_WKV6:
            {
                // FIXME
                const int64_t S = 123;
                const int64_t H = 123;
                const int64_t n_tokens = 123;
                const int64_t n_seqs = 123;
                ggml_tensor * k = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens);
                ggml_tensor * v = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens);
                ggml_tensor * r = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens);
                ggml_tensor * tf = w;
                ggml_tensor * td = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens);
                ggml_tensor * state = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, S, n_seqs, S, H);
                op_tensor = ggml_rwkv_wkv6(ctx, k, v, r, tf, td, state);
            } break;
        case GGML_OP_IM2COL:
            {
                const int n_embd = hparams.n_embd;
                ggml_tensor * b = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, n_embd, w->ne[1], 1, 1);
                op_tensor = ggml_im2col(ctx, w, b, 1, 0, 0, 0, 1, 0, false, GGML_TYPE_F16);
            } break;
        default:
            GGML_ABORT("%s: missing test for op %s for tensor %s", __func__, ggml_op_name(op), w->name);
    }

    // create a temporary dummy buffer for the weight so that supports_op can check the buffer type
    GGML_ASSERT(w->buffer == nullptr);
    w->buffer = ggml_backend_buft_alloc_buffer(buft, 0);
    bool op_supported = ggml_backend_dev_supports_op(dev, op_tensor);
    ggml_backend_buffer_free(w->buffer);
    w->buffer = nullptr;

    return op_supported;
}
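// --- illustrative example (added commentary, not part of the original llama.cpp source) ---
// The function above probes support by building a throw-away op in a no_alloc context,
// temporarily attaching a zero-sized dummy buffer of the candidate type to the weight, and
// asking the device via ggml_backend_dev_supports_op(). A minimal standalone sketch of the
// same pattern, with a hypothetical 4096x4096 Q4_0 weight checked against the CPU backend:
//
//   ggml_init_params ip = { /*.mem_size =*/ ggml_tensor_overhead()*8, /*.mem_buffer =*/ NULL, /*.no_alloc =*/ true };
//   ggml_context * probe_ctx = ggml_init(ip);
//   ggml_tensor  * w = ggml_new_tensor_2d(probe_ctx, GGML_TYPE_Q4_0, 4096, 4096); // hypothetical weight
//   ggml_tensor  * x = ggml_new_tensor_2d(probe_ctx, GGML_TYPE_F32,  4096, 512);
//   ggml_tensor  * y = ggml_mul_mat(probe_ctx, w, x);
//   ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
//   w->buffer = ggml_backend_buft_alloc_buffer(ggml_backend_cpu_buffer_type(), 0); // dummy, size 0
//   bool ok = ggml_backend_dev_supports_op(cpu_dev, y);
//   ggml_backend_buffer_free(w->buffer);
//   w->buffer = nullptr;
//   ggml_free(probe_ctx);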
// lists of buffer types used for each layer
using buft_list_t = std::vector<std::pair<ggml_backend_dev_t, ggml_backend_buffer_type_t>>;

// find the first buffer type in the list that can use the tensor
static ggml_backend_buffer_type_t select_weight_buft(const llama_hparams & hparams, ggml_tensor * tensor, ggml_op op, const buft_list_t & buft_list) {
    GGML_ASSERT(!buft_list.empty());
    for (const auto & cur : buft_list) {
        ggml_backend_dev_t cur_dev = cur.first;
        ggml_backend_buffer_type_t cur_buft = cur.second;
        if (weight_buft_supported(hparams, tensor, op, cur_buft, cur_dev)) {
            return cur_buft;
        }
    }
    return nullptr;
}
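// --- illustrative example (added commentary, not part of the original llama.cpp source) ---
// Callers are expected to pass a priority-ordered buft_list (see make_cpu_buft_list() /
// make_gpu_buft_list() below) and take the first match, e.g.:
//
//   ggml_backend_buffer_type_t buft = select_weight_buft(hparams, w, GGML_OP_MUL_MAT, buft_list);
//   if (buft == nullptr) {
//       // no (device, buffer type) pair in the list can run this op on this weight
//   }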
// CPU: ACCEL -> GPU host -> CPU extra -> CPU
static buft_list_t make_cpu_buft_list(const std::vector<ggml_backend_dev_t> & devices) {
    buft_list_t buft_list;

    // add ACCEL buffer types
    for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
        ggml_backend_dev_t dev = ggml_backend_dev_get(i);
        if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_ACCEL) {
            auto * buft = ggml_backend_dev_buffer_type(dev);
            // skip
            if (buft != ggml_backend_cpu_buffer_type()) {
                buft_list.emplace_back(dev, buft);
            }
        }
    }

    // add a host buffer type
    // storing the tensors in a host buffer is useful when the processing of large batches
    // is offloaded to a GPU device, since it reduces the time spent on data transfers
    // generally, this will be done using the first device in the list
    // a better approach would be to handle this on a weight-by-weight basis using the offload_op
    // function of the device to determine if it would benefit from being stored in a host buffer
    for (auto * dev : devices) {
        ggml_backend_buffer_type_t buft = ggml_backend_dev_host_buffer_type(dev);
        if (buft) {
            buft_list.emplace_back(dev, buft);
            break;
        }
    }

    // add extra buffer types, only if no GPU device is present
    // ref: https://github.com/ggml-org/llama.cpp/issues/12481#issuecomment-2743136094
    auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
    if (cpu_dev == nullptr) {
        throw std::runtime_error(format("%s: no CPU backend found", __func__));
    }
    auto * cpu_reg = ggml_backend_dev_backend_reg(cpu_dev);
    auto ggml_backend_dev_get_extra_bufts_fn = (ggml_backend_dev_get_extra_bufts_t)
        ggml_backend_reg_get_proc_address(cpu_reg, "ggml_backend_dev_get_extra_bufts");
    if (ggml_backend_dev_get_extra_bufts_fn) {
        ggml_backend_buffer_type_t * extra_bufts = ggml_backend_dev_get_extra_bufts_fn(cpu_dev);
        while (extra_bufts && *extra_bufts) {
            buft_list.emplace_back(cpu_dev, *extra_bufts);
            ++extra_bufts;
        }
    }

    // add the CPU buffer type
    for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
        ggml_backend_dev_t dev = ggml_backend_dev_get(i);
        if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_CPU) {
            buft_list.emplace_back(dev, ggml_backend_dev_buffer_type(dev));
        }
    }

    return buft_list;
}
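// --- worked example (added commentary, not part of the original llama.cpp source) ---
// For a hypothetical machine with one ACCEL backend, one GPU in `devices`, and a CPU
// backend exposing one extra (e.g. repacked) buffer type, the resulting priority list is:
//
//   { (accel_dev, accel_buft),      // ACCEL buffer types first
//     (gpu_dev,   gpu_host_buft),   // pinned host memory of the first GPU that offers it
//     (cpu_dev,   cpu_extra_buft),  // CPU extra buffer types
//     (cpu_dev,   cpu_buft) }       // plain CPU buffer type as the final fallback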
// GPU: split if LLAMA_SPLIT_MODE_ROW -> GPU
static buft_list_t make_gpu_buft_list(ggml_backend_dev_t dev, llama_split_mode split_mode, const float * tensor_split) {
    buft_list_t buft_list;

    // add the device split buffer type if requested and available
    if (split_mode == LLAMA_SPLIT_MODE_ROW) {
        ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev);
        auto ggml_backend_split_buffer_type_fn = (ggml_backend_split_buffer_type_t)
            ggml_backend_reg_get_proc_address(reg, "ggml_backend_split_buffer_type");
        if (ggml_backend_split_buffer_type_fn) {
            size_t dev_index = [&]() {
                auto * reg = ggml_backend_dev_backend_reg(dev);
                for (size_t i = 0; i < ggml_backend_reg_dev_count(reg); ++i) {
                    if (ggml_backend_reg_dev_get(reg, i) == dev) {
                        return i;
                    }
                }
                throw std::runtime_error(format("device %s not found in its backend reg", ggml_backend_dev_name(dev)));
            }();
            auto * buft = ggml_backend_split_buffer_type_fn(dev_index, tensor_split);
            if (buft != nullptr) {
                buft_list.emplace_back(dev, buft);
            }
        }
    }

    // add the device default buffer type
    buft_list.emplace_back(dev, ggml_backend_dev_buffer_type(dev));

    return buft_list;
}
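// --- worked example (added commentary, not part of the original llama.cpp source) ---
// For a GPU device with LLAMA_SPLIT_MODE_ROW and a backend that registers
// "ggml_backend_split_buffer_type", the list is { (dev, split_buft), (dev, default_buft) };
// for any other split mode it is just { (dev, default_buft) }, so select_weight_buft()
// falls back in that order.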
struct llama_model::impl {
    impl() {}
    ~impl() {}

    uint64_t n_elements = 0;

    size_t n_bytes = 0;

    std::string desc_str;

    // model memory mapped files
    llama_mmaps mappings;

    // objects representing data potentially being locked in memory
    llama_mlocks mlock_bufs;
    llama_mlocks mlock_mmaps;

    // contexts where the model tensors metadata is stored
    std::vector<ggml_context_ptr> ctxs;

    // the model memory buffers for the tensor data
    std::vector<ggml_backend_buffer_ptr> bufs;

    buft_list_t cpu_buft_list;
    std::map<ggml_backend_dev_t, buft_list_t> gpu_buft_list;

    struct layer_dev {
        ggml_backend_dev_t dev;
        buft_list_t * buft_list;
    };

    layer_dev dev_input = {};
    layer_dev dev_output = {};
    std::vector<layer_dev> dev_layer;

    bool has_tensor_overrides;
};

llama_model::llama_model(const llama_model_params & params) : params(params), pimpl(std::make_unique<impl>()) {
    pimpl->has_tensor_overrides = params.tensor_buft_overrides && params.tensor_buft_overrides[0].pattern;
}

llama_model::~llama_model() {}

void llama_model::load_stats(llama_model_loader & ml) {
    pimpl->n_elements = ml.n_elements;
    pimpl->n_bytes = ml.n_bytes;
}

void llama_model::load_arch(llama_model_loader & ml) {
    arch = ml.get_arch();
    if (arch == LLM_ARCH_UNKNOWN) {
        throw std::runtime_error("unknown model architecture: '" + ml.get_arch_name() + "'");
    }
}
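// --- illustrative example (added commentary, not part of the original llama.cpp source) ---
// These loaders are used together when a GGUF model is opened; a rough, hypothetical
// call sequence (the real call site lives in the model-loading code, not in this file) is:
//
//   model.load_arch   (ml);   // must come first: later steps branch on `arch`
//   model.load_hparams(ml);   // fills `hparams` from GGUF keys (see below)
//   model.load_stats  (ml);   // records tensor element/byte counts for logging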
  390. void llama_model::load_hparams(llama_model_loader & ml) {
  391. const gguf_context * ctx = ml.meta.get();
  392. // get metadata as string
  393. for (int i = 0; i < gguf_get_n_kv(ctx); i++) {
  394. gguf_type type = gguf_get_kv_type(ctx, i);
  395. if (type == GGUF_TYPE_ARRAY) {
  396. continue;
  397. }
  398. const char * name = gguf_get_key(ctx, i);
  399. const std::string value = gguf_kv_to_str(ctx, i);
  400. gguf_kv.emplace(name, value);
  401. }
  402. // get general kv
  403. ml.get_key(LLM_KV_GENERAL_NAME, name, false);
  404. // everything past this point is not vocab-related
  405. if (hparams.vocab_only) {
  406. return;
  407. }
  408. ml.get_key(LLM_KV_CONTEXT_LENGTH, hparams.n_ctx_train);
  409. ml.get_key(LLM_KV_EMBEDDING_LENGTH, hparams.n_embd);
  410. ml.get_key(LLM_KV_BLOCK_COUNT, hparams.n_layer);
  411. ml.get_key(LLM_KV_EXPERT_COUNT, hparams.n_expert, false);
  412. ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used, false);
  413. if (arch == LLM_ARCH_WAVTOKENIZER_DEC) {
  414. ml.get_key(LLM_KV_FEATURES_LENGTH, hparams.n_embd_features);
  415. ml.get_key(LLM_KV_POSNET_EMBEDDING_LENGTH, hparams.posnet.n_embd);
  416. ml.get_key(LLM_KV_POSNET_BLOCK_COUNT, hparams.posnet.n_layer);
  417. ml.get_key(LLM_KV_CONVNEXT_EMBEDDING_LENGTH, hparams.convnext.n_embd);
  418. ml.get_key(LLM_KV_CONVNEXT_BLOCK_COUNT, hparams.convnext.n_layer);
  419. }
  420. GGML_ASSERT(hparams.n_expert <= LLAMA_MAX_EXPERTS);
  421. GGML_ASSERT(hparams.n_expert_used <= hparams.n_expert);
  422. if (hparams.n_expert > 0) {
  423. GGML_ASSERT(hparams.n_expert_used > 0);
  424. } else {
  425. GGML_ASSERT(hparams.n_expert_used == 0);
  426. }
  427. std::fill(hparams.n_head_arr.begin(), hparams.n_head_arr.end(), 0);
  428. std::fill(hparams.n_head_kv_arr.begin(), hparams.n_head_kv_arr.end(), 0);
  429. std::fill(hparams.n_ff_arr.begin(), hparams.n_ff_arr.end(), 0);
  430. std::fill(
  431. hparams.recurrent_layer_arr.begin(),
  432. hparams.recurrent_layer_arr.end(),
  433. llm_arch_is_recurrent(ml.get_arch()));
  434. std::fill(hparams.rope_sections.begin(), hparams.rope_sections.end(), 0);
  435. std::fill(hparams.swa_layers.begin(), hparams.swa_layers.end(), 0);
  436. ml.get_key_or_arr(LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff_arr, hparams.n_layer, false);
  437. ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head_arr, hparams.n_layer, false);
  438. // n_head_kv is optional, default to n_head
  439. hparams.n_head_kv_arr = hparams.n_head_arr;
  440. ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT_KV, hparams.n_head_kv_arr, hparams.n_layer, false);
  441. bool rope_finetuned = false;
  442. ml.get_key(LLM_KV_ROPE_SCALING_FINETUNED, rope_finetuned, false);
  443. hparams.rope_finetuned = rope_finetuned;
  444. hparams.n_ctx_orig_yarn = hparams.n_ctx_train;
  445. ml.get_key(LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, hparams.n_ctx_orig_yarn, false);
  446. // rope_freq_base (optional)
  447. hparams.rope_freq_base_train = 10000.0f;
  448. ml.get_key(LLM_KV_ROPE_FREQ_BASE, hparams.rope_freq_base_train, false);
  449. std::string rope_scaling("linear");
  450. ml.get_key(LLM_KV_ROPE_SCALING_TYPE, rope_scaling, false);
  451. hparams.rope_scaling_type_train = llama_rope_scaling_type_from_string(rope_scaling);
  452. GGML_ASSERT(hparams.rope_scaling_type_train != LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED);
  453. // rope_freq_scale (inverse of the kv) is optional
  454. float ropescale = 0.0f;
  455. if (!ml.get_key(LLM_KV_ROPE_SCALING_FACTOR, ropescale, false)) {
  456. // try the old key name
  457. ml.get_key(LLM_KV_ROPE_SCALE_LINEAR, ropescale, false);
  458. }
  459. hparams.rope_freq_scale_train = ropescale == 0.0f ? 1.0f : 1.0f/ropescale;
  460. // by default assume that the sliding-window layers use the same scaling type as the non-sliding-window layers
  461. hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
  462. hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
  463. ml.get_key(LLM_KV_ROPE_SCALING_ATTN_FACTOR, hparams.rope_attn_factor, false);
  464. // non-transformer models do not have attention heads
  465. if (hparams.n_head() > 0) {
  466. // gpt-neox n_rot = rotary_pct * (n_embd / n_head)
  467. // gpt-j n_rot = rotary_dim
  468. hparams.n_embd_head_k = hparams.n_embd / hparams.n_head();
  469. ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH, hparams.n_embd_head_k, false);
  470. hparams.n_embd_head_v = hparams.n_embd / hparams.n_head();
  471. ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH, hparams.n_embd_head_v, false);
  472. // sanity check for n_rot (optional)
  473. hparams.n_rot = hparams.n_embd_head_k;
  474. ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot, false);
  475. if (arch == LLM_ARCH_LLAMA || arch == LLM_ARCH_DECI || arch == LLM_ARCH_FALCON) {
  476. if (hparams.n_rot != hparams.n_embd_head_k) {
  477. throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd_head_k));
  478. }
  479. }
  480. } else {
  481. hparams.n_rot = 0;
  482. hparams.n_embd_head_k = 0;
  483. hparams.n_embd_head_v = 0;
  484. }
  485. // for differentiating model types
  486. uint32_t n_vocab = 0;
  487. ml.get_key(LLM_KV_VOCAB_SIZE, n_vocab, false) || ml.get_arr_n(LLM_KV_TOKENIZER_LIST, n_vocab, false);
  488. // for classifier models
  489. ml.get_arr(LLM_KV_CLASSIFIER_OUTPUT_LABELS, classifier_labels, false);
  490. if (!classifier_labels.empty()) {
  491. hparams.n_cls_out = classifier_labels.size();
  492. }
  493. // arch-specific KVs
  494. switch (arch) {
  495. case LLM_ARCH_LLAMA:
  496. {
  497. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  498. if (hparams.n_expert == 8) {
  499. switch (hparams.n_layer) {
  500. case 32: type = LLM_TYPE_8x7B; break;
  501. case 56: type = LLM_TYPE_8x22B; break;
  502. default: type = LLM_TYPE_UNKNOWN;
  503. }
  504. } else {
  505. switch (hparams.n_layer) {
  506. case 16: type = LLM_TYPE_1B; break; // Llama 3.2 1B
  507. case 22: type = LLM_TYPE_1B; break;
  508. case 26: type = LLM_TYPE_3B; break;
  509. case 28: type = LLM_TYPE_3B; break; // Llama 3.2 3B
  510. case 30: type = LLM_TYPE_256M; break; // smoldocling 256M
  511. // granite uses a vocab with len 49152
  512. case 32: type = n_vocab == 49152 ? LLM_TYPE_3B : (n_vocab < 40000 ? LLM_TYPE_7B : LLM_TYPE_8B); break;
  513. case 36: type = LLM_TYPE_8B; break; // granite
  514. case 40: type = LLM_TYPE_13B; break;
  515. case 48: type = LLM_TYPE_34B; break;
  516. case 60: type = LLM_TYPE_30B; break;
  517. case 80: type = hparams.n_head() == hparams.n_head_kv() ? LLM_TYPE_65B : LLM_TYPE_70B; break;
  518. default: type = LLM_TYPE_UNKNOWN;
  519. }
  520. }
  521. } break;
  522. case LLM_ARCH_LLAMA4:
  523. {
  524. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  525. ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
  526. ml.get_key(LLM_KV_INTERLEAVE_MOE_LAYER_STEP, hparams.n_moe_layer_step);
  527. hparams.swa_type = LLAMA_SWA_TYPE_CHUNKED;
  528. hparams.n_swa = 8192; // should this be a gguf kv? currently it's the same for Scout and Maverick
  529. hparams.set_swa_pattern(4); // pattern: 3 chunked - 1 full
  530. switch (hparams.n_expert) {
  531. case 16: type = LLM_TYPE_17B_16E; break;
  532. case 128: type = LLM_TYPE_17B_128E; break;
  533. default: type = LLM_TYPE_UNKNOWN;
  534. }
  535. if (type == LLM_TYPE_17B_128E) {
  536. hparams.use_kq_norm = false;
  537. }
  538. } break;
  539. case LLM_ARCH_ARCEE:
  540. {
  541. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  542. // Arcee uses the same structure as Llama
  543. switch (hparams.n_layer) {
  544. case 36: type = LLM_TYPE_4B; break;
  545. default: type = LLM_TYPE_UNKNOWN;
  546. }
  547. } break;
  548. case LLM_ARCH_DECI:
  549. {
  550. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  551. switch (hparams.n_layer) {
  552. case 32: type = LLM_TYPE_7B; break;
  553. case 80: type = LLM_TYPE_70B; break;
  554. case 162: type = LLM_TYPE_405B; break;
  555. default: type = LLM_TYPE_UNKNOWN;
  556. }
  557. } break;
  558. case LLM_ARCH_MINICPM:
  559. {
  560. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  561. ml.get_key(LLM_KV_EMBEDDING_SCALE, hparams.f_embedding_scale);
  562. ml.get_key(LLM_KV_RESIDUAL_SCALE, hparams.f_residual_scale);
  563. ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
  564. // MiniCPM uses rope by default, unlike Granite which uses it as a switch
  565. hparams.rope_finetuned = true;
  566. switch (hparams.n_layer) {
  567. case 52: type = LLM_TYPE_1B; break;
  568. case 40: type = LLM_TYPE_2B; break;
  569. default: type = LLM_TYPE_UNKNOWN;
  570. }
  571. } break;
  572. case LLM_ARCH_MINICPM3:
  573. {
  574. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  575. ml.get_key(LLM_KV_ATTENTION_Q_LORA_RANK, hparams.n_lora_q);
  576. ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK, hparams.n_lora_kv);
  577. switch (hparams.n_layer) {
  578. case 62: type = LLM_TYPE_4B; break;
  579. default: type = LLM_TYPE_UNKNOWN;
  580. }
  581. } break;
  582. case LLM_ARCH_GROK:
  583. {
  584. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  585. switch (hparams.n_layer) {
  586. case 64: type = LLM_TYPE_314B; break;
  587. default: type = LLM_TYPE_UNKNOWN;
  588. }
  589. } break;
  590. case LLM_ARCH_FALCON:
  591. {
  592. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  593. switch (hparams.n_layer) {
  594. case 32: type = LLM_TYPE_7B; break;
  595. case 60: type = LLM_TYPE_40B; break;
  596. default: type = LLM_TYPE_UNKNOWN;
  597. }
  598. } break;
  599. case LLM_ARCH_BAICHUAN:
  600. {
  601. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  602. switch (hparams.n_layer) {
  603. case 32: type = LLM_TYPE_7B; break;
  604. case 40: type = LLM_TYPE_13B; break;
  605. default: type = LLM_TYPE_UNKNOWN;
  606. }
  607. if (type == LLM_TYPE_13B) {
  608. // TODO: become GGUF KV parameter
  609. hparams.f_max_alibi_bias = 8.0f;
  610. }
  611. } break;
  612. case LLM_ARCH_STARCODER:
  613. {
  614. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  615. switch (hparams.n_layer) {
  616. case 24: type = LLM_TYPE_1B; break;
  617. case 36: type = LLM_TYPE_3B; break;
  618. case 42: type = LLM_TYPE_7B; break;
  619. case 40: type = LLM_TYPE_15B; break;
  620. default: type = LLM_TYPE_UNKNOWN;
  621. }
  622. } break;
  623. case LLM_ARCH_REFACT:
  624. {
  625. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  626. switch (hparams.n_layer) {
  627. case 32: type = LLM_TYPE_1B; break;
  628. default: type = LLM_TYPE_UNKNOWN;
  629. }
  630. // TODO: become GGUF KV parameter
  631. hparams.f_max_alibi_bias = 8.0f;
  632. } break;
  633. case LLM_ARCH_BERT:
  634. {
  635. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  636. ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
  637. ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
  638. switch (hparams.n_layer) {
  639. case 3:
  640. type = LLM_TYPE_17M; break; // bge-micro
  641. case 6:
  642. type = LLM_TYPE_22M; break; // MiniLM-L6
  643. case 12:
  644. switch (hparams.n_embd) {
  645. case 384: type = LLM_TYPE_33M; break; // MiniLM-L12, bge-small
  646. case 768: type = LLM_TYPE_109M; break; // bge-base
  647. default: type = LLM_TYPE_UNKNOWN;
  648. } break;
  649. case 24:
  650. type = LLM_TYPE_335M; break; // bge-large
  651. default: type = LLM_TYPE_UNKNOWN;
  652. }
  653. } break;
  654. case LLM_ARCH_JINA_BERT_V2:
  655. {
  656. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  657. ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
  658. ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
  659. hparams.f_max_alibi_bias = 8.0f;
  660. switch (hparams.n_layer) {
  661. case 4: type = LLM_TYPE_33M; break; // jina-embeddings-small
  662. case 12: type = LLM_TYPE_137M; break; // jina-embeddings-base
  663. default: type = LLM_TYPE_UNKNOWN;
  664. }
  665. } break;
  666. case LLM_ARCH_NOMIC_BERT:
  667. case LLM_ARCH_NOMIC_BERT_MOE:
  668. {
  669. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  670. ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
  671. ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
  672. ml.get_key(LLM_KV_MOE_EVERY_N_LAYERS, hparams.moe_every_n_layers, 0);
  673. if (hparams.n_layer == 12 && hparams.n_embd == 768) {
  674. if (arch == LLM_ARCH_NOMIC_BERT) {
  675. type = LLM_TYPE_137M;
  676. } else if (arch == LLM_ARCH_NOMIC_BERT_MOE && hparams.moe_every_n_layers == 2) {
  677. type = LLM_TYPE_475M;
  678. }
  679. }
  680. } break;
  681. case LLM_ARCH_NEO_BERT:
  682. {
  683. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  684. ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
  685. ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
  686. if (hparams.n_layer == 28) {
  687. type = LLM_TYPE_250M;
  688. }
  689. } break;
  690. case LLM_ARCH_BLOOM:
  691. {
  692. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  693. switch (hparams.n_layer) {
  694. case 24: type = LLM_TYPE_1B; break;
  695. case 30:
  696. switch (hparams.n_embd) {
  697. case 2560: type = LLM_TYPE_3B; break;
  698. case 4096: type = LLM_TYPE_7B; break;
  699. default: type = LLM_TYPE_UNKNOWN;
  700. } break;
  701. default: type = LLM_TYPE_UNKNOWN;
  702. }
  703. // TODO: become GGUF KV parameter
  704. hparams.f_max_alibi_bias = 8.0f;
  705. } break;
  706. case LLM_ARCH_MPT:
  707. {
  708. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  709. ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV, hparams.f_clamp_kqv, false);
  710. ml.get_key(LLM_KV_ATTENTION_MAX_ALIBI_BIAS, hparams.f_max_alibi_bias);
  711. switch (hparams.n_layer) {
  712. case 32: type = LLM_TYPE_7B; break;
  713. case 48: type = LLM_TYPE_30B; break;
  714. default: type = LLM_TYPE_UNKNOWN;
  715. }
  716. } break;
  717. case LLM_ARCH_STABLELM:
  718. {
  719. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  720. switch (hparams.n_layer) {
  721. case 24: type = LLM_TYPE_1B; break;
  722. case 32: type = LLM_TYPE_3B; break;
  723. case 40: type = LLM_TYPE_12B; break;
  724. default: type = LLM_TYPE_UNKNOWN;
  725. }
  726. } break;
  727. case LLM_ARCH_QWEN:
  728. {
  729. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  730. switch (hparams.n_layer) {
  731. case 32: type = LLM_TYPE_7B; break;
  732. case 40: type = LLM_TYPE_13B; break;
  733. default: type = LLM_TYPE_UNKNOWN;
  734. }
  735. } break;
  736. case LLM_ARCH_QWEN2VL:
  737. {
  738. ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, true);
  739. }
  740. // fall through
  741. case LLM_ARCH_QWEN2:
  742. {
  743. ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
  744. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  745. switch (hparams.n_layer) {
  746. case 24: type = hparams.n_embd == 1024 ? LLM_TYPE_0_5B : LLM_TYPE_1B; break;
  747. case 28: type = hparams.n_embd == 1536 ? LLM_TYPE_1_5B : LLM_TYPE_7B; break;
  748. case 32: type = LLM_TYPE_7B; break;
  749. case 36: type = LLM_TYPE_3B; break;
  750. case 40: type = hparams.n_head() == 20 ? LLM_TYPE_4B : LLM_TYPE_13B; break;
  751. case 48: type = LLM_TYPE_14B; break;
  752. case 64: type = LLM_TYPE_32B; break;
  753. case 80: type = LLM_TYPE_70B; break;
  754. default: type = LLM_TYPE_UNKNOWN;
  755. }
  756. } break;
  757. case LLM_ARCH_DREAM:
  758. {
  759. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  760. // Dream models are primarily 7B with 28 layers
  761. switch (hparams.n_layer) {
  762. case 28:
  763. type = LLM_TYPE_7B;
  764. break;
  765. default:
  766. type = LLM_TYPE_UNKNOWN;
  767. }
  768. // Set non-causal attention for diffusion models
  769. hparams.causal_attn = false;
  770. }
  771. break;
  772. case LLM_ARCH_QWEN2MOE:
  773. {
  774. ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
  775. ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false);
  776. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  777. switch (hparams.n_layer) {
  778. case 24: type = LLM_TYPE_A2_7B; break;
  779. case 28: type = LLM_TYPE_57B_A14B; break;
  780. default: type = LLM_TYPE_UNKNOWN;
  781. }
  782. } break;
  783. case LLM_ARCH_QWEN3:
  784. {
  785. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  786. switch (hparams.n_layer) {
  787. case 28: type = hparams.n_embd == 1024 ? LLM_TYPE_0_6B : LLM_TYPE_1_7B; break;
  788. case 36: type = hparams.n_embd == 2560 ? LLM_TYPE_4B : LLM_TYPE_8B; break;
  789. case 40: type = LLM_TYPE_14B; break;
  790. case 64: type = LLM_TYPE_32B; break;
  791. default: type = LLM_TYPE_UNKNOWN;
  792. }
  793. } break;
  794. case LLM_ARCH_QWEN3MOE:
  795. {
  796. ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
  797. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  798. switch (hparams.n_layer) {
  799. case 48: type = LLM_TYPE_30B_A3B; break;
  800. case 94: type = LLM_TYPE_235B_A22B; break;
  801. default: type = LLM_TYPE_UNKNOWN;
  802. }
  803. } break;
  804. case LLM_ARCH_PHI2:
  805. {
  806. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  807. switch (hparams.n_layer) {
  808. case 24: type = LLM_TYPE_1B; break;
  809. case 32: type = LLM_TYPE_3B; break;
  810. default: type = LLM_TYPE_UNKNOWN;
  811. }
  812. } break;
  813. case LLM_ARCH_PHI3:
  814. {
  815. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  816. switch (hparams.n_layer) {
  817. case 24: type = LLM_TYPE_1B; break;
  818. case 32: type = LLM_TYPE_3B; break;
  819. case 40: type = LLM_TYPE_14B; break;
  820. default: type = LLM_TYPE_UNKNOWN;
  821. }
  822. const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
  823. if (found_swa && hparams.n_swa > 0) {
  824. LLAMA_LOG_WARN("%s: Phi SWA is currently disabled - results might be suboptimal for some models (see %s)\n",
  825. __func__, "https://github.com/ggml-org/llama.cpp/pull/13676");
  826. // TODO: fix conversion scripts to correctly populate `n_swa` and `n_swa_pattern`
  827. hparams.swa_type = LLAMA_SWA_TYPE_NONE;
  828. hparams.n_swa = 0;
  829. hparams.set_swa_pattern(1);
  830. }
  831. } break;
  832. case LLM_ARCH_PHIMOE:
  833. {
  834. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  835. switch (hparams.n_layer) {
  836. case 32: type = LLM_TYPE_16x3_8B; break;
  837. default: type = LLM_TYPE_UNKNOWN;
  838. }
  839. } break;
  840. case LLM_ARCH_PLAMO:
  841. {
  842. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  843. switch (hparams.n_layer) {
  844. case 40: type = LLM_TYPE_13B; break;
  845. default: type = LLM_TYPE_UNKNOWN;
  846. }
  847. } break;
  848. case LLM_ARCH_PLAMO2:
  849. {
  850. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  851. // Load Mamba SSM parameters
  852. ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv);
  853. ml.get_key(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner);
  854. ml.get_key(LLM_KV_SSM_STATE_SIZE, hparams.ssm_d_state);
  855. ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
  856. ml.get_key(LLM_KV_SSM_GROUP_COUNT, hparams.ssm_n_group);
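// layers whose KV head count is zero are the recurrent (Mamba) layers; the remaining layers use regular attention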
  857. for (uint32_t i = 0; i < hparams.n_layer; ++i) {
  858. hparams.recurrent_layer_arr[i] = hparams.n_head_kv(i) == 0;
  859. }
  860. switch (hparams.n_layer) {
  861. case 16: type = LLM_TYPE_1B; break;
  862. case 32:
  863. if (hparams.n_embd == 2048) {
  864. type = LLM_TYPE_2B;
  865. } else if (hparams.n_embd == 4096) {
  866. type = LLM_TYPE_8B;
  867. }
  868. break;
  869. default: type = LLM_TYPE_UNKNOWN;
  870. }
  871. } break;
  872. case LLM_ARCH_GPT2:
  873. {
  874. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  875. switch (hparams.n_layer) {
  876. case 12: type = LLM_TYPE_SMALL; break;
  877. case 24: type = LLM_TYPE_MEDIUM; break;
  878. case 36: type = LLM_TYPE_LARGE; break;
  879. case 48: type = LLM_TYPE_XL; break;
  880. default: type = LLM_TYPE_UNKNOWN;
  881. }
  882. } break;
  883. case LLM_ARCH_CODESHELL:
  884. {
  885. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  886. switch (hparams.n_layer) {
  887. case 42: type = LLM_TYPE_7B; break;
  888. default: type = LLM_TYPE_UNKNOWN;
  889. }
  890. } break;
  891. case LLM_ARCH_ORION:
  892. {
  893. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  894. switch (hparams.n_layer) {
  895. case 40: type = LLM_TYPE_14B; break;
  896. default: type = LLM_TYPE_UNKNOWN;
  897. }
  898. } break;
  899. case LLM_ARCH_INTERNLM2:
  900. {
  901. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  902. switch (hparams.n_layer) {
  903. case 32: type = LLM_TYPE_7B; break;
  904. case 48: type = LLM_TYPE_20B; break;
  905. default: type = LLM_TYPE_UNKNOWN;
  906. }
  907. } break;
  908. case LLM_ARCH_GEMMA:
  909. {
  910. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  911. switch (hparams.n_layer) {
  912. case 18: type = LLM_TYPE_2B; break;
  913. case 28: type = LLM_TYPE_7B; break;
  914. default: type = LLM_TYPE_UNKNOWN;
  915. }
  916. } break;
  917. case LLM_ARCH_GEMMA2:
  918. {
  919. hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
920. hparams.n_swa = 4096; // default value for Gemma 2
  921. hparams.set_swa_pattern(2);
  922. hparams.attn_soft_cap = true;
  923. ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
  924. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  925. ml.get_key(LLM_KV_ATTN_LOGIT_SOFTCAPPING, hparams.f_attn_logit_softcapping, false);
  926. ml.get_key(LLM_KV_FINAL_LOGIT_SOFTCAPPING, hparams.f_final_logit_softcapping, false);
  927. switch (hparams.n_layer) {
  928. case 26: type = LLM_TYPE_2B; break;
  929. case 42: type = LLM_TYPE_9B; break;
  930. case 46: type = LLM_TYPE_27B; break;
  931. default: type = LLM_TYPE_UNKNOWN;
  932. }
  933. // ref: https://github.com/google/gemma_pytorch/blob/014acb7ac4563a5f77c76d7ff98f31b568c16508/gemma/config.py#L173
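// the 27B model scales attention by 1/sqrt(n_embd / n_head) rather than the usual 1/sqrt(head_dim), matching the reference config above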
  934. hparams.f_attention_scale = type == LLM_TYPE_27B
  935. ? 1.0f / std::sqrt(float(hparams.n_embd / hparams.n_head(0)))
  936. : 1.0f / std::sqrt(float(hparams.n_embd_head_k));
  937. } break;
  938. case LLM_ARCH_GEMMA3:
  939. {
  940. hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
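// Gemma 3 interleaves local and global attention 5:1 (five sliding-window layers per full-attention layer)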
  941. hparams.set_swa_pattern(6);
  942. hparams.rope_freq_base_train_swa = 10000.0f;
  943. hparams.rope_freq_scale_train_swa = 1.0f;
  944. ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
  945. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  946. switch (hparams.n_layer) {
  947. case 26: type = LLM_TYPE_1B; break;
  948. case 34: type = LLM_TYPE_4B; break;
  949. case 48: type = LLM_TYPE_12B; break;
  950. case 62: type = LLM_TYPE_27B; break;
  951. default: type = LLM_TYPE_UNKNOWN;
  952. }
  953. // ref: https://github.com/google/gemma_pytorch/blob/014acb7ac4563a5f77c76d7ff98f31b568c16508/gemma/config.py#L289
  954. hparams.f_attention_scale = type == LLM_TYPE_27B
  955. ? 1.0f / std::sqrt(float(hparams.n_embd / hparams.n_head(0)))
  956. : 1.0f / std::sqrt(float(hparams.n_embd_head_k));
  957. } break;
  958. case LLM_ARCH_GEMMA3N:
  959. {
  960. hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
  961. hparams.set_swa_pattern(5);
  962. hparams.rope_freq_base_train_swa = 10000.0f;
  963. hparams.rope_freq_scale_train_swa = 1.0f;
  964. hparams.f_attention_scale = 1.0f;
  965. ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
  966. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  967. switch (hparams.n_layer) {
  968. case 30: type = LLM_TYPE_E2B; break;
  969. case 35: type = LLM_TYPE_E4B; break;
  970. default: type = LLM_TYPE_UNKNOWN;
  971. }
  972. } break;
  973. case LLM_ARCH_STARCODER2:
  974. {
  975. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  976. switch (hparams.n_layer) {
  977. case 30: type = LLM_TYPE_3B; break;
  978. case 32: type = LLM_TYPE_7B; break;
  979. case 40: type = LLM_TYPE_15B; break;
  980. case 52: type = LLM_TYPE_20B; break; // granite
  981. case 88: type = LLM_TYPE_34B; break; // granite
  982. default: type = LLM_TYPE_UNKNOWN;
  983. }
  984. } break;
  985. case LLM_ARCH_MAMBA:
  986. {
  987. ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv);
  988. ml.get_key(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner);
  989. ml.get_key(LLM_KV_SSM_STATE_SIZE, hparams.ssm_d_state);
  990. ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
  991. ml.get_key(LLM_KV_SSM_DT_B_C_RMS, hparams.ssm_dt_b_c_rms, false);
  992. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  993. switch (hparams.n_layer) {
  994. case 24:
  995. switch (hparams.n_embd) {
  996. case 768: type = LLM_TYPE_SMALL; break;
  997. default: type = LLM_TYPE_UNKNOWN;
  998. } break;
  999. case 48:
  1000. switch (hparams.n_embd) {
  1001. case 1024: type = LLM_TYPE_MEDIUM; break;
  1002. case 1536: type = LLM_TYPE_LARGE; break;
  1003. case 2048: type = LLM_TYPE_XL; break;
  1004. default: type = LLM_TYPE_UNKNOWN;
  1005. } break;
  1006. case 64:
  1007. switch (hparams.n_embd) {
  1008. case 2560: type = LLM_TYPE_3B; break;
  1009. default: type = LLM_TYPE_UNKNOWN;
  1010. } break;
  1011. default: type = LLM_TYPE_UNKNOWN;
  1012. }
  1013. } break;
  1014. case LLM_ARCH_MAMBA2:
  1015. {
  1016. ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv);
  1017. ml.get_key(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner);
  1018. ml.get_key(LLM_KV_SSM_STATE_SIZE, hparams.ssm_d_state);
  1019. ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
  1020. ml.get_key(LLM_KV_SSM_GROUP_COUNT, hparams.ssm_n_group);
  1021. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1022. switch (hparams.n_layer) {
  1023. case 24:
  1024. switch (hparams.n_embd) {
  1025. case 768: type = LLM_TYPE_SMALL; break;
  1026. default: type = LLM_TYPE_UNKNOWN;
  1027. } break;
  1028. case 48:
  1029. switch (hparams.n_embd) {
  1030. case 1024: type = LLM_TYPE_MEDIUM; break;
  1031. case 1536: type = LLM_TYPE_LARGE; break;
  1032. case 2048: type = LLM_TYPE_XL; break;
  1033. default: type = LLM_TYPE_UNKNOWN;
  1034. } break;
  1035. case 64:
  1036. switch (hparams.n_embd) {
  1037. case 2560: type = LLM_TYPE_3B; break;
  1038. case 4096: type = LLM_TYPE_7B; break;
  1039. default: type = LLM_TYPE_UNKNOWN;
  1040. } break;
  1041. default: type = LLM_TYPE_UNKNOWN;
  1042. }
  1043. } break;
  1044. case LLM_ARCH_JAMBA:
  1045. {
  1046. ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv);
  1047. ml.get_key(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner);
  1048. ml.get_key(LLM_KV_SSM_STATE_SIZE, hparams.ssm_d_state);
  1049. ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
  1050. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1051. for (uint32_t i = 0; i < hparams.n_layer; ++i) {
  1052. hparams.recurrent_layer_arr[i] = hparams.n_head_kv(i) == 0;
  1053. }
  1054. switch (hparams.n_layer) {
1055. // TODO: Jamba layers are a bit heterogeneous, so naming this is hard.
  1056. case 12: // 900M 8x???M
  1057. case 32: // 51B 16x?B
  1058. default: type = LLM_TYPE_UNKNOWN;
  1059. }
  1060. } break;
  1061. case LLM_ARCH_XVERSE:
  1062. {
  1063. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1064. switch (hparams.n_layer) {
  1065. case 32: type = LLM_TYPE_7B; break;
  1066. case 40: type = LLM_TYPE_13B; break;
  1067. case 80: type = LLM_TYPE_65B; break;
  1068. default: type = LLM_TYPE_UNKNOWN;
  1069. }
  1070. } break;
  1071. case LLM_ARCH_COMMAND_R:
  1072. {
  1073. ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
  1074. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  1075. switch (hparams.n_layer) {
  1076. case 40: type = LLM_TYPE_35B; break;
  1077. default: type = LLM_TYPE_UNKNOWN;
  1078. }
  1079. } break;
  1080. case LLM_ARCH_COHERE2:
  1081. {
  1082. hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
  1083. hparams.set_swa_pattern(4);
  1084. ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
  1085. ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
  1086. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  1087. switch (hparams.n_layer) {
  1088. case 32: type = LLM_TYPE_8B; break;
  1089. default: type = LLM_TYPE_UNKNOWN;
  1090. }
  1091. } break;
  1092. case LLM_ARCH_DBRX:
  1093. {
  1094. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  1095. ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV, hparams.f_clamp_kqv);
  1096. switch (hparams.n_layer) {
  1097. case 40: type = LLM_TYPE_16x12B; break;
  1098. default: type = LLM_TYPE_UNKNOWN;
  1099. }
  1100. } break;
  1101. case LLM_ARCH_OLMO:
  1102. {
  1103. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  1104. ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV, hparams.f_clamp_kqv, false);
  1105. switch (hparams.n_layer) {
  1106. case 22: type = LLM_TYPE_1B; break;
  1107. case 32: type = LLM_TYPE_7B; break;
  1108. case 80: type = LLM_TYPE_70B; break;
  1109. default: type = LLM_TYPE_UNKNOWN;
  1110. }
  1111. } break;
  1112. case LLM_ARCH_OLMO2:
  1113. {
  1114. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1115. switch (hparams.n_layer) {
  1116. case 16: type = LLM_TYPE_1B; break;
  1117. case 32: type = LLM_TYPE_7B; break;
  1118. case 40: type = LLM_TYPE_13B; break;
  1119. case 64: type = LLM_TYPE_32B; break;
  1120. default: type = LLM_TYPE_UNKNOWN;
  1121. }
  1122. } break;
  1123. case LLM_ARCH_OLMOE:
  1124. {
  1125. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1126. switch (hparams.n_layer) {
  1127. case 16: type = LLM_TYPE_A1_7B; break;
  1128. default: type = LLM_TYPE_UNKNOWN;
  1129. }
  1130. } break;
  1131. case LLM_ARCH_OPENELM:
  1132. {
  1133. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1134. switch (hparams.n_layer) {
  1135. case 16: type = LLM_TYPE_270M; break;
  1136. case 20: type = LLM_TYPE_450M; break;
  1137. case 28: type = LLM_TYPE_1B; break;
  1138. case 36: type = LLM_TYPE_3B; break;
  1139. default: type = LLM_TYPE_UNKNOWN;
  1140. }
  1141. } break;
  1142. case LLM_ARCH_GPTNEOX:
  1143. {
  1144. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  1145. ml.get_key(LLM_KV_USE_PARALLEL_RESIDUAL, hparams.use_par_res);
  1146. switch (hparams.n_layer) {
  1147. case 6:
  1148. switch (hparams.n_ff()) {
  1149. case 512: type = LLM_TYPE_14M; break;
  1150. case 2048: type = LLM_TYPE_70M; break;
  1151. default: type = LLM_TYPE_UNKNOWN;
  1152. } break;
  1153. case 12:
  1154. switch (hparams.n_ff()) {
  1155. case 3072: type = LLM_TYPE_160M; break;
  1156. default: type = LLM_TYPE_UNKNOWN;
  1157. } break;
  1158. case 16:
  1159. switch (hparams.n_ff()) {
  1160. case 8192: type = LLM_TYPE_1B; break;
  1161. default: type = LLM_TYPE_UNKNOWN;
  1162. } break;
  1163. case 24:
  1164. switch (hparams.n_ff()) {
  1165. case 4096: type = LLM_TYPE_410M; break;
  1166. case 8192: type = LLM_TYPE_1_4B; break;
  1167. default: type = LLM_TYPE_UNKNOWN;
  1168. } break;
  1169. case 32:
  1170. switch (hparams.n_ff()) {
  1171. case 10240: type = LLM_TYPE_2_8B; break;
  1172. case 16384: type = LLM_TYPE_6_9B; break;
  1173. default: type = LLM_TYPE_UNKNOWN;
  1174. } break;
  1175. case 36:
  1176. switch (hparams.n_ff()) {
  1177. case 20480: type = LLM_TYPE_12B; break;
  1178. default: type = LLM_TYPE_UNKNOWN;
  1179. } break;
  1180. case 44:
  1181. switch (hparams.n_ff()) {
  1182. case 24576: type = LLM_TYPE_20B; break;
  1183. default: type = LLM_TYPE_UNKNOWN;
  1184. } break;
  1185. default: type = LLM_TYPE_UNKNOWN;
  1186. }
  1187. } break;
  1188. case LLM_ARCH_ARCTIC:
  1189. {
  1190. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1191. if (hparams.n_expert == 128) {
  1192. switch (hparams.n_layer) {
  1193. case 35: type = LLM_TYPE_10B_128x3_66B; break;
  1194. default: type = LLM_TYPE_UNKNOWN;
  1195. }
  1196. } else {
  1197. type = LLM_TYPE_UNKNOWN;
  1198. }
  1199. } break;
  1200. case LLM_ARCH_DEEPSEEK:
  1201. {
  1202. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1203. ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
  1204. ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
  1205. ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
  1206. ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
  1207. switch (hparams.n_layer) {
  1208. case 28: type = LLM_TYPE_20B; break;
  1209. default: type = LLM_TYPE_UNKNOWN;
  1210. }
  1211. } break;
  1212. case LLM_ARCH_DEEPSEEK2:
  1213. {
  1214. bool is_lite = (hparams.n_layer == 27);
  1215. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1216. ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
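// DeepSeek-V2-Lite (27 layers) does not use a low-rank Q projection, so the Q LoRA rank is only read for the larger models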
  1217. if (!is_lite) {
  1218. ml.get_key(LLM_KV_ATTENTION_Q_LORA_RANK, hparams.n_lora_q);
  1219. }
  1220. ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK, hparams.n_lora_kv);
  1221. ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH_MLA, hparams.n_embd_head_k_mla, false);
  1222. ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH_MLA, hparams.n_embd_head_v_mla, false);
  1223. ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
  1224. ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
  1225. ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
  1226. ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
  1227. ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false);
  1228. if (hparams.expert_gating_func == LLAMA_EXPERT_GATING_FUNC_TYPE_NONE) {
  1229. // for compatibility with existing DeepSeek V2 and V2.5 GGUFs
  1230. // that have no expert_gating_func model parameter set
  1231. hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX;
  1232. }
  1233. ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul);
  1234. switch (hparams.n_layer) {
  1235. case 27: type = LLM_TYPE_16B; break;
  1236. case 60: type = LLM_TYPE_236B; break;
  1237. case 61: type = LLM_TYPE_671B; break;
  1238. default: type = LLM_TYPE_UNKNOWN;
  1239. }
  1240. } break;
  1241. case LLM_ARCH_PLM:
  1242. {
  1243. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1244. ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK, hparams.n_lora_kv);
  1245. switch (hparams.n_layer) {
  1246. case 32: type = LLM_TYPE_1_8B; break;
  1247. default: type = LLM_TYPE_UNKNOWN;
  1248. }
  1249. } break;
  1250. case LLM_ARCH_CHATGLM:
  1251. {
  1252. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1253. switch (hparams.n_layer) {
  1254. case 28: {
  1255. if (hparams.n_head(0) == 16) {
  1256. type = LLM_TYPE_1_5B;
  1257. } else {
  1258. type = LLM_TYPE_6B;
  1259. }
  1260. } break;
  1261. case 40: {
  1262. if (hparams.n_head(0) == 24) {
  1263. type = LLM_TYPE_4B;
  1264. } else {
  1265. type = LLM_TYPE_9B;
  1266. }
  1267. } break;
  1268. default: type = LLM_TYPE_UNKNOWN;
  1269. }
  1270. } break;
  1271. case LLM_ARCH_GLM4:
  1272. {
  1273. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1274. switch (hparams.n_layer) {
  1275. case 40: type = LLM_TYPE_9B; break;
  1276. case 61: type = LLM_TYPE_32B; break;
  1277. default: type = LLM_TYPE_UNKNOWN;
  1278. }
  1279. } break;
  1280. case LLM_ARCH_BITNET:
  1281. {
  1282. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1283. switch (hparams.n_layer) {
  1284. case 26: type = LLM_TYPE_3B; break;
  1285. default: type = LLM_TYPE_UNKNOWN;
  1286. }
  1287. } break;
  1288. case LLM_ARCH_T5:
  1289. {
  1290. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1291. ml.get_key(LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, hparams.n_rel_attn_bkts);
  1292. uint32_t dec_start_token_id;
  1293. if (ml.get_key(LLM_KV_DECODER_START_TOKEN_ID, dec_start_token_id, false)) {
  1294. hparams.dec_start_token_id = dec_start_token_id;
  1295. }
  1296. switch (hparams.n_layer) {
  1297. case 6: type = LLM_TYPE_60M; break; // t5-small
  1298. case 8: type = LLM_TYPE_80M; break; // flan-t5-small
  1299. case 12:
  1300. switch (hparams.n_ff()) {
  1301. case 3072: type = LLM_TYPE_220M; break; // t5-base
  1302. case 2048: type = LLM_TYPE_250M; break; // flan-t5-base
  1303. default: type = LLM_TYPE_UNKNOWN;
  1304. } break;
  1305. case 24:
  1306. switch (hparams.n_ff()) {
  1307. case 4096: type = LLM_TYPE_770M; break; // t5-large
  1308. case 2816: type = LLM_TYPE_780M; break; // flan-t5-large
  1309. case 16384: type = LLM_TYPE_3B; break; // t5-3b
  1310. case 5120: type = LLM_TYPE_3B; break; // flan-t5-xl
  1311. case 65536: type = LLM_TYPE_11B; break; // t5-11b
  1312. case 10240: type = LLM_TYPE_11B; break; // flan-t5-xxl
  1313. default: type = LLM_TYPE_UNKNOWN;
  1314. } break;
  1315. default: type = LLM_TYPE_UNKNOWN;
  1316. }
  1317. } break;
  1318. case LLM_ARCH_T5ENCODER:
  1319. {
  1320. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1321. ml.get_key(LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, hparams.n_rel_attn_bkts);
  1322. type = LLM_TYPE_UNKNOWN;
  1323. } break;
  1324. case LLM_ARCH_JAIS:
  1325. {
  1326. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  1327. ml.get_key(LLM_KV_ATTENTION_MAX_ALIBI_BIAS, hparams.f_max_alibi_bias);
  1328. switch (hparams.n_layer) {
  1329. case 24: type = LLM_TYPE_1_3B; break;
  1330. case 40: type = LLM_TYPE_13B; break;
  1331. /* TODO: add variants */
  1332. default: type = LLM_TYPE_UNKNOWN;
  1333. }
  1334. } break;
  1335. case LLM_ARCH_NEMOTRON:
  1336. {
  1337. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  1338. switch (hparams.n_layer) {
  1339. case 32: type = LLM_TYPE_4B; break;
  1340. default: type = LLM_TYPE_UNKNOWN;
  1341. }
  1342. } break;
  1343. case LLM_ARCH_EXAONE:
  1344. {
  1345. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1346. switch (hparams.n_layer) {
  1347. case 32: type = LLM_TYPE_8B; break;
  1348. default: type = LLM_TYPE_UNKNOWN;
  1349. }
  1350. } break;
  1351. case LLM_ARCH_EXAONE4:
  1352. {
  1353. if (hparams.n_layer == 64) { // 32B
  1354. hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
  1355. hparams.n_swa = 4096;
  1356. hparams.set_swa_pattern(4);
  1357. }
  1358. ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
  1359. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1360. switch (hparams.n_layer) {
  1361. case 30: type = LLM_TYPE_1_2B; break;
  1362. case 64: type = LLM_TYPE_32B; break;
  1363. default: type = LLM_TYPE_UNKNOWN;
  1364. }
  1365. } break;
  1366. case LLM_ARCH_RWKV6:
  1367. case LLM_ARCH_RWKV6QWEN2:
  1368. {
  1369. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps, false);
  1370. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps, false);
  1371. ml.get_key(LLM_KV_WKV_HEAD_SIZE, hparams.wkv_head_size);
  1372. ml.get_key(LLM_KV_TIME_MIX_EXTRA_DIM, hparams.time_mix_extra_dim);
  1373. ml.get_key(LLM_KV_TIME_DECAY_EXTRA_DIM, hparams.time_decay_extra_dim);
  1374. ml.get_key(LLM_KV_RESCALE_EVERY_N_LAYERS, hparams.rescale_every_n_layers, false);
  1375. ml.get_key(LLM_KV_TOKEN_SHIFT_COUNT, hparams.token_shift_count, false);
  1376. switch (hparams.n_layer) {
  1377. case 24: type = LLM_TYPE_1_6B; break;
  1378. case 32:
  1379. switch (hparams.n_embd) {
  1380. case 2560: type = LLM_TYPE_3B; break;
  1381. case 4096: type = LLM_TYPE_7B; break;
  1382. default: type = LLM_TYPE_UNKNOWN;
  1383. } break;
  1384. case 61: type = LLM_TYPE_14B; break;
  1385. case 64: type = LLM_TYPE_32B; break;
  1386. default: type = LLM_TYPE_UNKNOWN;
  1387. }
  1388. } break;
  1389. case LLM_ARCH_RWKV7:
  1390. case LLM_ARCH_ARWKV7:
  1391. {
  1392. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps, false);
  1393. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps, false);
  1394. ml.get_key(LLM_KV_WKV_HEAD_SIZE, hparams.wkv_head_size);
  1395. ml.get_key(LLM_KV_ATTENTION_DECAY_LORA_RANK, hparams.n_lora_decay);
  1396. ml.get_key(LLM_KV_ATTENTION_ICLR_LORA_RANK, hparams.n_lora_iclr);
  1397. ml.get_key(LLM_KV_ATTENTION_VALUE_RESIDUAL_MIX_LORA_RANK, hparams.n_lora_value_res_mix);
  1398. ml.get_key(LLM_KV_ATTENTION_GATE_LORA_RANK, hparams.n_lora_gate, false);
  1399. ml.get_key(LLM_KV_TOKEN_SHIFT_COUNT, hparams.token_shift_count, false);
  1400. switch (hparams.n_layer) {
  1401. case 12:
  1402. switch (hparams.n_embd) {
  1403. case 768: type = LLM_TYPE_190M; break;
  1404. default: type = LLM_TYPE_UNKNOWN;
  1405. } break;
  1406. case 24:
  1407. switch (hparams.n_embd) {
  1408. case 1024: type = LLM_TYPE_450M; break;
  1409. case 2048: type = LLM_TYPE_1_5B; break;
  1410. default: type = LLM_TYPE_UNKNOWN;
  1411. } break;
  1412. case 28:
  1413. switch (hparams.n_embd) {
  1414. case 1536: type = LLM_TYPE_1_5B; break;
  1415. case 3584: type = LLM_TYPE_7B; break;
  1416. default: type = LLM_TYPE_UNKNOWN;
  1417. } break;
  1418. case 32:
  1419. switch (hparams.n_embd) {
  1420. case 2560: type = LLM_TYPE_2_9B; break;
  1421. case 4096: type = LLM_TYPE_7B; break;
  1422. default: type = LLM_TYPE_UNKNOWN;
  1423. } break;
  1424. case 61:
  1425. switch (hparams.n_embd) {
  1426. case 4096: type = LLM_TYPE_14B; break;
  1427. default: type = LLM_TYPE_UNKNOWN;
  1428. } break;
  1429. default: type = LLM_TYPE_UNKNOWN;
  1430. }
  1431. } break;
  1432. case LLM_ARCH_GRANITE:
  1433. case LLM_ARCH_GRANITE_MOE:
  1434. {
  1435. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1436. ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
  1437. ml.get_key(LLM_KV_RESIDUAL_SCALE, hparams.f_residual_scale);
  1438. ml.get_key(LLM_KV_EMBEDDING_SCALE, hparams.f_embedding_scale);
  1439. ml.get_key(LLM_KV_ATTENTION_SCALE, hparams.f_attention_scale);
  1440. // Granite uses rope_finetuned as a switch for rope, so default to true
  1441. bool rope_finetuned = true;
  1442. ml.get_key(LLM_KV_ROPE_SCALING_FINETUNED, rope_finetuned, false);
  1443. hparams.rope_finetuned = rope_finetuned;
  1444. switch (hparams.n_layer) {
  1445. case 32: type = LLM_TYPE_3B; break;
  1446. case 40: type = LLM_TYPE_3B; break;
  1447. // Add additional layer/vocab/etc checks here for other model sizes
  1448. default: type = LLM_TYPE_UNKNOWN;
  1449. }
  1450. // For Granite MoE Shared
  1451. ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, /* required */ false);
  1452. } break;
  1453. case LLM_ARCH_GRANITE_HYBRID:
  1454. {
  1455. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1456. ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale, /* required */ false);
  1457. ml.get_key(LLM_KV_RESIDUAL_SCALE, hparams.f_residual_scale, /* required */ false);
  1458. ml.get_key(LLM_KV_EMBEDDING_SCALE, hparams.f_embedding_scale, /* required */ false);
  1459. ml.get_key(LLM_KV_ATTENTION_SCALE, hparams.f_attention_scale, /* required */ false);
  1460. ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv);
  1461. ml.get_key(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner);
  1462. ml.get_key(LLM_KV_SSM_STATE_SIZE, hparams.ssm_d_state);
  1463. ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
  1464. ml.get_key(LLM_KV_SSM_GROUP_COUNT, hparams.ssm_n_group);
  1465. // Granite uses rope_finetuned as a switch for rope, so default to true
  1466. bool rope_finetuned = true;
  1467. ml.get_key(LLM_KV_ROPE_SCALING_FINETUNED, rope_finetuned, false);
  1468. hparams.rope_finetuned = rope_finetuned;
  1469. // A layer is recurrent IFF the n_head_kv value is set to 0
  1470. for (uint32_t i = 0; i < hparams.n_layer; ++i) {
  1471. hparams.recurrent_layer_arr[i] = hparams.n_head_kv(i) == 0;
  1472. }
  1473. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1474. switch (hparams.n_layer) {
  1475. // TODO: Add llm type label (not sure this is useful)
  1476. default: type = LLM_TYPE_UNKNOWN;
  1477. }
  1478. // For Granite MoE Shared
  1479. ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, /* required */ false);
  1480. } break;
  1481. case LLM_ARCH_CHAMELEON:
  1482. {
  1483. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1484. hparams.f_norm_eps = 1e-5; // eps for qk-norm, torch default
  1485. ml.get_key(LLM_KV_SWIN_NORM, hparams.swin_norm);
  1486. switch (hparams.n_layer) {
  1487. case 32: type = LLM_TYPE_7B; break;
  1488. case 48: type = LLM_TYPE_34B; break;
  1489. default: type = LLM_TYPE_UNKNOWN;
  1490. }
  1491. } break;
  1492. case LLM_ARCH_WAVTOKENIZER_DEC:
  1493. {
  1494. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  1495. ml.get_key(LLM_KV_ATTENTION_GROUPNORM_EPS, hparams.f_norm_group_eps);
  1496. ml.get_key(LLM_KV_ATTENTION_GROUPNORM_GROUPS, hparams.n_norm_groups);
  1497. ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
  1498. } break;
  1499. case LLM_ARCH_BAILINGMOE:
  1500. {
  1501. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1502. ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
  1503. ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
  1504. ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
  1505. ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
  1506. ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
  1507. switch (hparams.n_layer) {
  1508. case 28: type = LLM_TYPE_16B; break;
  1509. case 88: type = LLM_TYPE_290B; break;
  1510. default: type = LLM_TYPE_UNKNOWN;
  1511. }
  1512. } break;
  1513. case LLM_ARCH_DOTS1:
  1514. {
  1515. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1516. ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
  1517. ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
  1518. ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
  1519. ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
  1520. ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
  1521. ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false);
  1522. switch (hparams.n_layer) {
  1523. case 62: type = LLM_TYPE_142B; break;
  1524. default: type = LLM_TYPE_UNKNOWN;
  1525. }
  1526. } break;
  1527. case LLM_ARCH_ERNIE4_5:
  1528. case LLM_ARCH_ERNIE4_5_MOE:
  1529. {
  1530. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1531. if (arch == LLM_ARCH_ERNIE4_5_MOE) {
  1532. ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
  1533. ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false);
  1534. ml.get_key(LLM_KV_INTERLEAVE_MOE_LAYER_STEP, hparams.n_moe_layer_step);
  1535. ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
  1536. }
  1537. switch (hparams.n_layer) {
  1538. case 18: type = LLM_TYPE_0_3B; break;
  1539. case 28: type = LLM_TYPE_21B_A3B; break;
  1540. case 54: type = LLM_TYPE_300B_A47B; break;
  1541. default: type = LLM_TYPE_UNKNOWN;
  1542. }
  1543. } break;
  1544. case LLM_ARCH_FALCON_H1:
  1545. {
  1546. // Common parameters
  1547. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1548. // SSM parameters
  1549. ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv);
  1550. ml.get_key(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner);
  1551. ml.get_key(LLM_KV_SSM_STATE_SIZE, hparams.ssm_d_state);
  1552. ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
  1553. ml.get_key(LLM_KV_SSM_GROUP_COUNT, hparams.ssm_n_group);
  1554. std::fill(hparams.recurrent_layer_arr.begin(), hparams.recurrent_layer_arr.end(), true);
  1555. switch (hparams.n_layer) {
  1556. case 36:
  1557. type = LLM_TYPE_0_5B; break;
  1558. case 24:
  1559. type = LLM_TYPE_1_5B; break;
  1560. case 66:
  1561. type = LLM_TYPE_1B; break;
  1562. case 32:
  1563. type = LLM_TYPE_3B; break;
  1564. case 44:
  1565. type = LLM_TYPE_7B; break;
  1566. case 72:
  1567. type = LLM_TYPE_34B; break;
  1568. default:
  1569. type = LLM_TYPE_UNKNOWN;
  1570. }
  1571. } break;
  1572. case LLM_ARCH_HUNYUAN_MOE:
  1573. {
  1574. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1575. ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
  1576. ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp);
  1577. switch (hparams.n_layer) {
  1578. case 32: type = LLM_TYPE_A13B; break;
  1579. default: type = LLM_TYPE_UNKNOWN;
  1580. }
  1581. } break;
  1582. case LLM_ARCH_SMOLLM3:
  1583. {
  1584. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1585. hparams.n_no_rope_layer_step = 4;
  1586. switch (hparams.n_layer) {
  1587. case 36: type = LLM_TYPE_3B; break;
  1588. default: type = LLM_TYPE_UNKNOWN;
  1589. }
  1590. } break;
  1591. case LLM_ARCH_LFM2:
  1592. {
  1593. ml.get_key(LLM_KV_SHORTCONV_L_CACHE, hparams.n_shortconv_l_cache);
  1594. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1595. for (uint32_t il = 0; il < hparams.n_layer; ++il) {
  1596. hparams.recurrent_layer_arr[il] = hparams.n_head_kv(il) == 0;
  1597. }
  1598. switch (hparams.n_embd) {
  1599. case 1024: type = LLM_TYPE_350M; break;
  1600. case 1536: type = LLM_TYPE_700M; break;
  1601. case 2048: type = LLM_TYPE_1_2B; break;
  1602. default: type = LLM_TYPE_UNKNOWN;
  1603. }
  1604. } break;
  1605. default: throw std::runtime_error("unsupported model architecture");
  1606. }
  1607. pimpl->n_bytes = ml.n_bytes;
  1608. pimpl->desc_str = arch_name() + " " + type_name() + " " + ml.ftype_name();
  1609. if (hparams.f_max_alibi_bias > 0.0f) {
  1610. hparams.use_alibi = true;
  1611. }
  1612. hparams.rope_type = llama_model_rope_type(this);
  1613. }
  1614. void llama_model::load_vocab(llama_model_loader & ml) {
  1615. const auto kv = LLM_KV(arch);
  1616. vocab.load(ml, kv);
  1617. }
  1618. bool llama_model::load_tensors(llama_model_loader & ml) {
  1619. const auto & split_mode = params.split_mode;
  1620. const auto & n_gpu_layers = params.n_gpu_layers;
  1621. const auto & use_mlock = params.use_mlock;
  1622. const auto & tensor_split = params.tensor_split;
  1623. const int n_layer = hparams.n_layer;
  1624. const bool use_mmap_buffer = true;
  1625. LLAMA_LOG_INFO("%s: loading model tensors, this can take a while... (mmap = %s)\n", __func__, ml.use_mmap ? "true" : "false");
  1626. // build a list of buffer types for the CPU and GPU devices
  1627. pimpl->cpu_buft_list = make_cpu_buft_list(devices);
  1628. for (auto * dev : devices) {
  1629. buft_list_t buft_list = make_gpu_buft_list(dev, split_mode, tensor_split);
  1630. // add CPU buffer types as a fallback
  1631. buft_list.insert(buft_list.end(), pimpl->cpu_buft_list.begin(), pimpl->cpu_buft_list.end());
  1632. pimpl->gpu_buft_list.emplace(dev, std::move(buft_list));
  1633. }
  1634. // calculate the split points
  1635. bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + n_devices(), [](float x) { return x == 0.0f; });
  1636. std::vector<float> splits(n_devices());
  1637. if (all_zero) {
  1638. // default split, by free memory
  1639. for (size_t i = 0; i < n_devices(); ++i) {
  1640. ggml_backend_dev_t dev = devices[i];
  1641. size_t total;
  1642. size_t free;
  1643. ggml_backend_dev_memory(dev, &free, &total);
  1644. splits[i] = free;
  1645. }
  1646. } else {
  1647. std::copy(tensor_split, tensor_split + n_devices(), splits.begin());
  1648. }
  1649. // sum and normalize the splits to get the split points
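// after this pass splits[] holds cumulative fractions in (0, 1], e.g. free memory of {8, 8, 16} GiB becomes {0.25, 0.5, 1.0}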
  1650. float split_sum = 0.0f;
  1651. for (size_t i = 0; i < n_devices(); ++i) {
  1652. split_sum += splits[i];
  1653. splits[i] = split_sum;
  1654. }
  1655. for (size_t i = 0; i < n_devices(); ++i) {
  1656. splits[i] /= split_sum;
  1657. }
  1658. ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
  1659. if (cpu_dev == nullptr) {
  1660. throw std::runtime_error(format("%s: no CPU backend found", __func__));
  1661. }
  1662. const int i_gpu_start = std::max((int) hparams.n_layer - n_gpu_layers, (int) 0);
  1663. const int act_gpu_layers = devices.empty() ? 0 : std::min(n_gpu_layers, (int)n_layer + 1);
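// only the last n_gpu_layers repeating layers (plus the output layer, hence n_layer + 1) are offloaded; layers before i_gpu_start stay on the CPU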
  1664. auto get_layer_buft_list = [&](int il) -> llama_model::impl::layer_dev {
  1665. const bool is_swa = il < (int) hparams.n_layer && hparams.is_swa(il);
  1666. if (il < i_gpu_start || (il - i_gpu_start) >= act_gpu_layers) {
  1667. LLAMA_LOG_DEBUG("load_tensors: layer %3d assigned to device %s, is_swa = %d\n", il, ggml_backend_dev_name(cpu_dev), is_swa);
  1668. return {cpu_dev, &pimpl->cpu_buft_list};
  1669. }
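// pick the device whose cumulative split point is the first one greater than this layer's relative position within the offloaded range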
  1670. const int layer_gpu = std::upper_bound(splits.begin(), splits.begin() + n_devices(), float(il - i_gpu_start)/act_gpu_layers) - splits.begin();
  1671. auto * dev = devices.at(layer_gpu);
  1672. LLAMA_LOG_DEBUG("load_tensors: layer %3d assigned to device %s, is_swa = %d\n", il, ggml_backend_dev_name(dev), is_swa);
  1673. return {dev, &pimpl->gpu_buft_list.at(dev)};
  1674. };
  1675. // assign the input layer
  1676. // there is very little benefit to offloading the input layer, so always keep it on the CPU
  1677. pimpl->dev_input = { cpu_dev, &pimpl->cpu_buft_list };
  1678. // assign the repeating layers to the devices according to the splits
  1679. pimpl->dev_layer.resize(n_layer);
  1680. for (int il = 0; il < n_layer; ++il) {
  1681. pimpl->dev_layer[il] = get_layer_buft_list(il);
  1682. }
  1683. // assign the output layer
  1684. pimpl->dev_output = get_layer_buft_list(n_layer);
  1685. // one ggml context per buffer type
  1686. int max_n_tensors = ml.n_tensors;
  1687. max_n_tensors += 1; // duplicated output tensor
  1688. max_n_tensors += n_layer*2; // duplicated rope freq tensors
  1689. const size_t ctx_size = ggml_tensor_overhead()*max_n_tensors;
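// contexts are created with no_alloc = true, so ctx_size only needs to cover tensor metadata (ggml_tensor_overhead() per tensor); the weight data itself lives in backend buffers created later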
  1690. std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
  1691. auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * {
  1692. auto it = ctx_map.find(buft);
  1693. if (it == ctx_map.end()) {
  1694. ggml_init_params params = {
  1695. /*.mem_size =*/ ctx_size,
  1696. /*.mem_buffer =*/ NULL,
  1697. /*.no_alloc =*/ true,
  1698. };
  1699. ggml_context * ctx = ggml_init(params);
  1700. if (!ctx) {
  1701. throw std::runtime_error(format("failed to create ggml context"));
  1702. }
  1703. ctx_map[buft] = ctx;
  1704. pimpl->ctxs.emplace_back(ctx);
  1705. return ctx;
  1706. }
  1707. return it->second;
  1708. };
  1709. const auto TENSOR_DUPLICATED = llama_model_loader::TENSOR_DUPLICATED;
  1710. const auto TENSOR_NOT_REQUIRED = llama_model_loader::TENSOR_NOT_REQUIRED;
  1711. // create tensors for the weights
  1712. {
  1713. // note: cast to int64_t since we will use these for the tensor dimensions
  1714. const int64_t n_head = hparams.n_head();
  1715. const int64_t n_head_kv = hparams.n_head_kv();
  1716. const int64_t n_embd = hparams.n_embd;
  1717. const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
  1718. const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();
  1719. const int64_t n_embd_head_k = hparams.n_embd_head_k;
  1720. const int64_t n_embd_head_v = hparams.n_embd_head_v;
  1721. const int64_t n_ff = hparams.n_ff();
  1722. const int64_t n_embd_gqa = n_embd_v_gqa;
  1723. const int64_t n_vocab = vocab.n_tokens();
  1724. const int64_t n_token_types = vocab.n_token_types();
  1725. const int64_t n_rot = hparams.n_rot;
  1726. const int64_t n_expert = hparams.n_expert;
  1727. const int64_t n_expert_used = hparams.n_expert_used;
  1728. const int64_t n_ctx_train = hparams.n_ctx_train;
  1729. if (n_expert > 0 && hparams.n_expert_used == 0) {
  1730. throw std::runtime_error("model has expert layers but no expert layers are used");
  1731. }
  1732. int n_moved_tensors = 0;
  1733. ggml_tensor * first_moved_tensor = nullptr;
  1734. ggml_backend_buffer_type_t first_moved_from_buft = nullptr;
  1735. ggml_backend_buffer_type_t first_moved_to_buft = nullptr;
  1736. auto create_tensor = [&](const LLM_TN_IMPL & tn, const std::initializer_list<int64_t> & ne, int flags) -> ggml_tensor * {
  1737. ggml_tensor * t_meta = ml.get_tensor_meta(tn.str().c_str());
  1738. if (!t_meta) {
  1739. if (flags & TENSOR_NOT_REQUIRED) {
  1740. return nullptr;
  1741. }
  1742. throw std::runtime_error(format("missing tensor '%s'", tn.str().c_str()));
  1743. }
  1744. // some models use the token embedding tensor as the output, but since these are used in different layers and with different ops
  1745. // the tensor is duplicated
  1746. // to handle this, we check if the tensor is duplicated, and if so, we assume that it is being loaded as the output tensor
  1747. llm_tensor tn_tensor = tn.tensor;
  1748. if (tn.tensor == LLM_TENSOR_TOKEN_EMBD && flags & TENSOR_DUPLICATED) {
  1749. tn_tensor = LLM_TENSOR_OUTPUT;
  1750. }
  1751. llm_tensor_info info;
  1752. try {
  1753. info = llm_tensor_info_for(tn_tensor);
  1754. } catch (const std::out_of_range & e) {
  1755. throw std::runtime_error(format("missing tensor info mapping for %s", tn.str().c_str()));
  1756. }
  1757. // skip unused tensors
  1758. if (info.op == GGML_OP_NONE) {
  1759. const size_t nbytes = ggml_nbytes(t_meta);
  1760. LLAMA_LOG_WARN("model has unused tensor %s (size = %zu bytes) -- ignoring\n", tn.str().c_str(), nbytes);
  1761. ml.size_data -= nbytes;
  1762. ml.n_created++;
  1763. return nullptr;
  1764. }
  1765. // tensors with "bias" suffix are always used with GGML_OP_ADD
  1766. ggml_op op;
  1767. bool bias = tn.suffix != nullptr && strcmp(tn.suffix, "bias") == 0;
  1768. if (bias) {
  1769. op = GGML_OP_ADD;
  1770. } else {
  1771. op = info.op;
  1772. }
  1773. // sanity checks
  1774. if (info.layer == LLM_TENSOR_LAYER_INPUT || info.layer == LLM_TENSOR_LAYER_OUTPUT) {
  1775. if (tn.bid != -1) {
  1776. GGML_ABORT("input/output layer tensor %s used with a layer number", tn.str().c_str());
  1777. }
  1778. } else {
  1779. if (tn.bid == -1) {
  1780. GGML_ABORT("repeating layer tensor %s used without a layer number", tn.str().c_str());
  1781. }
  1782. }
  1783. // select the buffer type for this tensor
  1784. buft_list_t * buft_list;
  1785. switch (info.layer) {
  1786. case LLM_TENSOR_LAYER_INPUT:
  1787. buft_list = pimpl->dev_input.buft_list;
  1788. break;
  1789. case LLM_TENSOR_LAYER_OUTPUT:
  1790. buft_list = pimpl->dev_output.buft_list;
  1791. break;
  1792. case LLM_TENSOR_LAYER_REPEATING:
  1793. buft_list = pimpl->dev_layer.at(tn.bid).buft_list;
  1794. break;
  1795. default:
  1796. GGML_ABORT("invalid layer %d for tensor %s", info.layer, tn.str().c_str());
  1797. }
  1798. ggml_backend_buffer_type_t buft = nullptr;
  1799. // check overrides
  1800. if (ml.tensor_buft_overrides) {
  1801. std::string tensor_name = tn.str();
  1802. for (const auto * overrides = ml.tensor_buft_overrides; overrides->pattern != nullptr; ++overrides) {
  1803. std::regex pattern(overrides->pattern);
  1804. if (std::regex_search(tensor_name, pattern)) {
  1805. buft = overrides->buft;
  1806. LLAMA_LOG_DEBUG("tensor %s (%zu MiB %s) buffer type overridden to %s\n",
  1807. tensor_name.c_str(),
  1808. ggml_nbytes(t_meta) / 1024 / 1024, ggml_type_name(t_meta->type),
  1809. ggml_backend_buft_name(buft));
  1810. break;
  1811. }
  1812. }
  1813. }
  1814. if (!buft) {
  1815. buft = select_weight_buft(hparams, t_meta, op, *buft_list);
  1816. if (!buft) {
  1817. throw std::runtime_error(format("failed to find a compatible buffer type for tensor %s", tn.str().c_str()));
  1818. }
  1819. }
  1820. // avoid using a host buffer when using mmap
  1821. auto * buft_dev = ggml_backend_buft_get_device(buft);
  1822. if (ml.use_mmap && buft_dev && buft == ggml_backend_dev_host_buffer_type(buft_dev)) {
  1823. auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
  1824. if (!cpu_dev) {
  1825. throw std::runtime_error("no CPU backend found");
  1826. }
  1827. buft = ggml_backend_dev_buffer_type(cpu_dev);
  1828. }
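// record tensors that were not placed in the preferred (first) buffer type of their list, e.g. because of an override or an unsupported op, so the fallback can be reported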
  1829. if (buft != buft_list->front().second) {
  1830. n_moved_tensors++;
  1831. if (!first_moved_tensor) {
  1832. first_moved_tensor = t_meta;
  1833. first_moved_from_buft = buft_list->front().second;
  1834. first_moved_to_buft = buft;
  1835. }
  1836. }
  1837. ggml_context * ctx = ctx_for_buft(buft);
  1838. // if duplicated, check if the original tensor was allocated in the same buffer type context and avoid creating a new one
  1839. if (flags & TENSOR_DUPLICATED) {
  1840. ggml_tensor * t = ggml_get_tensor(ctx, tn.str().c_str());
  1841. if (t) {
  1842. return t;
  1843. }
  1844. }
  1845. return ml.create_tensor(ctx, tn, ne, flags);
  1846. };
  1847. layers.resize(n_layer);
  1848. // TODO: move to a separate function
  1849. const auto tn = LLM_TN(arch);
  1850. switch (arch) {
  1851. case LLM_ARCH_LLAMA:
  1852. case LLM_ARCH_REFACT:
  1853. case LLM_ARCH_MINICPM:
  1854. case LLM_ARCH_GRANITE:
  1855. case LLM_ARCH_GRANITE_MOE:
  1856. {
  1857. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  1858. // output
  1859. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  1860. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  1861. // if output is NULL, init from the input tok embed
  1862. if (output == NULL) {
  1863. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  1864. }
  1865. for (int i = 0; i < n_layer; ++i) {
  1866. auto & layer = layers[i];
  1867. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  1868. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
  1869. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  1870. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
  1871. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
  1872. // optional bias tensors
  1873. layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  1874. layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
  1875. layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
  1876. layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  1877. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  1878. if (hparams.rope_scaling_type_train == LLAMA_ROPE_SCALING_TYPE_LONGROPE) {
  1879. layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
  1880. layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
  1881. }
  1882. else {
  1883. layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
  1884. }
  1885. if (n_expert == 0) {
  1886. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  1887. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  1888. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  1889. // optional MLP bias
  1890. layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
  1891. layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  1892. layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
  1893. } else {
  1894. layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
  1895. layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, TENSOR_NOT_REQUIRED);
  1896. layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert}, 0);
  1897. layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
  1898. // For Granite MoE Shared
  1899. if (hparams.n_ff_shexp > 0) {
  1900. layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, 0);
  1901. layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, 0);
  1902. layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {hparams.n_ff_shexp, n_embd}, 0);
  1903. }
  1904. }
  1905. }
  1906. } break;
  1907. case LLM_ARCH_LLAMA4:
  1908. {
  1909. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  1910. // output
  1911. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  1912. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  1913. // if output is NULL, init from the input tok embed
  1914. if (output == NULL) {
  1915. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  1916. }
  1917. GGML_ASSERT(hparams.n_moe_layer_step > 0 && "Llama 4 requires n_moe_layer_step > 0");
  1918. for (int i = 0; i < n_layer; ++i) {
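// MoE layers are interleaved every n_moe_layer_step layers (1-indexed); a step of 1 makes every layer MoE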
  1919. bool is_moe_layer = (i + 1) % hparams.n_moe_layer_step == 0;
  1920. auto & layer = layers[i];
  1921. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  1922. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
  1923. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  1924. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
  1925. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
  1926. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  1927. layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
  1928. if (is_moe_layer) {
  1929. int n_ff_exp = hparams.n_ff_exp;
  1930. layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
  1931. layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, 0);
  1932. layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff_exp, n_embd, n_expert}, 0);
  1933. layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, 0);
  1934. // Shared expert
  1935. const int64_t n_ff_shexp = n_ff_exp;
  1936. layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), { n_embd, n_ff_shexp}, 0);
  1937. layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp, n_embd }, 0);
  1938. layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), { n_embd, n_ff_shexp}, 0);
  1939. } else {
  1940. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  1941. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  1942. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  1943. }
  1944. }
  1945. } break;
  1946. case LLM_ARCH_DECI:
  1947. {
  1948. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  1949. // output
  1950. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  1951. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  1952. // if output is NULL, init from the input tok embed
  1953. if (output == NULL) {
  1954. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  1955. }
  1956. for (int i = 0; i < n_layer; ++i) {
  1957. auto & layer = layers[i];
  1958. const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(i);
  1959. const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(i);
  1960. const int64_t n_embd_gqa = hparams.n_embd_v_gqa(i);
  1961. const int64_t n_ff = hparams.n_ff(i);
  1962. const int64_t n_head = hparams.n_head(i);
  1963. const int64_t n_head_kv = hparams.n_head_kv(i);
  1964. if (n_head_kv == 0 && n_head > 0) {
  1965. // linear attention for DeciLMCausalModel
  1966. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  1967. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  1968. }
  1969. else if (n_head_kv > 0) {
  1970. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  1971. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
  1972. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  1973. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
  1974. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
  1975. }
  1976. // optional bias tensors
  1977. layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  1978. layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
  1979. layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
  1980. layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  1981. if (n_ff > 0) {
  1982. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  1983. }
  1984. if (hparams.rope_scaling_type_train == LLAMA_ROPE_SCALING_TYPE_LONGROPE) {
  1985. layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
  1986. layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
  1987. }
  1988. else {
  1989. layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
  1990. }
  1991. if (n_ff > 0) {
  1992. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  1993. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  1994. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  1995. }
  1996. // optional MLP bias
  1997. layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
  1998. layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  1999. layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
  2000. }
  2001. } break;
  2002. case LLM_ARCH_MINICPM3:
  2003. {
  2004. const int64_t n_embd_head_qk_rope = hparams.n_rot;
  2005. const int64_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
  2006. const int64_t q_lora_rank = hparams.n_lora_q;
  2007. const int64_t kv_lora_rank = hparams.n_lora_kv;
  2008. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2009. // output
  2010. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2011. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  2012. // if output is NULL, init from the input tok embed
  2013. if (output == NULL) {
  2014. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  2015. }
  2016. for (int i = 0; i < n_layer; ++i) {
  2017. auto & layer = layers[i];
  2018. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2019. layer.attn_q_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_A_NORM, "weight", i), {q_lora_rank}, 0);
  2020. layer.attn_kv_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_NORM, "weight", i), {kv_lora_rank}, 0);
  2021. layer.wq_a = create_tensor(tn(LLM_TENSOR_ATTN_Q_A, "weight", i), {n_embd, q_lora_rank}, 0);
  2022. layer.wq_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_B, "weight", i), {q_lora_rank, n_head * n_embd_head_k}, 0);
  2023. layer.wkv_a_mqa = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + (n_embd_head_qk_rope)}, 0);
  2024. layer.wkv_b = create_tensor(tn(LLM_TENSOR_ATTN_KV_B, "weight", i), {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)}, 0);
  2025. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_head * ( n_embd_head_v), n_embd}, 0);
  2026. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  2027. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  2028. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  2029. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  2030. layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), { n_embd_head_qk_rope/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
  2031. layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), { n_embd_head_qk_rope/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
  2032. }
  2033. } break;
  2034. case LLM_ARCH_GROK:
  2035. {
  2036. if (n_expert == 0) {
  2037. throw std::runtime_error("Grok model cannot have zero experts");
  2038. }
  2039. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2040. // output
  2041. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2042. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  2043. // if output is NULL, init from the input tok embed
  2044. if (output == NULL) {
  2045. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  2046. }
  2047. for (int i = 0; i < n_layer; ++i) {
  2048. auto & layer = layers[i];
  2049. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2050. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  2051. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  2052. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  2053. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  2054. layer.attn_out_norm = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}, 0);
  2055. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  2056. layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
  2057. layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, TENSOR_NOT_REQUIRED);
  2058. layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert}, 0);
  2059. layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
  2060. layer.layer_out_norm = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd}, 0);
  2061. }
  2062. } break;
  2063. case LLM_ARCH_DBRX:
  2064. {
  2065. if (n_expert == 0) {
  2066. throw std::runtime_error("DBRX model cannot have zero experts");
  2067. }
  2068. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2069. // output
  2070. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2071. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  2072. for (int i = 0; i < n_layer; ++i) {
  2073. auto & layer = layers[i];
  2074. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2075. layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
  2076. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  2077. layer.attn_out_norm = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}, 0);
  2078. layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
  2079. layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
  2080. layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff, n_embd, n_expert}, 0);
  2081. layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
  2082. }
  2083. } break;
  2084. case LLM_ARCH_BAICHUAN:
  2085. {
  2086. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2087. {
  2088. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2089. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  2090. }
  2091. for (int i = 0; i < n_layer; ++i) {
  2092. auto & layer = layers[i];
  2093. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2094. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  2095. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  2096. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  2097. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  2098. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  2099. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  2100. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  2101. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  2102. }
  2103. } break;
  2104. case LLM_ARCH_FALCON:
  2105. {
  2106. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2107. // output
  2108. {
  2109. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2110. output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
  2111. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  2112. if (!output) {
  2113. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); // needs to be on GPU
  2114. }
  2115. }
  2116. for (int i = 0; i < n_layer; ++i) {
  2117. auto & layer = layers[i];
  2118. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2119. layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
  2120. layer.attn_norm_2 = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2121. layer.attn_norm_2_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2122. layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
  2123. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  2124. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  2125. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  2126. }
  2127. } break;
  2128. case LLM_ARCH_STARCODER:
  2129. {
  2130. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2131. pos_embd = create_tensor(tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, n_ctx_train}, 0);
  2132. // output
  2133. {
  2134. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2135. output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
  2136. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  2137. if (!output) {
  2138. // needs to be on GPU
  2139. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  2140. }
  2141. }
  2142. for (int i = 0; i < n_layer; ++i) {
  2143. auto & layer = layers[i];
  2144. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2145. layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
  2146. layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
  2147. layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
  2148. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  2149. layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
  2150. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  2151. layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
  2152. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
  2153. layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
  2154. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  2155. layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
  2156. }
  2157. } break;
  2158. case LLM_ARCH_BERT:
  2159. case LLM_ARCH_NOMIC_BERT:
  2160. case LLM_ARCH_NOMIC_BERT_MOE:
  2161. {
  2162. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2163. type_embd = create_tensor(tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_token_types}, TENSOR_NOT_REQUIRED);
  2164. if (arch == LLM_ARCH_BERT) {
  2165. pos_embd = create_tensor(tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, n_ctx_train}, 0);
  2166. cls = create_tensor(tn(LLM_TENSOR_CLS, "weight"), {n_embd, n_embd}, TENSOR_NOT_REQUIRED);
  2167. cls_b = create_tensor(tn(LLM_TENSOR_CLS, "bias"), {n_embd}, TENSOR_NOT_REQUIRED);
  2168. cls_out = create_tensor(tn(LLM_TENSOR_CLS_OUT, "weight"), {n_embd, hparams.n_cls_out}, TENSOR_NOT_REQUIRED);
  2169. cls_out_b = create_tensor(tn(LLM_TENSOR_CLS_OUT, "bias"), {hparams.n_cls_out}, TENSOR_NOT_REQUIRED);
  2170. }
  2171. tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0);
  2172. tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}, 0);
  2173. for (int i = 0; i < n_layer; ++i) {
  2174. auto & layer = layers[i];
  2175. layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
  2176. layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
  2177. if (!layer.wqkv) {
  2178. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  2179. layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, 0);
  2180. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  2181. layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, 0);
  2182. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  2183. layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, 0);
  2184. }
  2185. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  2186. layer.attn_out_norm = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}, 0);
  2187. layer.attn_out_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "bias", i), {n_embd}, 0);
  2188. if (hparams.moe_every_n_layers > 0 && i % hparams.moe_every_n_layers == 1) {
  2189. layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
  2190. layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff, n_expert}, 0);
  2191. layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert}, 0);
  2192. layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
  2193. } else {
  2194. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  2195. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
  2196. if (arch == LLM_ARCH_BERT || arch == LLM_ARCH_NOMIC_BERT_MOE) {
  2197. layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
  2198. layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
  2199. layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
  2200. } else {
  2201. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  2202. }
  2203. }
  2204. layer.layer_out_norm = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd}, 0);
  2205. layer.layer_out_norm_b = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "bias", i), {n_embd}, 0);
  2206. }
  2207. } break;
  2208. case LLM_ARCH_NEO_BERT:
  2209. {
  2210. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2211. cls = create_tensor(tn(LLM_TENSOR_CLS, "weight"), {n_embd, n_embd}, TENSOR_NOT_REQUIRED);
  2212. cls_b = create_tensor(tn(LLM_TENSOR_CLS, "bias"), {n_embd}, TENSOR_NOT_REQUIRED);
  2213. cls_out = create_tensor(tn(LLM_TENSOR_CLS_OUT, "weight"), {n_embd, hparams.n_cls_out}, TENSOR_NOT_REQUIRED);
  2214. cls_out_b = create_tensor(tn(LLM_TENSOR_CLS_OUT, "bias"), {hparams.n_cls_out}, TENSOR_NOT_REQUIRED);
  2215. output_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2216. for (int i = 0; i < n_layer; ++i) {
  2217. auto & layer = layers[i];
  2218. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2219. layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
  2220. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  2221. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  2222. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff*2}, 0);
  2223. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
  2224. }
  2225. } break;
  2226. case LLM_ARCH_JINA_BERT_V2:
  2227. {
  2228. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); // word_embeddings
  2229. type_embd = create_tensor(tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_token_types}, 0); // token_type_embeddings
  2230. tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0); // LayerNorm
  2231. tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}, 0); //LayerNorm bias
  2232. cls = create_tensor(tn(LLM_TENSOR_CLS, "weight"), {n_embd, 1}, TENSOR_NOT_REQUIRED);
  2233. cls_b = create_tensor(tn(LLM_TENSOR_CLS, "bias"), {1}, TENSOR_NOT_REQUIRED);
  2234. for (int i = 0; i < n_layer; ++i) {
  2235. auto & layer = layers[i]; // JinaBertLayer
  2236. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  2237. layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, 0);
  2238. layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2239. layer.attn_q_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2240. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  2241. layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, 0);
  2242. layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2243. layer.attn_k_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2244. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  2245. layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, 0);
  2246. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0); //output_dens
  2247. layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0); //output_dens
  2248. layer.attn_out_norm = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}, 0); //output_norm
  2249. layer.attn_out_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "bias", i), {n_embd}, 0);
  2250. layer.attn_norm_2 = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2251. layer.attn_norm_2_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2252. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED);
  2253. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, layer.ffn_gate ? n_ff : n_ff * 2}, 0);
  2254. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
  2255. layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
  2256. layer.layer_out_norm = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd}, 0);
  2257. layer.layer_out_norm_b = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "bias", i), {n_embd}, 0);
  2258. }
  2259. } break;
  2260. case LLM_ARCH_BLOOM:
  2261. {
  2262. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2263. tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0);
  2264. tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}, 0);
  2265. // output
  2266. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2267. output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
  2268. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  2269. // if output is NULL, init from the input tok embed
  2270. if (output == NULL) {
  2271. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  2272. }
  2273. for (int i = 0; i < n_layer; ++i) {
  2274. auto & layer = layers[i];
  2275. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2276. layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
  2277. layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
  2278. layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
  2279. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  2280. layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
  2281. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  2282. layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
  2283. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
  2284. layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
  2285. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  2286. layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
  2287. }
  2288. } break;
  2289. case LLM_ARCH_MPT:
  2290. {
  2291. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2292. pos_embd = create_tensor(tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, n_ctx_train}, TENSOR_NOT_REQUIRED);
  2293. // output
  2294. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2295. output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, TENSOR_NOT_REQUIRED);
  2296. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  2297. if (!output) {
  2298. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); // needs to be on GPU
  2299. }
  2300. for (int i = 0; i < n_layer; ++i) {
  2301. auto & layer = layers[i];
  2302. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2303. layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2304. layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
  2305. layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
  2306. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  2307. layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2308. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  2309. layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2310. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
  2311. layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2312. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  2313. layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
  2314. layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2315. layer.attn_q_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2316. layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2317. layer.attn_k_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2318. // AWQ ScaleActivation layer
  2319. layer.ffn_act = create_tensor(tn(LLM_TENSOR_FFN_ACT, "scales", i), {n_ff}, TENSOR_NOT_REQUIRED);
  2320. }
  2321. } break;
  2322. case LLM_ARCH_STABLELM:
  2323. {
  2324. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2325. // output
  2326. output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
  2327. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2328. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  2329. for (int i = 0; i < n_layer; ++i) {
  2330. auto & layer = layers[i];
  2331. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2332. layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
  2333. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  2334. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  2335. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  2336. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  2337. // optional bias tensors, present in Stable LM 2 1.6B
  2338. layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2339. layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
  2340. layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
  2341. // optional q and k layernorms, present in StableLM 2 12B
  2342. layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k, n_head}, TENSOR_NOT_REQUIRED);
  2343. layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k, n_head_kv}, TENSOR_NOT_REQUIRED);
  2344. // optional FFN norm, not present in StableLM 2 12B which uses parallel residual
  2345. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2346. layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2347. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  2348. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  2349. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  2350. }
  2351. } break;
  2352. case LLM_ARCH_QWEN:
  2353. {
  2354. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2355. // output
  2356. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2357. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  2358. for (int i = 0; i < n_layer; ++i) {
  2359. auto & layer = layers[i];
  2360. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2361. layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd*3}, 0);
  2362. layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd*3}, 0);
  2363. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  2364. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  2365. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff/2}, 0);
  2366. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff/2, n_embd}, 0);
  2367. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff/2}, 0);
  2368. }
  2369. } break;
  2370. case LLM_ARCH_QWEN2:
  2371. case LLM_ARCH_QWEN2VL:
  2372. case LLM_ARCH_DREAM:
  2373. {
  2374. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2375. // output
  2376. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2377. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  2378. output_b = create_tensor(tn(LLM_TENSOR_OUTPUT, "bias"), {n_vocab}, TENSOR_NOT_REQUIRED);
  2379. // if output is NULL, init from the input tok embed
  2380. if (output == NULL) {
  2381. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  2382. }
  2383. for (int i = 0; i < n_layer; ++i) {
  2384. auto & layer = layers[i];
  2385. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2386. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  2387. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  2388. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  2389. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  2390. // optional bias tensors
  2391. layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, 0);
  2392. layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, 0);
  2393. layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, 0);
  2394. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  2395. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  2396. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  2397. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  2398. }
  2399. } break;
  2400. case LLM_ARCH_QWEN2MOE:
  2401. {
  2402. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2403. // output
  2404. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2405. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  2406. for (int i = 0; i < n_layer; ++i) {
  2407. auto & layer = layers[i];
  2408. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2409. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  2410. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  2411. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  2412. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  2413. // optional bias tensors
  2414. layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  2415. layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
  2416. layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
  2417. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  2418. layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
  2419. if (n_expert == 0) {
  2420. throw std::runtime_error("n_expert must be > 0 for QWEN2MOE");
  2421. }
  2422. if (n_expert_used == 0) {
  2423. throw std::runtime_error("n_expert_used must be > 0 for QWEN2MOE");
  2424. }
  2425. // MoE branch
  2426. const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
  2427. layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
  2428. layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
  2429. layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
  2430. // Shared expert branch
  2431. const int64_t n_ff_shexp = hparams.n_ff_shexp ? hparams.n_ff_shexp : n_ff;
  2432. layer.ffn_gate_inp_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP_SHEXP, "weight", i), {n_embd}, 0);
  2433. layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), { n_embd, n_ff_shexp}, 0);
  2434. layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp, n_embd}, 0);
  2435. layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), { n_embd, n_ff_shexp}, 0);
  2436. }
  2437. } break;
  2438. case LLM_ARCH_QWEN3:
  2439. {
  2440. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2441. // output
  2442. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2443. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  2444. // if output is NULL, init from the input tok embed
  2445. if (output == NULL) {
  2446. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  2447. }
  2448. for (int i = 0; i < n_layer; ++i) {
  2449. auto & layer = layers[i];
  2450. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2451. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
  2452. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  2453. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  2454. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
  2455. layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
  2456. layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
  2457. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  2458. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  2459. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  2460. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  2461. }
  2462. } break;
  2463. case LLM_ARCH_QWEN3MOE:
  2464. {
  2465. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2466. // output
  2467. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2468. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  2469. // if output is NULL, init from the input tok embed
  2470. if (output == NULL) {
  2471. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  2472. }
  2473. for (int i = 0; i < n_layer; ++i) {
  2474. auto & layer = layers[i];
  2475. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2476. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
  2477. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  2478. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  2479. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
  2480. layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
  2481. layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
  2482. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  2483. layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
  2484. if (n_expert == 0) {
  2485. throw std::runtime_error("n_expert must be > 0 for QWEN3MOE");
  2486. }
  2487. if (n_expert_used == 0) {
  2488. throw std::runtime_error("n_expert_used must be > 0 for QWEN3MOE");
  2489. }
  2490. // MoE branch
  2491. const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
  2492. layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
  2493. layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
  2494. layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
  2495. }
  2496. } break;
  2497. case LLM_ARCH_PHI2:
  2498. {
  2499. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2500. // output
  2501. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2502. output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
  2503. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  2504. output_b = create_tensor(tn(LLM_TENSOR_OUTPUT, "bias"), {n_vocab}, 0);
  2505. for (int i = 0; i < n_layer; ++i) {
  2506. auto & layer = layers[i];
  2507. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2508. layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
  2509. layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
  2510. layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
  2511. if (layer.wqkv == nullptr) {
  2512. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  2513. layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, 0);
  2514. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  2515. layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, 0);
  2516. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  2517. layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, 0);
  2518. }
  2519. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  2520. layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
  2521. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
  2522. layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
  2523. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  2524. layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
  2525. }
  2526. } break;
  2527. case LLM_ARCH_PHI3:
  2528. {
  2529. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
  2530. // output
  2531. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
  2532. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  2533. // if output is NULL, init from the input tok embed
  2534. if (output == NULL) {
  2535. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  2536. }
  2537. for (int i = 0; i < n_layer; ++i) {
  2538. auto & layer = layers[i];
  2539. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);
  2540. layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), { n_embd, n_embd + 2 * n_embd_gqa }, TENSOR_NOT_REQUIRED);
  2541. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd, n_embd }, 0);
  2542. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd }, 0);
  2543. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd }, 0);
  2544. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, 2 * n_ff }, 0);
  2545. layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), { n_rot/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
  2546. layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), { n_rot/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
  2547. }
  2548. } break;
  2549. case LLM_ARCH_PHIMOE:
  2550. {
  2551. const int64_t n_embd_head = n_embd / n_head;
  2552. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
  2553. // output
  2554. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
  2555. output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
  2556. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab }, 0);
  2557. output_b = create_tensor(tn(LLM_TENSOR_OUTPUT, "bias"), { n_vocab }, 0);
  2558. for (int i = 0; i < n_layer; ++i) {
  2559. auto & layer = layers[i];
  2560. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);
  2561. layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), { n_embd }, 0);
  2562. layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), { n_embd, n_embd + 2 * n_embd_gqa }, TENSOR_NOT_REQUIRED);
  2563. if (layer.wqkv == nullptr) {
  2564. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  2565. layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, 0);
  2566. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  2567. layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, 0);
  2568. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  2569. layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, 0);
  2570. }
  2571. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd, n_embd }, 0);
  2572. layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), { n_embd }, 0);
  2573. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd }, 0);
  2574. layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), { n_embd }, 0);
  2575. layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
  2576. layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
  2577. layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff, n_embd, n_expert}, 0);
  2578. layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
  2579. layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), { n_embd_head/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
  2580. layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), { n_embd_head/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
  2581. }
  2582. } break;
  2583. case LLM_ARCH_PLAMO:
  2584. {
  2585. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2586. // output
  2587. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2588. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  2589. for (int i = 0; i < n_layer; ++i) {
  2590. auto & layer = layers[i];
  2591. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2592. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  2593. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  2594. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  2595. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  2596. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  2597. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  2598. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  2599. }
  2600. } break;
  2601. case LLM_ARCH_PLAMO2:
  2602. {
  2603. const uint32_t d_conv = hparams.ssm_d_conv;
  2604. const uint32_t d_state = hparams.ssm_d_state;
  2605. const uint32_t num_heads = hparams.ssm_dt_rank;
  2606. const uint32_t intermediate_size = hparams.ssm_d_inner;
  2607. const uint32_t head_dim = intermediate_size / num_heads;
  2608. const uint32_t qk_dim = head_dim;
  2609. const uint32_t v_dim = head_dim;
  2610. const int64_t num_attention_heads = hparams.n_head();
  2611. const int64_t q_num_heads = num_attention_heads;
  2612. const int64_t dt_dim = std::max(64, int(hparams.n_embd / 16));
  2613. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2614. // output
  2615. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2616. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  2617. // if output is NULL, init from the input tok embed
  2618. if (output == NULL) {
  2619. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  2620. }
  2621. for (int i = 0; i < n_layer; ++i) {
  2622. auto & layer = layers[i];
  2623. bool is_mamba_layer = hparams.is_recurrent(i);
  2624. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2625. if (is_mamba_layer) {
  2626. layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), {n_embd, 2 * intermediate_size}, 0);
  2627. layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {d_conv, intermediate_size}, 0);
  2628. layer.ssm_x = create_tensor(tn(LLM_TENSOR_SSM_X, "weight", i), {intermediate_size, dt_dim + 2*d_state}, 0);
  2629. layer.ssm_dt = create_tensor(tn(LLM_TENSOR_SSM_DT, "weight", i), {dt_dim, num_heads}, 0);
  2630. layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {num_heads}, 0);
  2631. layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {num_heads}, 0);
  2632. layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {num_heads}, 0);
  2633. layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {intermediate_size, n_embd}, 0);
  2634. layer.ssm_dt_norm = create_tensor(tn(LLM_TENSOR_SSM_DT_NORM, i), {dt_dim}, 0);
  2635. layer.ssm_b_norm = create_tensor(tn(LLM_TENSOR_SSM_B_NORM, i), {d_state}, 0);
  2636. layer.ssm_c_norm = create_tensor(tn(LLM_TENSOR_SSM_C_NORM, i), {d_state}, 0);
  2637. } else {
  2638. const int64_t num_key_value_heads = hparams.n_head_kv(i);
  2639. const int64_t k_num_heads = num_key_value_heads;
  2640. const int64_t v_num_heads = num_key_value_heads;
  2641. const int64_t q_proj_dim = q_num_heads * qk_dim;
  2642. const int64_t k_proj_dim = k_num_heads * qk_dim;
  2643. const int64_t v_proj_dim = v_num_heads * v_dim;
  2644. layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, q_proj_dim + k_proj_dim + v_proj_dim}, 0);
  2645. layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {head_dim, num_attention_heads}, 0);
  2646. layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {head_dim, k_num_heads}, 0);
  2647. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {q_num_heads * v_dim, n_embd}, 0);
  2648. }
  2649. // All layers have post-attention norm, FFN norm, and FFN tensors
  2650. layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, i), {n_embd}, 0);
  2651. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  2652. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
  2653. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff * 2}, 0);
  2654. layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, i), {n_embd}, 0);
  2655. }
  2656. } break;
  2657. case LLM_ARCH_GPT2:
  2658. {
  2659. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2660. pos_embd = create_tensor(tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, n_ctx_train}, 0);
  2661. // output
  2662. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2663. output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
  2664. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  2665. // if output is NULL, init from the input tok embed
  2666. if (output == NULL) {
  2667. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  2668. }
  2669. for (int i = 0; i < n_layer; ++i) {
  2670. auto & layer = layers[i];
  2671. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2672. layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
  2673. layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
  2674. layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
  2675. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  2676. layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
  2677. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  2678. layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
  2679. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
  2680. layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
  2681. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  2682. layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
  2683. }
  2684. } break;
  2685. case LLM_ARCH_CODESHELL:
  2686. {
  2687. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  2688. // if tok embd is NULL, init from output
  2689. if (tok_embd == NULL) {
  2690. tok_embd = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  2691. }
  2692. // output
  2693. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2694. output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
  2695. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  2696. for (int i = 0; i < n_layer; ++i) {
  2697. auto & layer = layers[i];
  2698. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2699. layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
  2700. layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
  2701. layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
  2702. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  2703. layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
  2704. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  2705. layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
  2706. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
  2707. layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
  2708. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  2709. layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
  2710. }
  2711. } break;
  2712. case LLM_ARCH_ORION:
  2713. {
  2714. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2715. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2716. output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
  2717. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  2718. for (int i = 0; i < n_layer; ++i) {
  2719. auto & layer = layers[i];
  2720. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2721. layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
  2722. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  2723. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  2724. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  2725. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  2726. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  2727. layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
  2728. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  2729. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  2730. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  2731. }
  2732. } break;
  2733. case LLM_ARCH_INTERNLM2:
  2734. {
  2735. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2736. // output
  2737. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2738. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  2739. for (int i = 0; i < n_layer; ++i) {
  2740. auto & layer = layers[i];
  2741. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2742. // layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
  2743. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  2744. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  2745. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  2746. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  2747. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  2748. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  2749. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  2750. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  2751. }
  2752. } break;
  2753. case LLM_ARCH_GEMMA:
  2754. {
  2755. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2756. // output
  2757. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2758. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); // same as tok_embd, duplicated to allow offloading
  2759. for (int i = 0; i < n_layer; ++i) {
  2760. auto & layer = layers[i];
  2761. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2762. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
  2763. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  2764. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
  2765. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
  2766. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  2767. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  2768. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  2769. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  2770. }
  2771. } break;
  2772. case LLM_ARCH_GEMMA2:
  2773. {
  2774. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2775. // output
  2776. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2777. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); // same as tok_embd, duplicated to allow offloading
  2778. for (int i = 0; i < n_layer; ++i) {
  2779. auto & layer = layers[i];
  2780. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2781. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
  2782. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  2783. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
  2784. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
  2785. layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
  2786. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  2787. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  2788. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  2789. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  2790. layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
  2791. }
  2792. } break;
  2793. case LLM_ARCH_GEMMA3:
  2794. {
  2795. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2796. // output
  2797. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2798. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  2799. // if output is NULL, init from the input tok embed
  2800. if (output == NULL) {
  2801. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  2802. }
  2803. for (int i = 0; i < n_layer; ++i) {
  2804. auto & layer = layers[i];
  2805. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2806. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
  2807. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  2808. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
  2809. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
  2810. layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
  2811. layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
  2812. layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
  2813. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  2814. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  2815. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  2816. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  2817. layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
  2818. }
  2819. } break;
case LLM_ARCH_GEMMA3N:
    {
        const int64_t n_altup = hparams.n_altup;
        const int64_t laurel_rank = hparams.laurel_rank;
        const int64_t n_embd_altup = hparams.n_embd_altup;
        output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
        // if output is NULL, init from the input tok embed
        if (output == NULL) {
            output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
        }
        tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
        tok_embd_per_layer = create_tensor(tn(LLM_TENSOR_PER_LAYER_TOKEN_EMBD, "weight"), {n_embd_altup * n_layer, n_vocab}, 0);
        altup_proj = create_tensor(tn(LLM_TENSOR_ALTUP_PROJ, "weight"), {n_embd, n_embd, n_altup - 1}, 0);
        altup_unembd_proj = create_tensor(tn(LLM_TENSOR_ALTUP_UNEMBD_PROJ, "weight"), {n_embd, n_embd, n_altup - 1}, 0);
        per_layer_model_proj = create_tensor(tn(LLM_TENSOR_PER_LAYER_MODEL_PROJ, "weight"), {n_embd, n_embd_altup * n_layer}, 0);
        per_layer_proj_norm = create_tensor(tn(LLM_TENSOR_PER_LAYER_PROJ_NORM, "weight"), {n_embd_altup}, 0);
        output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
        for (int i = 0; i < n_layer; ++i) {
            auto & layer = layers[i];
            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
            layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
            layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
            layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
            layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
            layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
            layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
            layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
            layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
            // altup & laurel
            layer.per_layer_inp_gate = create_tensor(tn(LLM_TENSOR_PER_LAYER_INP_GATE, "weight", i), {n_embd, n_embd_altup}, 0);
            layer.per_layer_proj = create_tensor(tn(LLM_TENSOR_PER_LAYER_PROJ, "weight", i), {n_embd_altup, n_embd}, 0);
            layer.per_layer_post_norm = create_tensor(tn(LLM_TENSOR_PER_LAYER_POST_NORM, "weight", i), {n_embd}, 0);
            layer.altup_correct_coef = create_tensor(tn(LLM_TENSOR_ALTUP_CORRECT_COEF, "weight", i), {n_altup, n_altup}, 0);
            layer.altup_correct_scale = create_tensor(tn(LLM_TENSOR_ALTUP_CORRECT_SCALE, "weight", i), {n_embd}, 0);
            layer.altup_predict_coef = create_tensor(tn(LLM_TENSOR_ALTUP_PREDICT_COEF, "weight", i), {n_altup, n_altup * n_altup}, 0);
            layer.altup_router = create_tensor(tn(LLM_TENSOR_ALTUP_ROUTER, "weight", i), {n_embd, n_altup}, 0);
            layer.altup_router_norm = create_tensor(tn(LLM_TENSOR_ALTUP_ROUTER_NORM, "weight", i), {n_embd}, 0);
            layer.laurel_l = create_tensor(tn(LLM_TENSOR_LAUREL_L, "weight", i), {n_embd, laurel_rank}, 0);
            layer.laurel_r = create_tensor(tn(LLM_TENSOR_LAUREL_R, "weight", i), {laurel_rank, n_embd}, 0);
            layer.laurel_post_norm = create_tensor(tn(LLM_TENSOR_LAUREL_POST_NORM, "weight", i), {n_embd}, 0);
        }
    } break;
case LLM_ARCH_STARCODER2:
    {
        tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
        // output
        output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
        output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
        output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
        // if output is NULL, init from the input tok embed
        if (output == NULL) {
            output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
        }
        for (int i = 0; i < n_layer; ++i) {
            auto & layer = layers[i];
            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
            layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
            layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
            layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
            // optional bias tensors
            layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, 0);
            layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, 0);
            layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, 0);
            layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
            layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
            layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
            layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
            // optional bias tensors
            layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
            layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
        }
    } break;
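// Mamba (v1) SSM blocks: d_inner is the expanded inner width (checked below to be 2*n_embd),
// d_conv the depthwise conv window, d_state the SSM state size, and dt_rank the low-rank
// width of the dt (delta) projection. This note is inferred from the tensor shapes that follow.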
case LLM_ARCH_MAMBA:
    {
        const int64_t d_conv = hparams.ssm_d_conv;
        const int64_t d_inner = hparams.ssm_d_inner;
        const int64_t d_state = hparams.ssm_d_state;
        const int64_t dt_rank = hparams.ssm_dt_rank;
        // only an expansion factor of 2 is supported for now
        if (2 * n_embd != d_inner) {
            throw std::runtime_error("only an expansion factor of 2 is supported for now");
        }
        tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
        // output
        output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
        output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
        // if output is NULL, init from the input tok embed, duplicated to allow offloading
        if (output == NULL) {
            output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
        }
        for (int i = 0; i < n_layer; ++i) {
            auto & layer = layers[i];
            // norm
            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
            layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), {n_embd, 2*d_inner}, 0);
            layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {d_conv, d_inner}, 0);
            layer.ssm_conv1d_b = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "bias", i), {d_inner}, 0);
            layer.ssm_x = create_tensor(tn(LLM_TENSOR_SSM_X, "weight", i), {d_inner, dt_rank + 2*d_state}, 0);
            layer.ssm_dt = create_tensor(tn(LLM_TENSOR_SSM_DT, "weight", i), {dt_rank, d_inner}, 0);
            layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {d_inner}, 0);
            // no "weight" suffix for these
            layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {d_state, d_inner}, 0);
            layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {d_inner}, 0);
            // out_proj
            layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {d_inner, n_embd}, 0);
        }
    } break;
case LLM_ARCH_MAMBA2:
    {
        const int64_t d_conv = hparams.ssm_d_conv;
        const int64_t d_inner = hparams.ssm_d_inner;
        const int64_t d_state = hparams.ssm_d_state;
        const int64_t n_head = hparams.ssm_dt_rank;
        const int64_t n_group = hparams.ssm_n_group;
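        // in_proj packs, in order: z (d_inner), x (d_inner), B and C (n_group*d_state each),
        // and the per-head dt (n_head) -- hence the sum below. This follows the Mamba-2
        // in_proj layout as understood here and is meant as an explanatory note only.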
        const int64_t d_in_proj = 2*d_inner + 2*n_group*d_state + n_head;
        // only an expansion factor of 2 is supported for now
        GGML_ASSERT(2 * n_embd == d_inner);
        tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
        // output
        {
            output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
            output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
            // if output is NULL, init from the input tok embed, duplicated to allow offloading
            if (output == NULL) {
                output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
            }
        }
        for (int i = 0; i < n_layer; ++i) {
            auto & layer = layers[i];
            // norm
            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
            layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), {n_embd, d_in_proj}, 0);
            layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {d_conv, d_inner + 2*n_group*d_state}, 0);
            layer.ssm_conv1d_b = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "bias", i), {d_inner + 2*n_group*d_state}, 0);
            layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {n_head}, 0);
            // no "weight" suffix for these
            layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {1, n_head}, 0);
            layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {1, n_head}, 0);
            layer.ssm_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", i), {d_inner / n_group, n_group}, 0);
            // out_proj
            layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {d_inner, n_embd}, 0);
        }
    } break;
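// Jamba interleaves Mamba and attention blocks: a layer with n_head_kv == 0 is loaded as a
// Mamba (SSM) layer, otherwise as an attention layer, and each layer's FFN is either a dense
// MLP or a MoE depending on whether a router tensor (ffn_gate_inp) is present in the GGUF.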
case LLM_ARCH_JAMBA:
    {
        const int64_t d_conv = hparams.ssm_d_conv;
        const int64_t d_inner = hparams.ssm_d_inner;
        const int64_t d_state = hparams.ssm_d_state;
        const int64_t dt_rank = hparams.ssm_dt_rank;
        // only an expansion factor of 2 is supported for now
        GGML_ASSERT(2 * n_embd == d_inner);
        tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
        // output
        {
            output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
            output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
            // if output is NULL, init from the input tok embed, duplicated to allow offloading
            if (output == NULL) {
                output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
            }
        }
        for (int i = 0; i < n_layer; ++i) {
            const int64_t n_head_kv = hparams.n_head_kv(i);
            const int64_t n_embd_gqa = hparams.n_embd_v_gqa(i);
            auto & layer = layers[i];
            // norm
            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
            if (n_head_kv == 0) {
                // Mamba layer
                layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), {n_embd, 2*d_inner}, 0);
                layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {d_conv, d_inner}, 0);
                layer.ssm_conv1d_b = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "bias", i), {d_inner}, 0);
                layer.ssm_x = create_tensor(tn(LLM_TENSOR_SSM_X, "weight", i), {d_inner, dt_rank + 2*d_state}, 0);
                layer.ssm_dt_norm = create_tensor(tn(LLM_TENSOR_SSM_DT_NORM, "weight", i), {dt_rank}, 0);
                layer.ssm_dt = create_tensor(tn(LLM_TENSOR_SSM_DT, "weight", i), {dt_rank, d_inner}, 0);
                layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {d_inner}, 0);
                layer.ssm_b_norm = create_tensor(tn(LLM_TENSOR_SSM_B_NORM, "weight", i), {d_state}, 0);
                layer.ssm_c_norm = create_tensor(tn(LLM_TENSOR_SSM_C_NORM, "weight", i), {d_state}, 0);
                // no "weight" suffix for these
                layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {d_state, d_inner}, 0);
                layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {d_inner}, 0);
                // out_proj
                layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {d_inner, n_embd}, 0);
            } else {
                // Attention layers
                layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
                layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
                layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
                layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
            }
            layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
            layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, TENSOR_NOT_REQUIRED);
            if (layer.ffn_gate_inp) {
                // MoE
                layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
                layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff, n_embd, n_expert}, 0);
                layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
            } else {
                // FFN (no MoE)
                layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
                layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
                layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
            }
        }
    } break;
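// Granite Hybrid mixes Mamba-2-style SSM layers with attention layers, chosen per layer via
// hparams.is_recurrent(i). The FFN is a MoE when n_expert > 0 (optionally with Granite shared
// experts when n_ff_shexp > 0), otherwise a dense gated MLP with optional bias tensors.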
case LLM_ARCH_GRANITE_HYBRID:
    {
        // mamba2 Mixer SSM params
        // NOTE: int64_t for tensor dimensions
        const int64_t d_conv = hparams.ssm_d_conv;
        const int64_t d_inner = hparams.ssm_d_inner;
        const int64_t d_state = hparams.ssm_d_state;
        const int64_t n_ssm_head = hparams.ssm_dt_rank;
        const int64_t n_group = hparams.ssm_n_group;
        const int64_t d_in_proj = 2*d_inner + 2*n_group*d_state + n_ssm_head;
        // only an expansion factor of 2 is supported for now
        GGML_ASSERT(2 * n_embd == d_inner);
        // embeddings
        tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
        // output
        {
            output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
            output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
            // if output is NULL, init from the input tok embed, duplicated to allow offloading
            if (output == NULL) {
                output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
            }
        }
        for (int i = 0; i < n_layer; ++i) {
            auto & layer = layers[i];
            // norm
            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
            if (hparams.is_recurrent(i)) {
                // ssm layers
                layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), {n_embd, d_in_proj}, 0);
                layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {d_conv, d_inner + 2*n_group*d_state}, 0);
                layer.ssm_conv1d_b = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "bias", i), {d_inner + 2*n_group*d_state}, TENSOR_NOT_REQUIRED);
                layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {n_ssm_head}, 0);
                // no "weight" suffix for these
                layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {1, n_ssm_head}, 0);
                layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {1, n_ssm_head}, 0);
                layer.ssm_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", i), {d_inner / n_group, n_group}, 0);
                // out_proj
                layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {d_inner, n_embd}, 0);
            } else {
                // attention layers (with optional bias)
                const int64_t n_head_i = hparams.n_head(i);
                const int64_t n_embd_k_gqa_i = hparams.n_embd_k_gqa(i);
                const int64_t n_embd_v_gqa_i = hparams.n_embd_v_gqa(i);
                layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head_i}, 0);
                layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa_i}, 0);
                layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa_i}, 0);
                layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head_i, n_embd}, 0);
                layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
                layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_k_gqa_i}, TENSOR_NOT_REQUIRED);
                layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_v_gqa_i}, TENSOR_NOT_REQUIRED);
                layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
            }
            // feed forward (w/ optional biases)
            if (n_expert > 0) {
                // MoE FFN
                layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
                layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
                layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
                layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, TENSOR_NOT_REQUIRED);
                layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff, n_embd, n_expert}, 0);
                layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
                // For Granite MoE Shared
                if (hparams.n_ff_shexp > 0) {
                    layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, 0);
                    layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, 0);
                    layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {hparams.n_ff_shexp, n_embd}, 0);
                }
            } else {
                layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
                layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
                layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
                layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
                layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
                layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
                layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
                layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
            }
        }
    } break;
case LLM_ARCH_XVERSE:
    {
        tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
        output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
        output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
        for (int i = 0; i < n_layer; ++i) {
            auto & layer = layers[i];
            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
            layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
            layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
            layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
            layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
        }
    } break;
case LLM_ARCH_COMMAND_R:
    {
        tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
        // output
        output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
        // init output from the input tok embed
        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
        for (int i = 0; i < n_layer; ++i) {
            auto & layer = layers[i];
            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
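            // Per-head Q/K norms appear to be present only in the larger Command R variants;
            // the n_layer >= 64 check below is used as a proxy for those checkpoints.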
            if (n_layer >= 64) {
                layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k, n_head}, 0);
                layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k, n_head_kv}, 0);
            }
            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
            layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
            layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
            layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
        }
    } break;
case LLM_ARCH_COHERE2:
    {
        tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
        // output
        output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
        // init output from the input tok embed
        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
        for (int i = 0; i < n_layer; ++i) {
            auto & layer = layers[i];
            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
            layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
            layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
            layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
        }
    } break;
case LLM_ARCH_OLMO: // adapted from LLM_ARCH_LLAMA with norm params removed
    {
        tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
        // output
        output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
        // if output is NULL, init from the input tok embed
        if (output == NULL) {
            output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
        }
        for (int i = 0; i < n_layer; ++i) {
            auto & layer = layers[i];
            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
            layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
            layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
            layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
        }
    } break;
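// OLMo 2 applies normalization after attention and FFN (attn_post_norm / ffn_post_norm) and
// normalizes Q and K directly, so no pre-attention or pre-FFN norm tensors are loaded here.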
case LLM_ARCH_OLMO2:
    {
        const int64_t n_embd_head = n_embd / n_head;
        tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
        // output
        output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
        output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
        for (int i = 0; i < n_layer; ++i) {
            auto & layer = layers[i];
            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
            layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
            layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
            layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, 0);
            layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_head_kv * n_embd_head}, 0);
            layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
            layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
            layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
        }
    } break;
case LLM_ARCH_OLMOE:
    {
        tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
        // output
        output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
        output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
        for (int i = 0; i < n_layer; ++i) {
            auto & layer = layers[i];
            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
            layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
            layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
            layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, 0);
            layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, 0);
            layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
            layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
            if (n_expert == 0) {
                throw std::runtime_error("n_expert must be > 0");
            }
            if (n_expert_used == 0) {
                throw std::runtime_error("n_expert_used must be > 0");
            }
            // MoE branch
            layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
            layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff, n_embd, n_expert}, 0);
            layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
        }
    } break;
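// OpenELM varies the attention head count and FFN width per layer, so n_head, n_head_kv and
// n_ff are re-read from hparams inside the loop instead of using the model-wide values.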
case LLM_ARCH_OPENELM:
    {
        tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
        // output
        output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
        // init output from the input tok embed
        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
        for (int i = 0; i < n_layer; ++i) {
            const int64_t n_head = hparams.n_head(i);
            const int64_t n_head_qkv = 2*hparams.n_head_kv(i) + n_head;
            const int64_t n_ff = hparams.n_ff(i);
            auto & layer = layers[i];
            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
            layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_head_qkv*n_embd_head_k}, 0);
            layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
            layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_head*n_embd_head_k, n_embd}, 0);
            layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
            layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
        }
    } break;
case LLM_ARCH_GPTNEOX:
    {
        tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
        // output
        output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
        output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
        output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
        for (int i = 0; i < n_layer; ++i) {
            auto & layer = layers[i];
            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
            layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
            layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
            layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
            layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
            layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
            layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
            layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
            layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
            layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
        }
    } break;
case LLM_ARCH_ARCTIC:
    {
        tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
        // output
        output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
        output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
        // if output is NULL, init from the input tok embed
        if (output == NULL) {
            output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
        }
        for (int i = 0; i < n_layer; ++i) {
            auto & layer = layers[i];
            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
            layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
            layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
            layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_embd}, 0);
            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_embd, n_embd}, 0);
            layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_embd}, 0);
            layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
            layer.ffn_norm_exps = create_tensor(tn(LLM_TENSOR_FFN_NORM_EXPS, "weight", i), {n_embd}, 0);
            layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, false);
            layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff, n_embd, n_expert}, 0);
            layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
        }
    } break;
case LLM_ARCH_DEEPSEEK:
    {
        const int64_t n_ff_exp = hparams.n_ff_exp;
        const int64_t n_expert_shared = hparams.n_expert_shared;
        tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
        // output
        output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
        output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
        for (int i = 0; i < n_layer; ++i) {
            auto & layer = layers[i];
            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
            layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
            layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
            layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
            if (i < (int) hparams.n_layer_dense_lead) {
                layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
                layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
                layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
            } else {
                layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
                if (n_expert == 0) {
                    throw std::runtime_error("n_expert must be > 0");
                }
                if (n_expert_used == 0) {
                    throw std::runtime_error("n_expert_used must be > 0");
                }
                // MoE branch
                layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, 0);
                layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
                layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, 0);
                // Shared expert branch
                layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
                layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_exp * n_expert_shared, n_embd}, 0);
                layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
            }
        }
    } break;
case LLM_ARCH_DEEPSEEK2:
    {
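        // DeepSeek-V2-Lite is recognized by its 27-layer depth; the lite variant has no
        // low-rank Q projection (wq_a/wq_b) and loads a plain wq instead.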
        const bool is_lite = (hparams.n_layer == 27);
        const bool is_mla = (hparams.n_embd_head_k_mla != 0 && hparams.n_embd_head_v_mla != 0);
        // note: these are the actual head sizes you get when treating as MHA or after "decompression" using wv_b for MLA
        const int64_t n_embd_head_k_mla = is_mla ? hparams.n_embd_head_k_mla : hparams.n_embd_head_k;
        const int64_t n_embd_head_v_mla = is_mla ? hparams.n_embd_head_v_mla : hparams.n_embd_head_v;
        const int64_t n_embd_head_qk_rope = hparams.n_rot;
        const int64_t n_embd_head_qk_nope = n_embd_head_k_mla - n_embd_head_qk_rope;
        const int64_t q_lora_rank = hparams.n_lora_q;
        const int64_t kv_lora_rank = hparams.n_lora_kv;
        const int64_t n_ff_exp = hparams.n_ff_exp;
        const int64_t n_expert_shared = hparams.n_expert_shared;
        tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
        // output
        output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
        output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
        for (int i = 0; i < n_layer; ++i) {
            auto & layer = layers[i];
            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
            if (!is_lite) {
                layer.attn_q_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_A_NORM, "weight", i), {q_lora_rank}, 0);
            }
            layer.attn_kv_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_NORM, "weight", i), {kv_lora_rank}, 0);
            if (!is_lite) {
                layer.wq_a = create_tensor(tn(LLM_TENSOR_ATTN_Q_A, "weight", i), {n_embd, q_lora_rank}, 0);
                layer.wq_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_B, "weight", i), {q_lora_rank, n_head * n_embd_head_k_mla}, 0);
            } else {
                layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_head * n_embd_head_k_mla}, 0);
            }
            layer.wkv_a_mqa = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + n_embd_head_qk_rope}, 0);
            // note: only old, legacy GGUF files still contain the unsplit wkv_b tensor
            if (is_mla) {
                layer.wk_b = create_tensor(tn(LLM_TENSOR_ATTN_K_B, "weight", i), {n_embd_head_qk_nope, kv_lora_rank, n_head}, 0);
                layer.wv_b = create_tensor(tn(LLM_TENSOR_ATTN_V_B, "weight", i), {kv_lora_rank, n_embd_head_v_mla, n_head}, 0);
            } else {
                layer.wkv_b = create_tensor(tn(LLM_TENSOR_ATTN_KV_B, "weight", i), {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v_mla)}, 0);
            }
            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_head * n_embd_head_v_mla, n_embd}, 0);
            layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
            if (i < (int) hparams.n_layer_dense_lead) {
                layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
                layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
                layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
            } else {
                layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
                layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, TENSOR_NOT_REQUIRED);
                if (n_expert == 0) {
                    throw std::runtime_error("n_expert must be > 0");
                }
                if (n_expert_used == 0) {
                    throw std::runtime_error("n_expert_used must be > 0");
                }
                // MoE branch
                layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, 0);
                layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
                layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, 0);
                // Shared expert branch
                layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
                layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_exp * n_expert_shared, n_embd}, 0);
                layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
            }
        }
    } break;
case LLM_ARCH_PLM:
    {
        const int64_t n_embd_head_qk_rope = hparams.n_rot;
        const int64_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
        const int64_t kv_lora_rank = hparams.n_lora_kv;
        tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
        // output
        output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
        // output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
        for (int i = 0; i < n_layer; ++i) {
            auto & layer = layers[i];
            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
            layer.wkv_a_mqa = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + n_embd_head_qk_rope}, 0);
            layer.attn_kv_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_NORM, "weight", i), {kv_lora_rank}, 0);
            layer.wkv_b = create_tensor(tn(LLM_TENSOR_ATTN_KV_B, "weight", i), {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)}, 0);
            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_head * n_embd_head_v, n_embd}, 0);
            layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
            layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
        }
    } break;
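// BitNet keeps an optional single-element "scale" tensor next to each quantized weight; it is
// assumed here to hold the per-tensor scale used with the ternary (1.58-bit) weights.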
case LLM_ARCH_BITNET:
    {
        tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
        // output
        output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
        for (int i = 0; i < n_layer; ++i) {
            auto & layer = layers[i];
            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
            layer.attn_sub_norm = create_tensor(tn(LLM_TENSOR_ATTN_SUB_NORM, "weight", i), {n_embd}, 0);
            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
            layer.wq_scale = create_tensor(tn(LLM_TENSOR_ATTN_Q, "scale", i), {1}, TENSOR_NOT_REQUIRED);
            layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
            layer.wk_scale = create_tensor(tn(LLM_TENSOR_ATTN_K, "scale", i), {1}, TENSOR_NOT_REQUIRED);
            layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
            layer.wv_scale = create_tensor(tn(LLM_TENSOR_ATTN_V, "scale", i), {1}, TENSOR_NOT_REQUIRED);
            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
            layer.wo_scale = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "scale", i), {1}, TENSOR_NOT_REQUIRED);
            layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
            layer.ffn_sub_norm = create_tensor(tn(LLM_TENSOR_FFN_SUB_NORM, "weight", i), {n_ff}, 0);
            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
            layer.ffn_gate_scale = create_tensor(tn(LLM_TENSOR_FFN_GATE, "scale", i), {1}, TENSOR_NOT_REQUIRED);
            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
            layer.ffn_down_scale = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "scale", i), {1}, TENSOR_NOT_REQUIRED);
            layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
            layer.ffn_up_scale = create_tensor(tn(LLM_TENSOR_FFN_UP, "scale", i), {1}, TENSOR_NOT_REQUIRED);
        }
    } break;
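// T5 is an encoder-decoder model: each layer loads encoder tensors (ENC_*), decoder
// self-attention tensors (DEC_ATTN_*) and decoder cross-attention tensors (DEC_CROSS_ATTN_*),
// plus optional relative-attention bias tables with n_rel_attn_bkts buckets.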
case LLM_ARCH_T5:
    {
        const auto n_rel_attn_bkts = hparams.n_rel_attn_bkts;
        tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
        // output
        output_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_OUTPUT_NORM, "weight"), {n_embd}, 0);
        output_norm = create_tensor(tn(LLM_TENSOR_DEC_OUTPUT_NORM, "weight"), {n_embd}, 0);
        output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
        // if output is NULL, init from the input tok embed
        if (output == NULL) {
            output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
        }
        for (int i = 0; i < n_layer; ++i) {
            auto & layer = layers[i];
            layer.attn_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_NORM, "weight", i), {n_embd}, 0);
            layer.attn_rel_b_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_REL_B, "weight", i), {n_head, n_rel_attn_bkts}, TENSOR_NOT_REQUIRED);
            layer.wq_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_Q, "weight", i), {n_embd, n_embd_k_gqa}, 0);
            layer.wk_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
            layer.wv_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
            layer.wo_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_OUT, "weight", i), {n_embd_v_gqa, n_embd}, 0);
            layer.ffn_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_NORM, "weight", i), {n_embd}, 0);
            layer.ffn_gate_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_GATE, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED);
            layer.ffn_down_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
            layer.ffn_up_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
            layer.attn_norm = create_tensor(tn(LLM_TENSOR_DEC_ATTN_NORM, "weight", i), {n_embd}, 0);
            layer.attn_rel_b = create_tensor(tn(LLM_TENSOR_DEC_ATTN_REL_B, "weight", i), {n_head, n_rel_attn_bkts}, TENSOR_NOT_REQUIRED);
            layer.wq = create_tensor(tn(LLM_TENSOR_DEC_ATTN_Q, "weight", i), {n_embd, n_embd_k_gqa}, 0);
            layer.wk = create_tensor(tn(LLM_TENSOR_DEC_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
            layer.wv = create_tensor(tn(LLM_TENSOR_DEC_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
            layer.wo = create_tensor(tn(LLM_TENSOR_DEC_ATTN_OUT, "weight", i), {n_embd_v_gqa, n_embd}, 0);
            layer.attn_norm_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_NORM, "weight", i), {n_embd}, 0);
            // this tensor seems to be unused in HF transformers implementation
            layer.attn_rel_b_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_REL_B, "weight", i), {n_head, n_rel_attn_bkts}, TENSOR_NOT_REQUIRED);
            layer.wq_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_Q, "weight", i), {n_embd, n_embd_k_gqa}, 0);
            layer.wk_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
            layer.wv_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
            layer.wo_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_OUT, "weight", i), {n_embd_v_gqa, n_embd}, 0);
            layer.ffn_norm = create_tensor(tn(LLM_TENSOR_DEC_FFN_NORM, "weight", i), {n_embd}, 0);
            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_DEC_FFN_GATE, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED);
            layer.ffn_down = create_tensor(tn(LLM_TENSOR_DEC_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
            layer.ffn_up = create_tensor(tn(LLM_TENSOR_DEC_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
        }
    } break;
case LLM_ARCH_T5ENCODER:
    {
        const auto n_rel_attn_bkts = hparams.n_rel_attn_bkts;
        tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
        // output
        output_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_OUTPUT_NORM, "weight"), {n_embd}, 0);
        output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
        // if output is NULL, init from the input tok embed
        if (output == NULL) {
            output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
        }
        for (int i = 0; i < n_layer; ++i) {
            auto & layer = layers[i];
            layer.attn_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_NORM, "weight", i), {n_embd}, 0);
            layer.attn_rel_b_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_REL_B, "weight", i), {n_head, n_rel_attn_bkts}, TENSOR_NOT_REQUIRED);
            layer.wq_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_Q, "weight", i), {n_embd, n_embd_k_gqa}, 0);
            layer.wk_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
            layer.wv_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
            layer.wo_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_OUT, "weight", i), {n_embd_v_gqa, n_embd}, 0);
            layer.ffn_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_NORM, "weight", i), {n_embd}, 0);
            layer.ffn_gate_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_GATE, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED);
            layer.ffn_down_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
            layer.ffn_up_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
        }
    } break;
case LLM_ARCH_JAIS:
    {
        tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
        // output
        output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
        output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
        output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
        for (int i = 0; i < n_layer; ++i) {
            auto & layer = layers[i];
            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
            layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
            layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
            layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
            layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
            layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
            layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
            layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
            layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, 0);
            layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
            layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
        }
    } break;
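// ChatGLM (and GLM4 below) may ship either a fused QKV tensor or split Q/K/V tensors depending
// on how the GGUF was exported; the split path is used when wqkv is absent. The FFN up
// projection is {n_embd, n_ff * 2}, which suggests the gate and up halves of the SwiGLU are
// packed into one tensor -- an inference from the shapes here, not a documented guarantee.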
case LLM_ARCH_CHATGLM:
    {
        tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
        // output
        output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
        output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
        // if output is NULL, init from the input tok embed
        if (output == NULL) {
            output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
        }
        for (int i = 0; i < n_layer; ++i) {
            auto & layer = layers[i];
            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
            layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
            layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
            if (layer.wqkv == nullptr) {
                layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
                layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
                layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
                layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
                layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
                layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
            }
            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
            layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
            layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff * 2}, 0);
            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
        }
    } break;
case LLM_ARCH_GLM4:
    {
        tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
        // output
        output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
        output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
        // if output is NULL, init from the input tok embed
        if (output == NULL) {
            output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
        }
        for (int i = 0; i < n_layer; ++i) {
            auto & layer = layers[i];
            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
            layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
            layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
            if (layer.wqkv == nullptr) {
                layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
                layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
                layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
                layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
                layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
                layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
            }
            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
            layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
            layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
            layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff * 2}, 0);
            layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
        }
    } break;
  3625. case LLM_ARCH_NEMOTRON:
  3626. {
  3627. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  3628. // output
  3629. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  3630. output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
  3631. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  3632. for (int i = 0; i < n_layer; ++i) {
  3633. auto & layer = layers[i];
  3634. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  3635. layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
  3636. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  3637. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  3638. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  3639. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  3640. // optional bias tensors
  3641. layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  3642. layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
  3643. layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
  3644. layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  3645. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  3646. layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
  3647. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  3648. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  3649. // optional MLP bias
  3650. layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  3651. layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
  3652. }
  3653. } break;
  3654. case LLM_ARCH_EXAONE:
  3655. {
  3656. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  3657. // output
  3658. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  3659. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  3660. // if output is NULL, init from the input tok embed
  3661. if (output == NULL) {
  3662. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  3663. }
  3664. for (int i = 0; i < n_layer; ++i) {
  3665. auto & layer = layers[i];
  3666. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  3667. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
  3668. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  3669. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
  3670. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
  3671. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  3672. layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
  3673. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  3674. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  3675. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  3676. }
  3677. } break;
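// rope_freqs holds the RoPE frequency scaling factors ({n_rot/2}); the GGUF file stores it at
// most once, so only layer 0 loads it as a regular optional tensor while every other layer
// passes TENSOR_DUPLICATED to re-reference the same data:
//
//   layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2},
//                                    TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));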
  3678. case LLM_ARCH_EXAONE4:
  3679. {
  3680. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  3681. // output
  3682. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  3683. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  3684. // if output is NULL, init from the input tok embed
  3685. if (output == NULL) {
  3686. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  3687. }
  3688. for (int i = 0; i < n_layer; ++i) {
  3689. auto & layer = layers[i];
  3690. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
  3691. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  3692. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
  3693. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  3694. layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
  3695. layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
  3696. layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
  3697. layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
  3698. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  3699. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  3700. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  3701. layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
  3702. }
  3703. } break;
  3704. case LLM_ARCH_RWKV6:
  3705. {
  3706. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  3707. // Block 0, LN0
  3708. tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0);
  3709. tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}, 0);
  3710. // output
  3711. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  3712. output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
  3713. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  3714. const int time_mix_extra_dim = hparams.time_mix_extra_dim;
  3715. const int time_decay_extra_dim = hparams.time_decay_extra_dim;
  3716. const int head_size = hparams.wkv_head_size;
  3717. const int attn_hidden_size = n_embd;
  3718. const int ffn_size = hparams.n_ff_arr[0];
  3719. for (int i = 0; i < n_layer; ++i) {
  3720. auto & layer = layers[i];
  3721. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  3722. layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
  3723. layer.attn_norm_2 = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, 0);
  3724. layer.attn_norm_2_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, 0);
  3725. layer.time_mix_w1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W1, "weight", i), {n_embd, time_mix_extra_dim * 5}, 0);
  3726. layer.time_mix_w2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W2, "weight", i), {time_mix_extra_dim, n_embd, 5}, 0);
  3727. layer.time_mix_lerp_x = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_X, "weight", i), {n_embd, 1, 1}, 0);
  3728. layer.time_mix_lerp_w = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_W, "weight", i), {n_embd, 1, 1}, TENSOR_NOT_REQUIRED);
  3729. layer.time_mix_lerp_k = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_K, "weight", i), {n_embd, 1, 1}, TENSOR_NOT_REQUIRED);
  3730. layer.time_mix_lerp_v = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_V, "weight", i), {n_embd, 1, 1}, TENSOR_NOT_REQUIRED);
  3731. layer.time_mix_lerp_r = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_R, "weight", i), {n_embd, 1, 1}, TENSOR_NOT_REQUIRED);
  3732. layer.time_mix_lerp_g = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_G, "weight", i), {n_embd, 1, 1}, TENSOR_NOT_REQUIRED);
  3733. layer.time_mix_lerp_fused = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_FUSED, "weight", i), {n_embd, 1, 1, 5}, TENSOR_NOT_REQUIRED);
  3734. GGML_ASSERT(!(layer.time_mix_lerp_fused == NULL && layer.time_mix_lerp_w == NULL));
  3735. layer.time_mix_first = create_tensor(tn(LLM_TENSOR_TIME_MIX_FIRST, "weight", i), {head_size, n_embd / head_size}, 0);
  3736. layer.time_mix_decay = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY, "weight", i), {n_embd}, 0);
  3737. layer.time_mix_decay_w1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY_W1, "weight", i), {n_embd, time_decay_extra_dim}, 0);
  3738. layer.time_mix_decay_w2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY_W2, "weight", i), {time_decay_extra_dim, attn_hidden_size}, 0);
  3739. layer.time_mix_key = create_tensor(tn(LLM_TENSOR_TIME_MIX_KEY, "weight", i), {attn_hidden_size, n_embd}, 0);
  3740. layer.time_mix_value = create_tensor(tn(LLM_TENSOR_TIME_MIX_VALUE, "weight", i), {attn_hidden_size, n_embd}, 0);
  3741. layer.time_mix_receptance = create_tensor(tn(LLM_TENSOR_TIME_MIX_RECEPTANCE, "weight", i), {attn_hidden_size, n_embd}, 0);
  3742. layer.time_mix_gate = create_tensor(tn(LLM_TENSOR_TIME_MIX_GATE, "weight", i), {attn_hidden_size, n_embd}, 0);
  3743. layer.time_mix_ln = create_tensor(tn(LLM_TENSOR_TIME_MIX_LN, "weight", i), {n_embd}, 0);
  3744. layer.time_mix_ln_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_LN, "bias", i), {n_embd}, 0);
  3745. layer.time_mix_output = create_tensor(tn(LLM_TENSOR_TIME_MIX_OUTPUT, "weight", i), {n_embd, attn_hidden_size}, 0);
  3746. layer.channel_mix_lerp_k = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_LERP_K, "weight", i), {n_embd, 1, 1}, 0);
  3747. layer.channel_mix_lerp_r = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_LERP_R, "weight", i), {n_embd, 1, 1}, 0);
  3748. layer.channel_mix_key = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_KEY, "weight", i), {n_embd, ffn_size}, 0);
  3749. layer.channel_mix_value = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_VALUE, "weight", i), {ffn_size, n_embd}, 0);
  3750. layer.channel_mix_receptance = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_RECEPTANCE, "weight", i), {n_embd, n_embd}, 0);
  3751. }
  3752. } break;
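// RWKV6 token-shift interpolation weights come in two layouts: either five separate
// {n_embd, 1, 1} lerp tensors (w/k/v/r/g) or a single fused {n_embd, 1, 1, 5} tensor.
// All of them are loaded as optional above; the GGML_ASSERT only requires that at least
// one of the two layouts is present (the fused tensor or the separate w-lerp).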
  3753. case LLM_ARCH_RWKV6QWEN2:
  3754. {
  3755. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  3756. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  3757. output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, TENSOR_NOT_REQUIRED);
  3758. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  3759. const int time_mix_extra_dim = hparams.time_mix_extra_dim;
  3760. const int time_decay_extra_dim = hparams.time_decay_extra_dim;
  3761. const int head_size = hparams.wkv_head_size;
  3762. const int attn_hidden_size = n_embd;
  3763. const int n_head_kv = hparams.n_head_kv();
  3764. int attn_key_value_size;
  3765. if (n_head_kv == 0 || attn_hidden_size / head_size == n_head_kv) {
  3766. attn_key_value_size = attn_hidden_size;
  3767. } else {
  3768. attn_key_value_size = n_head_kv * head_size;
  3769. }
  3770. for (int i = 0; i < n_layer; ++i) {
  3771. auto & layer = layers[i];
  3772. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  3773. layer.time_mix_w1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W1, "weight", i), {n_embd, time_mix_extra_dim * 5}, 0);
  3774. layer.time_mix_w2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W2, "weight", i), {time_mix_extra_dim, n_embd, 5}, 0);
  3775. layer.time_mix_lerp_x = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_X, "weight", i), {n_embd, 1, 1}, 0);
  3776. layer.time_mix_lerp_fused = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_FUSED, "weight", i), {n_embd, 1, 1, 5}, 0);
  3777. layer.time_mix_first = create_tensor(tn(LLM_TENSOR_TIME_MIX_FIRST, "weight", i), {head_size, n_embd / head_size}, TENSOR_NOT_REQUIRED);
  3778. layer.time_mix_decay = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY, "weight", i), {n_embd}, 0);
  3779. layer.time_mix_decay_w1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY_W1, "weight", i), {n_embd, time_decay_extra_dim}, 0);
  3780. layer.time_mix_decay_w2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY_W2, "weight", i), {time_decay_extra_dim, attn_hidden_size}, 0);
  3781. layer.time_mix_key = create_tensor(tn(LLM_TENSOR_TIME_MIX_KEY, "weight", i), {n_embd, attn_key_value_size}, 0);
  3782. layer.time_mix_value = create_tensor(tn(LLM_TENSOR_TIME_MIX_VALUE, "weight", i), {n_embd, attn_key_value_size}, 0);
  3783. layer.time_mix_receptance = create_tensor(tn(LLM_TENSOR_TIME_MIX_RECEPTANCE, "weight", i), {attn_hidden_size, n_embd}, 0);
  3784. layer.time_mix_gate = create_tensor(tn(LLM_TENSOR_TIME_MIX_GATE, "weight", i), {attn_hidden_size, n_embd}, 0);
  3785. // optional bias tensors
  3786. layer.time_mix_key_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_KEY, "bias", i), {attn_key_value_size}, TENSOR_NOT_REQUIRED);
  3787. layer.time_mix_value_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_VALUE, "bias", i), {attn_key_value_size}, TENSOR_NOT_REQUIRED);
  3788. layer.time_mix_receptance_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_RECEPTANCE, "bias", i), {attn_hidden_size}, TENSOR_NOT_REQUIRED);
  3789. layer.time_mix_output = create_tensor(tn(LLM_TENSOR_TIME_MIX_OUTPUT, "weight", i), {n_embd, attn_hidden_size}, 0);
  3790. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  3791. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  3792. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  3793. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  3794. }
  3795. } break;
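// RWKV6QWEN2 allows grouped key/value heads: when n_head_kv is set and differs from
// n_embd / head_size, the key and value projections shrink to n_head_kv * head_size
// (attn_key_value_size above) - the RWKV analogue of grouped-query attention.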
  3796. case LLM_ARCH_RWKV7:
  3797. {
  3798. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  3799. // Block 0, LN0
  3800. tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0);
  3801. tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}, 0);
  3802. // output
  3803. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  3804. output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
  3805. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  3806. const int n_lora_decay = hparams.n_lora_decay;
  3807. const int n_lora_iclr = hparams.n_lora_iclr;
  3808. const int n_lora_value_res_mix = hparams.n_lora_value_res_mix;
  3809. const int n_lora_gate = hparams.n_lora_gate;
  3810. const int attn_hidden_size = n_embd;
  3811. const int ffn_size = hparams.n_ff_arr[0];
  3812. for (int i = 0; i < n_layer; ++i) {
  3813. auto & layer = layers[i];
  3814. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  3815. layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
  3816. layer.attn_norm_2 = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, 0);
  3817. layer.attn_norm_2_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, 0);
  3818. layer.time_mix_w0 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W0, "weight", i), {n_embd}, 0);
  3819. layer.time_mix_w1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W1, "weight", i), {n_embd, n_lora_decay}, 0);
  3820. layer.time_mix_w2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W2, "weight", i), {n_lora_decay, n_embd}, 0);
  3821. layer.time_mix_a0 = create_tensor(tn(LLM_TENSOR_TIME_MIX_A0, "weight", i), {n_embd}, 0);
  3822. layer.time_mix_a1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_A1, "weight", i), {n_embd, n_lora_iclr}, 0);
  3823. layer.time_mix_a2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_A2, "weight", i), {n_lora_iclr, n_embd}, 0);
  3824. if (i == 0) {
3825. // present in the checkpoint but not used: layer 0 is itself the source of the value-residual mix
  3826. layer.time_mix_v0 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V0, "weight", i), {n_embd}, 0);
  3827. layer.time_mix_v1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V1, "weight", i), {n_embd, n_lora_iclr}, 0);
  3828. layer.time_mix_v2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V2, "weight", i), {n_lora_iclr, n_embd}, 0);
  3829. } else {
  3830. layer.time_mix_v0 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V0, "weight", i), {n_embd}, 0);
  3831. layer.time_mix_v1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V1, "weight", i), {n_embd, n_lora_value_res_mix}, 0);
  3832. layer.time_mix_v2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V2, "weight", i), {n_lora_value_res_mix, n_embd}, 0);
  3833. }
  3834. layer.time_mix_g1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_G1, "weight", i), {n_embd, n_lora_gate}, 0);
  3835. layer.time_mix_g2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_G2, "weight", i), {n_lora_gate, n_embd}, 0);
  3836. layer.time_mix_lerp_fused = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_FUSED, "weight", i), {n_embd, 1, 1, 6}, 0);
  3837. layer.time_mix_k_k = create_tensor(tn(LLM_TENSOR_TIME_MIX_K_K, "weight", i), {attn_hidden_size}, 0);
  3838. layer.time_mix_k_a = create_tensor(tn(LLM_TENSOR_TIME_MIX_K_A, "weight", i), {attn_hidden_size}, 0);
  3839. layer.time_mix_r_k = create_tensor(tn(LLM_TENSOR_TIME_MIX_R_K, "weight", i), {attn_hidden_size}, 0);
  3840. layer.time_mix_key = create_tensor(tn(LLM_TENSOR_TIME_MIX_KEY, "weight", i), {attn_hidden_size, n_embd}, 0);
  3841. layer.time_mix_value = create_tensor(tn(LLM_TENSOR_TIME_MIX_VALUE, "weight", i), {attn_hidden_size, n_embd}, 0);
  3842. layer.time_mix_receptance = create_tensor(tn(LLM_TENSOR_TIME_MIX_RECEPTANCE, "weight", i), {attn_hidden_size, n_embd}, 0);
  3843. layer.time_mix_ln = create_tensor(tn(LLM_TENSOR_TIME_MIX_LN, "weight", i), {n_embd}, 0);
  3844. layer.time_mix_ln_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_LN, "bias", i), {n_embd}, 0);
  3845. layer.time_mix_output = create_tensor(tn(LLM_TENSOR_TIME_MIX_OUTPUT, "weight", i), {n_embd, attn_hidden_size}, 0);
  3846. layer.channel_mix_lerp_k = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_LERP_K, "weight", i), {n_embd, 1, 1}, 0);
  3847. layer.channel_mix_key = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_KEY, "weight", i), {n_embd, ffn_size}, 0);
  3848. layer.channel_mix_value = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_VALUE, "weight", i), {ffn_size, n_embd}, 0);
  3849. }
  3850. } break;
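// RWKV7 parameterizes its decay (w), in-context learning rate (a), value residual (v) and
// gate (g) through low-rank pairs: a down-projection w1/a1/v1/g1 of shape {n_embd, n_lora_*}
// followed by an up-projection w2/a2/v2/g2 of shape {n_lora_*, n_embd}. The fused token-shift
// tensor carries 6 components here ({n_embd, 1, 1, 6}) instead of the 5 used by RWKV6.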
  3851. case LLM_ARCH_ARWKV7:
  3852. {
  3853. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  3854. // output
  3855. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  3856. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  3857. const int n_lora_decay = hparams.n_lora_decay;
  3858. const int n_lora_iclr = hparams.n_lora_iclr;
  3859. const int n_lora_value_res_mix = hparams.n_lora_value_res_mix;
  3860. const int n_lora_gate = hparams.n_lora_gate;
  3861. const int attn_hidden_size = n_embd;
  3862. for (int i = 0; i < n_layer; ++i) {
  3863. auto & layer = layers[i];
  3864. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  3865. layer.time_mix_w0 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W0, "weight", i), {n_embd}, 0);
  3866. layer.time_mix_w1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W1, "weight", i), {n_embd, n_lora_decay}, 0);
  3867. layer.time_mix_w2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W2, "weight", i), {n_lora_decay, n_embd}, 0);
  3868. layer.time_mix_a0 = create_tensor(tn(LLM_TENSOR_TIME_MIX_A0, "weight", i), {n_embd}, 0);
  3869. layer.time_mix_a1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_A1, "weight", i), {n_embd, n_lora_iclr}, 0);
  3870. layer.time_mix_a2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_A2, "weight", i), {n_lora_iclr, n_embd}, 0);
  3871. if (i == 0) {
3872. // present in the checkpoint but not used: layer 0 is itself the source of the value-residual mix
  3873. layer.time_mix_v0 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V0, "weight", i), {n_embd}, 0);
  3874. layer.time_mix_v1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V1, "weight", i), {n_embd, n_lora_iclr}, 0);
  3875. layer.time_mix_v2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V2, "weight", i), {n_lora_iclr, n_embd}, 0);
  3876. } else {
  3877. layer.time_mix_v0 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V0, "weight", i), {n_embd}, 0);
  3878. layer.time_mix_v1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V1, "weight", i), {n_embd, n_lora_value_res_mix}, 0);
  3879. layer.time_mix_v2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V2, "weight", i), {n_lora_value_res_mix, n_embd}, 0);
  3880. }
  3881. layer.time_mix_g1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_G1, "weight", i), {n_embd, n_lora_gate}, TENSOR_NOT_REQUIRED);
  3882. layer.time_mix_g2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_G2, "weight", i), {n_lora_gate, n_embd}, TENSOR_NOT_REQUIRED);
  3883. try {
  3884. layer.time_mix_lerp_fused = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_FUSED, "weight", i), {n_embd, 1, 1, 6}, 0);
  3885. } catch(std::runtime_error & e) {
3886. // ARWKV models without gate tensors store only 5 lerp components, so retry with a 5-component shape
  3887. layer.time_mix_lerp_fused = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_FUSED, "weight", i), {n_embd, 1, 1, 5}, 0);
  3888. }
  3889. layer.time_mix_k_k = create_tensor(tn(LLM_TENSOR_TIME_MIX_K_K, "weight", i), {attn_hidden_size}, 0);
  3890. layer.time_mix_k_a = create_tensor(tn(LLM_TENSOR_TIME_MIX_K_A, "weight", i), {attn_hidden_size}, 0);
  3891. layer.time_mix_r_k = create_tensor(tn(LLM_TENSOR_TIME_MIX_R_K, "weight", i), {attn_hidden_size}, 0);
  3892. layer.time_mix_key = create_tensor(tn(LLM_TENSOR_TIME_MIX_KEY, "weight", i), {attn_hidden_size, n_embd}, 0);
  3893. layer.time_mix_value = create_tensor(tn(LLM_TENSOR_TIME_MIX_VALUE, "weight", i), {attn_hidden_size, n_embd}, 0);
  3894. layer.time_mix_receptance = create_tensor(tn(LLM_TENSOR_TIME_MIX_RECEPTANCE, "weight", i), {attn_hidden_size, n_embd}, 0);
  3895. layer.time_mix_ln = create_tensor(tn(LLM_TENSOR_TIME_MIX_LN, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
  3896. layer.time_mix_ln_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_LN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  3897. layer.time_mix_output = create_tensor(tn(LLM_TENSOR_TIME_MIX_OUTPUT, "weight", i), {n_embd, attn_hidden_size}, 0);
  3898. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  3899. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  3900. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  3901. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  3902. }
  3903. } break;
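// ARWKV7 keeps the RWKV7 time-mix block but replaces channel mixing with a standard gated
// FFN (ffn_gate/ffn_up/ffn_down). The gate low-rank tensors are optional, and the try/catch
// above falls back from a 6-component fused lerp tensor to a 5-component one for checkpoints
// exported without the gate.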
  3904. case LLM_ARCH_CHAMELEON:
  3905. {
  3906. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  3907. // output
  3908. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  3909. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  3910. // if output is NULL, init from the input tok embed
  3911. if (output == NULL) {
  3912. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  3913. }
  3914. for (int i = 0; i < n_layer; ++i) {
  3915. auto & layer = layers[i];
  3916. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  3917. layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k, n_head}, 0);
  3918. layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k, n_head_kv}, 0);
  3919. layer.attn_q_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {n_embd_head_k, n_head}, TENSOR_NOT_REQUIRED);
  3920. layer.attn_k_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {n_embd_head_k, n_head_kv}, TENSOR_NOT_REQUIRED);
  3921. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  3922. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  3923. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  3924. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  3925. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  3926. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  3927. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  3928. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  3929. }
  3930. } break;
  3931. case LLM_ARCH_WAVTOKENIZER_DEC:
  3932. {
  3933. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {hparams.n_embd_features, n_vocab}, 0);
  3934. conv1d = create_tensor(tn(LLM_TENSOR_CONV1D, "weight"), {7, hparams.n_embd_features, hparams.posnet.n_embd}, 0);
  3935. conv1d_b = create_tensor(tn(LLM_TENSOR_CONV1D, "bias"), {1, hparams.posnet.n_embd}, 0);
  3936. // posnet
  3937. {
  3938. const int64_t n_embd = hparams.posnet.n_embd;
  3939. for (uint32_t i = 0; i < hparams.posnet.n_layer; ++i) {
  3940. auto & layer = layers[i].posnet;
  3941. // posnet:
  3942. //
  3943. // - resnet
  3944. // - resnet
  3945. // - attn
  3946. // - resnet
  3947. // - resnet
  3948. // - norm
  3949. //
  3950. switch (i) {
  3951. case 0:
  3952. case 1:
  3953. case 3:
  3954. case 4:
  3955. {
  3956. layer.norm1 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "weight", i), {1, n_embd}, 0);
  3957. layer.norm1_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "bias", i), {1, n_embd}, 0);
  3958. layer.conv1 = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "weight", i), {3, n_embd, n_embd}, 0);
  3959. layer.conv1_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "bias", i), {1, n_embd}, 0);
  3960. layer.norm2 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "weight", i), {1, n_embd}, 0);
  3961. layer.norm2_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "bias", i), {1, n_embd}, 0);
  3962. layer.conv2 = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "weight", i), {3, n_embd, n_embd}, 0);
  3963. layer.conv2_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "bias", i), {1, n_embd}, 0);
  3964. } break;
  3965. case 2:
  3966. {
  3967. layer.attn_norm = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "weight", i), {1, n_embd}, 0);
  3968. layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "bias", i), {1, n_embd}, 0);
  3969. layer.attn_q = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_Q, "weight", i), {1, n_embd, n_embd}, 0);
  3970. layer.attn_q_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_Q, "bias", i), {1, n_embd}, 0);
  3971. layer.attn_k = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_K, "weight", i), {1, n_embd, n_embd}, 0);
  3972. layer.attn_k_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_K, "bias", i), {1, n_embd}, 0);
  3973. layer.attn_v = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_V, "weight", i), {1, n_embd, n_embd}, 0);
  3974. layer.attn_v_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_V, "bias", i), {1, n_embd}, 0);
  3975. layer.attn_o = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_OUT, "weight", i), {1, n_embd, n_embd}, 0);
  3976. layer.attn_o_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_OUT, "bias", i), {1, n_embd}, 0);
  3977. } break;
  3978. case 5:
  3979. {
  3980. layer.norm = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "weight", i), {1, n_embd}, 0);
  3981. layer.norm_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "bias", i), {1, n_embd}, 0);
  3982. } break;
  3983. default: GGML_ABORT("unknown posnet layer");
  3984. };
  3985. }
  3986. }
  3987. GGML_ASSERT(hparams.posnet.n_embd == hparams.convnext.n_embd);
  3988. tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {hparams.posnet.n_embd}, 0);
  3989. tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {hparams.posnet.n_embd}, 0);
  3990. // convnext
  3991. {
  3992. const int64_t n_embd = hparams.convnext.n_embd;
  3993. for (uint32_t i = 0; i < hparams.convnext.n_layer; ++i) {
  3994. auto & layer = layers[i].convnext;
  3995. layer.dw = create_tensor(tn(LLM_TENSOR_CONVNEXT_DW, "weight", i), {7, 1, n_embd}, 0);
  3996. layer.dw_b = create_tensor(tn(LLM_TENSOR_CONVNEXT_DW, "bias", i), {1, n_embd}, 0);
  3997. layer.norm = create_tensor(tn(LLM_TENSOR_CONVNEXT_NORM, "weight", i), {n_embd}, 0);
  3998. layer.norm_b = create_tensor(tn(LLM_TENSOR_CONVNEXT_NORM, "bias", i), {n_embd}, 0);
  3999. layer.pw1 = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW1, "weight", i), {n_embd, n_ff}, 0);
  4000. layer.pw1_b = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW1, "bias", i), {n_ff}, 0);
  4001. layer.pw2 = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW2, "weight", i), {n_ff, n_embd}, 0);
  4002. layer.pw2_b = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW2, "bias", i), {n_embd}, 0);
  4003. layer.gamma = create_tensor(tn(LLM_TENSOR_CONVNEXT_GAMMA, "weight", i), {n_embd}, 0);
  4004. }
  4005. // output
  4006. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  4007. output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
  4008. }
  4009. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {hparams.convnext.n_embd, n_embd}, 0);
  4010. output_b = create_tensor(tn(LLM_TENSOR_OUTPUT, "bias"), {n_embd}, 0);
  4011. } break;
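// WAVTOKENIZER_DEC is a convolutional decoder rather than a transformer: a 1D input conv
// ({kernel = 7, in = n_embd_features, out = posnet.n_embd}), a 6-stage posnet whose layer
// index selects resnet / attention / norm blocks in the switch above, and a stack of ConvNeXt
// blocks (depthwise conv + norm + pointwise MLP + gamma scale) before the final output head.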
  4012. case LLM_ARCH_BAILINGMOE:
  4013. {
  4014. const int64_t n_ff_exp = hparams.n_ff_exp;
  4015. const int64_t n_expert_shared = hparams.n_expert_shared;
  4016. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  4017. // output
  4018. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  4019. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  4020. for (int i = 0; i < n_layer; ++i) {
  4021. auto & layer = layers[i];
  4022. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  4023. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_head * n_rot}, 0);
  4024. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_head_kv * n_rot}, 0);
  4025. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_head_kv * n_rot}, 0);
  4026. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_head * n_rot, n_embd}, 0);
  4027. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  4028. layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
  4029. if (n_expert == 0) {
  4030. throw std::runtime_error("n_expert must be > 0");
  4031. }
  4032. if (n_expert_used == 0) {
  4033. throw std::runtime_error("n_expert_used must be > 0");
  4034. }
  4035. layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
  4036. layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
  4037. layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
  4038. layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
  4039. layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_exp * n_expert_shared, n_embd}, 0);
  4040. layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
  4041. }
  4042. } break;
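// MoE weights are stored as one 3-D tensor per projection with the expert index on the last
// dimension, e.g. {n_embd, n_ff_exp, n_expert} for the gate/up projections and
// {n_ff_exp, n_embd, n_expert} for the down projection; ffn_gate_inp is the router.
// Shared experts ("shexp") stay dense with width n_ff_exp * n_expert_shared.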
  4043. case LLM_ARCH_DOTS1:
  4044. {
  4045. const int64_t n_ff_exp = hparams.n_ff_exp;
  4046. const int64_t n_expert_shared = hparams.n_expert_shared;
  4047. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  4048. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  4049. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  4050. for (int i = 0; i < n_layer; ++i) {
  4051. auto & layer = layers[i];
  4052. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  4053. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
  4054. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
  4055. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
  4056. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
  4057. layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
  4058. layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
  4059. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  4060. if (i < (int) hparams.n_layer_dense_lead) {
  4061. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  4062. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  4063. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  4064. } else {
  4065. layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
  4066. layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, TENSOR_NOT_REQUIRED);
  4067. if (n_expert == 0) {
  4068. throw std::runtime_error("n_expert must be > 0");
  4069. }
  4070. if (n_expert_used == 0) {
  4071. throw std::runtime_error("n_expert_used must be > 0");
  4072. }
  4073. // MoE branch
  4074. layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
  4075. layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
  4076. layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
  4077. // Shared expert branch
  4078. layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
  4079. layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_exp * n_expert_shared, n_embd}, 0);
  4080. layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
  4081. }
  4082. }
  4083. } break;
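// Like several other MoE architectures in this file, DOTS1 uses hparams.n_layer_dense_lead:
// the first n_layer_dense_lead layers get a plain dense FFN, all later layers get the router,
// the per-expert tensors and the shared-expert branch.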
  4084. case LLM_ARCH_ARCEE:
  4085. {
  4086. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  4087. // output
  4088. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  4089. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  4090. // if output is NULL, init from the input tok embed
  4091. if (output == NULL) {
  4092. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  4093. }
  4094. for (int i = 0; i < n_layer; ++i) {
  4095. auto & layer = layers[i];
  4096. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  4097. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
  4098. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  4099. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
  4100. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
  4101. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  4102. layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
  4103. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  4104. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  4105. }
  4106. } break;
  4107. case LLM_ARCH_ERNIE4_5:
  4108. case LLM_ARCH_ERNIE4_5_MOE:
  4109. {
  4110. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  4111. // output
  4112. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  4113. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  4114. // if output is NULL, init from the input tok embed
  4115. if (output == NULL) {
  4116. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  4117. }
  4118. for (int i = 0; i < n_layer; ++i) {
  4119. auto & layer = layers[i];
  4120. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  4121. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
  4122. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  4123. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  4124. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
  4125. // optional bias tensors
  4126. layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  4127. layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
  4128. layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
  4129. layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  4130. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  4131. if (arch == LLM_ARCH_ERNIE4_5_MOE && static_cast<uint32_t>(i) >= hparams.n_layer_dense_lead) { // MoE layers
  4132. int n_ff_exp = hparams.n_ff_exp;
  4133. layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
  4134. layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, TENSOR_NOT_REQUIRED);
  4135. layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, TENSOR_NOT_REQUIRED);
  4136. layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff_exp, n_embd, n_expert}, 0);
  4137. layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, 0);
  4138. // Shared expert (if present)
  4139. if (hparams.n_ff_shexp > 0) {
  4140. layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), { n_embd, hparams.n_ff_shexp}, 0);
  4141. layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {hparams.n_ff_shexp, n_embd }, 0);
  4142. layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), { n_embd, hparams.n_ff_shexp}, 0);
  4143. }
  4144. } else { // Dense layers
  4145. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  4146. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  4147. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  4148. }
  4149. }
  4150. } break;
  4151. case LLM_ARCH_FALCON_H1:
  4152. {
  4153. // Common
  4154. const int64_t hidden_size = hparams.n_embd; // hidden_size
  4155. // mamba2 Mixer SSM params
  4156. const int64_t ssm_conv_kernel_size = hparams.ssm_d_conv; // ssm_conv_kernel_size
  4157. const int64_t ssm_n_groups = hparams.ssm_n_group; // ssm_n_groups
  4158. const int64_t ssm_state_size = hparams.ssm_d_state; // ssm_state_size
  4159. const int64_t ssm_intermediate_size = hparams.ssm_d_inner; // TODO expand
  4160. const int64_t ssm_num_heads = hparams.ssm_dt_rank; // ssm_num_heads
  4161. const int64_t ssm_conv_dim = ssm_intermediate_size + 2 * ssm_n_groups * ssm_state_size;
  4162. const int64_t ssm_projection_size = ssm_intermediate_size + ssm_conv_dim + ssm_num_heads;
  4163. // attn params
4164. const int64_t attn_num_attention_head = hparams.n_head(0); // attn_num_attention_head
  4165. const int64_t attn_num_key_value_head = hparams.n_head_kv(0);
  4166. // ffn params
  4167. const int64_t ffn_intermediate_size = hparams.n_ff(0);
  4168. // embeddings
  4169. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {hidden_size, n_vocab}, 0);
  4170. // output
  4171. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {hidden_size, n_vocab}, TENSOR_NOT_REQUIRED);
  4172. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {hidden_size}, 0);
  4173. // if output is NULL, init from the input tok embed
  4174. if (output == NULL) {
  4175. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {hidden_size, n_vocab}, TENSOR_DUPLICATED);
  4176. }
  4177. for (int i = 0; i < n_layer; ++i) {
  4178. auto & layer = layers[i];
  4179. /*SSM LAYERS*/
  4180. // ssm in
  4181. layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), {hidden_size, ssm_projection_size}, 0);
  4182. // ssm 1d conv
  4183. layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {ssm_conv_kernel_size, ssm_conv_dim}, 0);
  4184. layer.ssm_conv1d_b = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "bias", i), {ssm_conv_dim}, TENSOR_NOT_REQUIRED);
  4185. // ssm_dt
  4186. layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {ssm_num_heads}, 0);
  4187. // no "weight" suffix for these
  4188. layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {1, ssm_num_heads}, 0);
  4189. layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {1, ssm_num_heads}, 0);
  4190. // ssm_norm
  4191. layer.ssm_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", i), {ssm_intermediate_size / ssm_n_groups, ssm_n_groups}, TENSOR_NOT_REQUIRED);
  4192. // out_proj
  4193. layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {ssm_intermediate_size, hidden_size}, 0);
  4194. /*ATTENTION LAYERS*/
  4195. // attention layers (with optional bias)
  4196. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {hidden_size, n_embd_head_k * attn_num_attention_head}, 0);
  4197. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {hidden_size, attn_num_key_value_head * n_embd_head_k}, 0);
  4198. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {hidden_size, attn_num_key_value_head * n_embd_head_v}, 0);
  4199. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * attn_num_attention_head, hidden_size}, 0);
  4200. layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {hidden_size}, TENSOR_NOT_REQUIRED);
  4201. layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {attn_num_key_value_head * n_embd_head_k}, TENSOR_NOT_REQUIRED);
  4202. layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {attn_num_key_value_head * n_embd_head_v}, TENSOR_NOT_REQUIRED);
  4203. layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {hidden_size}, TENSOR_NOT_REQUIRED);
  4204. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {hidden_size}, 0);
  4205. // feed forward (w/ optional biases)
  4206. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, i), {hidden_size}, 0);
  4207. layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
  4208. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {hidden_size, ffn_intermediate_size}, 0);
  4209. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { ffn_intermediate_size, hidden_size}, 0);
  4210. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {hidden_size, ffn_intermediate_size}, 0);
  4211. layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {ffn_intermediate_size}, TENSOR_NOT_REQUIRED);
  4212. layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {hidden_size}, TENSOR_NOT_REQUIRED);
  4213. layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {ffn_intermediate_size}, TENSOR_NOT_REQUIRED);
  4214. }
  4215. } break;
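// FALCON_H1 is a hybrid: every layer carries both a mamba2-style SSM block and a regular
// attention block. The SSM input projection packs all SSM streams into one matmul, hence
// ssm_projection_size = d_inner + (d_inner + 2*groups*state) + num_heads. For example
// (hypothetical sizes, for illustration only): d_inner = 4096, groups = 1, state = 128,
// heads = 64 gives conv_dim = 4096 + 2*1*128 = 4352 and projection = 4096 + 4352 + 64 = 8512.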
  4216. case LLM_ARCH_HUNYUAN_MOE:
  4217. {
  4218. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  4219. // output
  4220. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  4221. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  4222. // if output is NULL, init from the input tok embed
  4223. if (output == NULL) {
  4224. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  4225. }
  4226. for (int i = 0; i < n_layer; ++i) {
  4227. auto & layer = layers[i];
  4228. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  4229. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
  4230. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  4231. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
  4232. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
  4233. layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
  4234. layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
  4235. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  4236. layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
  4237. layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
  4238. layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert}, 0);
  4239. layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
  4240. layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, 0);
  4241. layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, 0);
  4242. layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {hparams.n_ff_shexp, n_embd}, 0);
  4243. }
  4244. } break;
  4245. case LLM_ARCH_SMOLLM3:
  4246. {
  4247. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  4248. // output
  4249. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  4250. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  4251. // if output is NULL, init from the input tok embed
  4252. if (output == NULL) {
  4253. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  4254. }
  4255. for (int i = 0; i < n_layer; ++i) {
  4256. auto & layer = layers[i];
  4257. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  4258. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
  4259. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  4260. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
  4261. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
  4262. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  4263. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  4264. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  4265. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  4266. }
  4267. } break;
  4268. case LLM_ARCH_LFM2:
  4269. {
  4270. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  4271. tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0);
  4272. for (int i = 0; i < n_layer; ++i) {
  4273. auto & layer = layers[i];
4274. // the ffn is the same for transformer and conv layers
  4275. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  4276. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  4277. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  4278. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  4279. // for operator_norm
  4280. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  4281. if (!hparams.is_recurrent(i)) {
  4282. layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
  4283. layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
  4284. GGML_ASSERT(n_embd_v_gqa == n_embd_k_gqa);
  4285. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  4286. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, hparams.n_embd_k_gqa(i)}, 0);
  4287. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, hparams.n_embd_v_gqa(i)}, 0);
  4288. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  4289. } else {
  4290. layer.shortconv.conv = create_tensor(tn(LLM_TENSOR_SHORTCONV_CONV, "weight", i), {hparams.n_shortconv_l_cache, n_embd}, 0);
  4291. layer.shortconv.in_proj = create_tensor(tn(LLM_TENSOR_SHORTCONV_INPROJ, "weight", i), {n_embd, 3 * n_embd}, 0);
  4292. layer.shortconv.out_proj = create_tensor(tn(LLM_TENSOR_SHORTCONV_OUTPROJ, "weight", i), {n_embd, n_embd}, 0);
  4293. }
  4294. }
  4295. } break;
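// LFM2 mixes layer types: hparams.is_recurrent(i) selects between a short-convolution block
// (conv kernel of length n_shortconv_l_cache, an in_proj to 3*n_embd and an out_proj) and a
// regular attention block with per-head q/k norms; the gated FFN is shared by both kinds.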
  4296. default:
  4297. throw std::runtime_error("unknown architecture");
  4298. }
  4299. if (n_moved_tensors > 0) {
  4300. LLAMA_LOG_DEBUG("%s: tensor '%s' (%s) (and %d others) cannot be used with preferred buffer type %s, using %s instead\n",
  4301. __func__, first_moved_tensor->name, ggml_type_name(first_moved_tensor->type), n_moved_tensors - 1,
  4302. ggml_backend_buft_name(first_moved_from_buft), ggml_backend_buft_name(first_moved_to_buft));
  4303. }
  4304. }
  4305. ml.done_getting_tensors();
  4306. ml.init_mappings(true, use_mlock ? &pimpl->mlock_mmaps : nullptr);
  4307. pimpl->mappings.reserve(ml.mappings.size());
  4308. // create the backend buffers
  4309. std::vector<std::pair<ggml_context *, llama_buf_map>> ctx_bufs;
  4310. ctx_bufs.reserve(ctx_map.size());
  4311. // Ensure we have enough capacity for the maximum backend buffer we will potentially create
  4312. const size_t n_max_backend_buffer = ctx_map.size() * ml.files.size();
  4313. pimpl->bufs.reserve(n_max_backend_buffer);
  4314. for (auto & it : ctx_map) {
  4315. ggml_backend_buffer_type_t buft = it.first;
  4316. ggml_context * ctx = it.second;
  4317. // skip contexts without tensors
  4318. if (ggml_get_first_tensor(ctx) == nullptr) {
  4319. continue;
  4320. }
  4321. llama_buf_map buf_map;
  4322. buf_map.reserve(n_max_backend_buffer);
  4323. // check if it is possible to use buffer_from_host_ptr with this buffer type
  4324. ggml_backend_dev_t dev = ggml_backend_buft_get_device(buft);
  4325. if (!dev) {
  4326. // FIXME: workaround for CPU backend buft having a NULL device
  4327. dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
  4328. if (!dev) {
  4329. throw std::runtime_error(format("%s: no CPU backend found", __func__));
  4330. }
  4331. }
  4332. ggml_backend_dev_props props;
  4333. ggml_backend_dev_get_props(dev, &props);
  4334. bool buffer_from_host_ptr_supported = props.caps.buffer_from_host_ptr;
  4335. bool is_default_buft = buft == ggml_backend_dev_buffer_type(dev);
  4336. if (ml.use_mmap && use_mmap_buffer && buffer_from_host_ptr_supported && is_default_buft) {
  4337. for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
  4338. // only the mmap region containing the tensors in the model is mapped to the backend buffer
  4339. // this is important for metal with apple silicon: if the entire model could be mapped to a metal buffer, then we could just use metal for all layers
  4340. // this allows using partial offloading when the model size exceeds the metal buffer size, but not the RAM size
  4341. void * addr = nullptr;
  4342. size_t first, last; // NOLINT
  4343. ml.get_mapping_range(&first, &last, &addr, idx, ctx);
  4344. if (first >= last) {
  4345. continue;
  4346. }
  4347. const size_t max_size = ggml_get_max_tensor_size(ctx);
  4348. ggml_backend_buffer_t buf = ggml_backend_dev_buffer_from_host_ptr(dev, (char *) addr + first, last - first, max_size);
  4349. if (buf == nullptr) {
  4350. throw std::runtime_error(format("unable to allocate %s buffer", ggml_backend_buft_name(buft)));
  4351. }
  4352. pimpl->bufs.emplace_back(buf);
  4353. buf_map.emplace(idx, buf);
  4354. }
  4355. }
  4356. else {
  4357. ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
  4358. if (buf == nullptr) {
  4359. throw std::runtime_error(format("unable to allocate %s buffer", ggml_backend_buft_name(buft)));
  4360. }
  4361. pimpl->bufs.emplace_back(buf);
  4362. if (use_mlock && ggml_backend_buffer_is_host(buf)) {
  4363. pimpl->mlock_bufs.emplace_back(new llama_mlock);
  4364. auto & mlock_buf = pimpl->mlock_bufs.back();
  4365. mlock_buf->init (ggml_backend_buffer_get_base(buf));
  4366. mlock_buf->grow_to(ggml_backend_buffer_get_size(buf));
  4367. }
  4368. for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
  4369. buf_map.emplace(idx, buf);
  4370. }
  4371. }
  4372. if (pimpl->bufs.empty()) {
  4373. throw std::runtime_error("failed to allocate buffer");
  4374. }
  4375. for (auto & buf : buf_map) {
  4376. // indicate that this buffer contains weights
  4377. // this is used by ggml_backend_sched to improve op scheduling: ops that use a weight are preferably scheduled to the backend that contains the weight
  4378. ggml_backend_buffer_set_usage(buf.second, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
  4379. }
  4380. ctx_bufs.emplace_back(ctx, buf_map);
  4381. }
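// Two allocation paths were used above for each (buffer type, context) pair:
//  - if mmap is enabled, the buffer type is the device default and the device supports
//    buffer_from_host_ptr, the mapped file region is wrapped directly as a backend buffer
//    (one per input file), so tensor data is never copied;
//  - otherwise a backend buffer is allocated for the context and, when requested and host
//    resident, pinned with mlock.
// Either way the buffers are tagged GGML_BACKEND_BUFFER_USAGE_WEIGHTS so the scheduler
// prefers running ops on the backend that already holds the weights.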
  4382. if (llama_supports_gpu_offload()) {
  4383. const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
  4384. LLAMA_LOG_INFO("%s: offloading %d repeating layers to GPU\n", __func__, n_gpu);
  4385. if (n_gpu_layers > (int) hparams.n_layer) {
  4386. LLAMA_LOG_INFO("%s: offloading output layer to GPU\n", __func__);
  4387. }
  4388. const int max_backend_supported_layers = hparams.n_layer + 1;
  4389. const int max_offloadable_layers = hparams.n_layer + 1;
  4390. LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n", __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers);
  4391. }
  4392. // print memory requirements per buffer type
  4393. for (auto & buf : pimpl->bufs) {
  4394. LLAMA_LOG_INFO("%s: %12s model buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf.get()), ggml_backend_buffer_get_size(buf.get()) / 1024.0 / 1024.0);
  4395. }
  4396. // populate tensors_by_name
  4397. for (auto & ctx : pimpl->ctxs) {
  4398. for (auto * cur = ggml_get_first_tensor(ctx.get()); cur != NULL; cur = ggml_get_next_tensor(ctx.get(), cur)) {
  4399. tensors_by_name.emplace_back(ggml_get_name(cur), cur);
  4400. }
  4401. }
  4402. // load tensor data
  4403. for (auto & it : ctx_bufs) {
  4404. ggml_context * ctx = it.first;
  4405. auto & bufs = it.second;
  4406. if (!ml.load_all_data(ctx, bufs, use_mlock ? &pimpl->mlock_mmaps : NULL, params.progress_callback, params.progress_callback_user_data)) {
  4407. return false;
  4408. }
  4409. }
  4410. if (use_mmap_buffer) {
  4411. for (auto & mapping : ml.mappings) {
  4412. pimpl->mappings.emplace_back(std::move(mapping));
  4413. }
  4414. }
  4415. return true;
  4416. }
  4417. std::string llama_model::arch_name() const {
  4418. return llm_arch_name(arch);
  4419. }
  4420. std::string llama_model::type_name() const {
  4421. return llm_type_name(type);
  4422. }
  4423. std::string llama_model::desc() const {
  4424. return pimpl->desc_str;
  4425. }
  4426. size_t llama_model::size() const {
  4427. return pimpl->n_bytes;
  4428. }
  4429. size_t llama_model::n_tensors() const {
  4430. return tensors_by_name.size();
  4431. }
  4432. size_t llama_model::n_devices() const {
  4433. return devices.size();
  4434. }
  4435. uint64_t llama_model::n_elements() const {
  4436. return pimpl->n_elements;
  4437. }
  4438. void llama_model::print_info() const {
  4439. const std::string rope_scaling_type = llama_rope_scaling_type_name(hparams.rope_scaling_type_train);
  4440. auto print_f = [](const std::function<uint32_t(uint32_t)> & f, uint32_t n) {
  4441. bool is_var = false;
  4442. std::vector<uint32_t> v;
  4443. for (uint32_t i = 0; i < n; ++i) {
  4444. v.push_back(f(i));
  4445. if (v[i] != v[0]) {
  4446. is_var = true;
  4447. }
  4448. }
  4449. std::stringstream ss;
  4450. if (is_var) {
  4451. ss << "[";
  4452. for (uint32_t i = 0; i < n; ++i) {
  4453. ss << v[i];
  4454. if (i < n - 1) {
  4455. ss << ", ";
  4456. }
  4457. }
  4458. ss << "]";
  4459. } else {
  4460. ss << v[0];
  4461. }
  4462. return ss.str();
  4463. };
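// print_f() evaluates a per-layer hparam for every layer and prints a single scalar when the
// value is constant across layers, or a bracketed per-layer list otherwise, e.g.
// (illustrative output):
//   n_head    = 32
//   n_head_kv = [8, 8, 4, 8]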
  4464. // hparams
  4465. LLAMA_LOG_INFO("%s: arch = %s\n", __func__, arch_name().c_str());
  4466. LLAMA_LOG_INFO("%s: vocab_only = %d\n", __func__, hparams.vocab_only);
  4467. if (!hparams.vocab_only) {
  4468. LLAMA_LOG_INFO("%s: n_ctx_train = %u\n", __func__, hparams.n_ctx_train);
  4469. LLAMA_LOG_INFO("%s: n_embd = %u\n", __func__, hparams.n_embd);
  4470. LLAMA_LOG_INFO("%s: n_layer = %u\n", __func__, hparams.n_layer);
  4471. LLAMA_LOG_INFO("%s: n_head = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_head(il); }, hparams.n_layer).c_str());
  4472. LLAMA_LOG_INFO("%s: n_head_kv = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_head_kv(il); }, hparams.n_layer).c_str());
  4473. LLAMA_LOG_INFO("%s: n_rot = %u\n", __func__, hparams.n_rot);
  4474. LLAMA_LOG_INFO("%s: n_swa = %u\n", __func__, hparams.n_swa);
  4475. LLAMA_LOG_INFO("%s: is_swa_any = %u\n", __func__, hparams.is_swa_any());
  4476. LLAMA_LOG_INFO("%s: n_embd_head_k = %u\n", __func__, hparams.n_embd_head_k);
  4477. LLAMA_LOG_INFO("%s: n_embd_head_v = %u\n", __func__, hparams.n_embd_head_v);
  4478. LLAMA_LOG_INFO("%s: n_gqa = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_gqa(il); }, hparams.n_layer).c_str());
  4479. LLAMA_LOG_INFO("%s: n_embd_k_gqa = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_embd_k_gqa(il); }, hparams.n_layer).c_str());
  4480. LLAMA_LOG_INFO("%s: n_embd_v_gqa = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_embd_v_gqa(il); }, hparams.n_layer).c_str());
  4481. LLAMA_LOG_INFO("%s: f_norm_eps = %.1e\n", __func__, hparams.f_norm_eps);
  4482. LLAMA_LOG_INFO("%s: f_norm_rms_eps = %.1e\n", __func__, hparams.f_norm_rms_eps);
  4483. LLAMA_LOG_INFO("%s: f_clamp_kqv = %.1e\n", __func__, hparams.f_clamp_kqv);
  4484. LLAMA_LOG_INFO("%s: f_max_alibi_bias = %.1e\n", __func__, hparams.f_max_alibi_bias);
  4485. LLAMA_LOG_INFO("%s: f_logit_scale = %.1e\n", __func__, hparams.f_logit_scale);
  4486. LLAMA_LOG_INFO("%s: f_attn_scale = %.1e\n", __func__, hparams.f_attention_scale);
  4487. LLAMA_LOG_INFO("%s: n_ff = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_ff(il); }, hparams.n_layer).c_str());
  4488. LLAMA_LOG_INFO("%s: n_expert = %u\n", __func__, hparams.n_expert);
  4489. LLAMA_LOG_INFO("%s: n_expert_used = %u\n", __func__, hparams.n_expert_used);
  4490. LLAMA_LOG_INFO("%s: causal attn = %d\n", __func__, hparams.causal_attn);
  4491. LLAMA_LOG_INFO("%s: pooling type = %d\n", __func__, hparams.pooling_type);
  4492. LLAMA_LOG_INFO("%s: rope type = %d\n", __func__, hparams.rope_type);
  4493. LLAMA_LOG_INFO("%s: rope scaling = %s\n", __func__, rope_scaling_type.c_str());
  4494. LLAMA_LOG_INFO("%s: freq_base_train = %.1f\n", __func__, hparams.rope_freq_base_train);
  4495. LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train);
  4496. LLAMA_LOG_INFO("%s: n_ctx_orig_yarn = %u\n", __func__, hparams.n_ctx_orig_yarn);
  4497. LLAMA_LOG_INFO("%s: rope_finetuned = %s\n", __func__, hparams.rope_finetuned ? "yes" : "unknown");
  4498. if (!classifier_labels.empty()) {
  4499. LLAMA_LOG_INFO("%s: n_cls_out = %u\n", __func__, hparams.n_cls_out);
  4500. size_t i = 0;
  4501. for (auto label : classifier_labels) {
  4502. LLAMA_LOG_INFO("%s: cls_label[%2zu] = %s\n", __func__, i++, label.c_str());
  4503. }
  4504. }
  4505. }
  4506. if (arch == LLM_ARCH_MAMBA ||
  4507. arch == LLM_ARCH_MAMBA2 ||
  4508. arch == LLM_ARCH_JAMBA ||
  4509. arch == LLM_ARCH_FALCON_H1 ||
  4510. arch == LLM_ARCH_PLAMO2 ||
  4511. arch == LLM_ARCH_GRANITE_HYBRID) {
  4512. LLAMA_LOG_INFO("%s: ssm_d_conv = %u\n", __func__, hparams.ssm_d_conv);
  4513. LLAMA_LOG_INFO("%s: ssm_d_inner = %u\n", __func__, hparams.ssm_d_inner);
  4514. LLAMA_LOG_INFO("%s: ssm_d_state = %u\n", __func__, hparams.ssm_d_state);
  4515. LLAMA_LOG_INFO("%s: ssm_dt_rank = %u\n", __func__, hparams.ssm_dt_rank);
  4516. LLAMA_LOG_INFO("%s: ssm_n_group = %u\n", __func__, hparams.ssm_n_group);
  4517. LLAMA_LOG_INFO("%s: ssm_dt_b_c_rms = %d\n", __func__, hparams.ssm_dt_b_c_rms);
  4518. }
  4519. LLAMA_LOG_INFO("%s: model type = %s\n", __func__, type_name().c_str());
  4520. if (pimpl->n_elements >= 1e12) {
  4521. LLAMA_LOG_INFO("%s: model params = %.2f T\n", __func__, pimpl->n_elements*1e-12);
  4522. } else if (pimpl->n_elements >= 1e9) {
  4523. LLAMA_LOG_INFO("%s: model params = %.2f B\n", __func__, pimpl->n_elements*1e-9);
  4524. } else if (pimpl->n_elements >= 1e6) {
  4525. LLAMA_LOG_INFO("%s: model params = %.2f M\n", __func__, pimpl->n_elements*1e-6);
  4526. } else {
  4527. LLAMA_LOG_INFO("%s: model params = %.2f K\n", __func__, pimpl->n_elements*1e-3);
  4528. }
  4529. // general kv
  4530. LLAMA_LOG_INFO("%s: general.name = %s\n", __func__, name.c_str());
  4531. if (arch == LLM_ARCH_DEEPSEEK) {
  4532. LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
  4533. LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
  4534. LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
  4535. LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
  4536. }
  4537. if (arch == LLM_ARCH_DEEPSEEK2) {
  4538. LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
  4539. LLAMA_LOG_INFO("%s: n_lora_q = %d\n", __func__, hparams.n_lora_q);
  4540. LLAMA_LOG_INFO("%s: n_lora_kv = %d\n", __func__, hparams.n_lora_kv);
  4541. LLAMA_LOG_INFO("%s: n_embd_head_k_mla = %d\n", __func__, hparams.n_embd_head_k_mla);
  4542. LLAMA_LOG_INFO("%s: n_embd_head_v_mla = %d\n", __func__, hparams.n_embd_head_v_mla);
  4543. LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
  4544. LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
  4545. LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
  4546. LLAMA_LOG_INFO("%s: expert_weights_norm = %d\n", __func__, hparams.expert_weights_norm);
  4547. LLAMA_LOG_INFO("%s: expert_gating_func = %s\n", __func__, llama_expert_gating_func_name((llama_expert_gating_func_type) hparams.expert_gating_func));
  4548. LLAMA_LOG_INFO("%s: rope_yarn_log_mul = %.4f\n", __func__, hparams.rope_yarn_log_mul);
  4549. }
  4550. if (arch == LLM_ARCH_QWEN2MOE) {
  4551. LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
  4552. LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
  4553. }
  4554. if (arch == LLM_ARCH_QWEN3MOE) {
  4555. LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
  4556. }
  4557. if (arch == LLM_ARCH_MINICPM ||
  4558. arch == LLM_ARCH_GRANITE ||
  4559. arch == LLM_ARCH_GRANITE_MOE ||
  4560. arch == LLM_ARCH_GRANITE_HYBRID) {
  4561. LLAMA_LOG_INFO("%s: f_embedding_scale = %f\n", __func__, hparams.f_embedding_scale);
  4562. LLAMA_LOG_INFO("%s: f_residual_scale = %f\n", __func__, hparams.f_residual_scale);
  4563. LLAMA_LOG_INFO("%s: f_attention_scale = %f\n", __func__, hparams.f_attention_scale);
  4564. LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
  4565. }
  4566. if (arch == LLM_ARCH_BAILINGMOE) {
  4567. LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
  4568. LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
  4569. LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
  4570. LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
  4571. LLAMA_LOG_INFO("%s: expert_weights_norm = %d\n", __func__, hparams.expert_weights_norm);
  4572. }
  4573. vocab.print_info();
  4574. }
  4575. ggml_backend_dev_t llama_model::dev_layer(int il) const {
  4576. return pimpl->dev_layer.at(il).dev;
  4577. }
  4578. ggml_backend_dev_t llama_model::dev_output() const {
  4579. return pimpl->dev_output.dev;
  4580. }
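// check whether the device can run the op produced by fn when its sources live in a buffer of
// type buft; a zero-size dummy buffer is allocated only so the op sources have a non-null buffer
// to inspect - no tensor data is ever allocated (the context is created with no_alloc = true)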
  4581. template<typename F>
  4582. static bool buft_supported(ggml_backend_buffer_type_t buft, ggml_backend_dev_t dev, F & fn) {
  4583. ggml_init_params params = {
  4584. /*.mem_size =*/ ggml_tensor_overhead()*8,
  4585. /*.mem_buffer =*/ NULL,
  4586. /*.no_alloc =*/ true,
  4587. };
  4588. ggml_context_ptr ctx { ggml_init(params) };
  4589. if (!ctx) {
  4590. throw std::runtime_error(format("failed to create ggml context"));
  4591. }
  4592. ggml_backend_buffer_ptr buf { ggml_backend_buft_alloc_buffer(buft, 0) };
  4593. ggml_tensor * op_tensor = fn(ctx.get());
  4594. for (int i = 0; i < GGML_MAX_SRC; i++) {
  4595. if (op_tensor->src[i] != nullptr) {
  4596. assert(op_tensor->src[i]->buffer == nullptr);
  4597. op_tensor->src[i]->buffer = buf.get();
  4598. }
  4599. }
  4600. bool op_supported = ggml_backend_dev_supports_op(dev, op_tensor);
  4601. return op_supported;
  4602. }
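// walk the (device, buffer type) list in priority order and return the first buffer type whose
// device supports the op built by fn; throws if none qualifies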
  4603. template<typename F>
  4604. static ggml_backend_buffer_type_t select_buft(const buft_list_t & buft_list, const F & fn) {
  4605. for (const auto & cur : buft_list) {
  4606. ggml_backend_dev_t cur_dev = cur.first;
  4607. ggml_backend_buffer_type_t cur_buft = cur.second;
  4608. if (buft_supported(cur_buft, cur_dev, fn)) {
  4609. return cur_buft;
  4610. }
  4611. }
  4612. throw std::runtime_error(format("no suitable buffer type found"));
  4613. }
  4614. ggml_backend_buffer_type_t llama_model::select_buft(int il) const {
  4615. return ::select_buft(
  4616. *pimpl->dev_layer.at(il).buft_list,
  4617. [&](ggml_context * ctx) {
  4618. ggml_tensor * cur = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hparams.n_embd);
  4619. ggml_tensor * layer_dir = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hparams.n_embd);
  4620. return ggml_add(ctx, cur, layer_dir);
  4621. });
  4622. }
  4623. bool llama_model::has_tensor_overrides() const {
  4624. return pimpl->has_tensor_overrides;
  4625. }
  4626. const ggml_tensor * llama_model::get_tensor(const char * name) const {
  4627. auto it = std::find_if(tensors_by_name.begin(), tensors_by_name.end(),
  4628. [name](const std::pair<std::string, ggml_tensor *> & it) {
  4629. return it.first == name;
  4630. });
  4631. if (it == tensors_by_name.end()) {
  4632. return nullptr;
  4633. }
  4634. return it->second;
  4635. }
  4636. float llama_model::get_rope_freq_base (const llama_cparams & cparams, int il) const {
  4637. return hparams.is_swa(il) ? hparams.rope_freq_base_train_swa : cparams.rope_freq_base;
  4638. }
  4639. float llama_model::get_rope_freq_scale(const llama_cparams & cparams, int il) const {
  4640. return hparams.is_swa(il) ? hparams.rope_freq_scale_train_swa : cparams.rope_freq_scale;
  4641. }
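// per-layer RoPE frequency factors: an explicit rope_freqs tensor takes precedence; otherwise the
// long/short factors are selected by comparing the per-sequence context size to n_ctx_orig_yarn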
  4642. ggml_tensor * llama_model::get_rope_factors(const llama_cparams & cparams, int il) const {
  4643. const uint32_t n_ctx_per_seq = cparams.n_ctx / cparams.n_seq_max;
  4644. // choose long/short freq factors based on the context size
  4645. if (layers[il].rope_freqs != nullptr) {
  4646. return layers[il].rope_freqs;
  4647. }
  4648. if (n_ctx_per_seq > hparams.n_ctx_orig_yarn) {
  4649. return layers[il].rope_long;
  4650. }
  4651. return layers[il].rope_short;
  4652. }
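// graph builders: each llm_build_* struct below constructs the forward compute graph for one
// architecture in its constructor, using the llm_graph_context helpers (build_inp_embd,
// build_attn, build_ffn, build_norm, ...); the resulting tensors are exposed through
// res->t_embd / res->t_logits and the graph is finalized with ggml_build_forward_expand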
  4653. struct llm_build_llama : public llm_graph_context {
  4654. llm_build_llama(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  4655. const int64_t n_embd_head = hparams.n_embd_head_v;
  4656. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  4657. GGML_ASSERT(n_embd_head == hparams.n_rot);
  4658. ggml_tensor * cur;
  4659. ggml_tensor * inpL;
  4660. inpL = build_inp_embd(model.tok_embd);
  4661. // inp_pos - contains the positions
  4662. ggml_tensor * inp_pos = build_inp_pos();
  4663. auto * inp_attn = build_attn_inp_kv_unified();
  4664. const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
  4665. ggml_tensor * inp_out_ids = build_inp_out_ids();
  4666. for (int il = 0; il < n_layer; ++il) {
  4667. ggml_tensor * inpSA = inpL;
  4668. // norm
  4669. cur = build_norm(inpL,
  4670. model.layers[il].attn_norm, NULL,
  4671. LLM_NORM_RMS, il);
  4672. cb(cur, "attn_norm", il);
  4673. // self-attention
  4674. {
  4675. // rope freq factors for llama3; may return nullptr for llama2 and other models
  4676. ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
  4677. // compute Q and K and RoPE them
  4678. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  4679. cb(Qcur, "Qcur", il);
  4680. if (model.layers[il].bq) {
  4681. Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
  4682. cb(Qcur, "Qcur", il);
  4683. }
  4684. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  4685. cb(Kcur, "Kcur", il);
  4686. if (model.layers[il].bk) {
  4687. Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
  4688. cb(Kcur, "Kcur", il);
  4689. }
  4690. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  4691. cb(Vcur, "Vcur", il);
  4692. if (model.layers[il].bv) {
  4693. Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
  4694. cb(Vcur, "Vcur", il);
  4695. }
  4696. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  4697. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  4698. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  4699. Qcur = ggml_rope_ext(
  4700. ctx0, Qcur, inp_pos, rope_factors,
  4701. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  4702. ext_factor, attn_factor, beta_fast, beta_slow
  4703. );
  4704. Kcur = ggml_rope_ext(
  4705. ctx0, Kcur, inp_pos, rope_factors,
  4706. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  4707. ext_factor, attn_factor, beta_fast, beta_slow
  4708. );
  4709. cb(Qcur, "Qcur", il);
  4710. cb(Kcur, "Kcur", il);
  4711. cb(Vcur, "Vcur", il);
  4712. cur = build_attn(inp_attn,
  4713. model.layers[il].wo, model.layers[il].bo,
  4714. Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
  4715. cb(cur, "attn_out", il);
  4716. }
  4717. if (il == n_layer - 1 && inp_out_ids) {
  4718. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  4719. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  4720. }
  4721. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  4722. cb(ffn_inp, "ffn_inp", il);
  4723. // feed-forward network (non-MoE)
  4724. if (model.layers[il].ffn_gate_inp == nullptr) {
  4725. cur = build_norm(ffn_inp,
  4726. model.layers[il].ffn_norm, NULL,
  4727. LLM_NORM_RMS, il);
  4728. cb(cur, "ffn_norm", il);
  4729. cur = build_ffn(cur,
  4730. model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
  4731. model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
  4732. model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
  4733. NULL,
  4734. LLM_FFN_SILU, LLM_FFN_PAR, il);
  4735. cb(cur, "ffn_out", il);
  4736. } else {
  4737. // MoE branch
  4738. cur = build_norm(ffn_inp,
  4739. model.layers[il].ffn_norm, NULL,
  4740. LLM_NORM_RMS, il);
  4741. cb(cur, "ffn_norm", il);
  4742. cur = build_moe_ffn(cur,
  4743. model.layers[il].ffn_gate_inp,
  4744. model.layers[il].ffn_up_exps,
  4745. model.layers[il].ffn_gate_exps,
  4746. model.layers[il].ffn_down_exps,
  4747. nullptr,
  4748. n_expert, n_expert_used,
  4749. LLM_FFN_SILU, true,
  4750. false, 0.0,
  4751. LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
  4752. il);
  4753. cb(cur, "ffn_moe_out", il);
  4754. }
  4755. cur = ggml_add(ctx0, cur, ffn_inp);
  4756. cb(cur, "ffn_out", il);
  4757. cur = build_cvec(cur, il);
  4758. cb(cur, "l_out", il);
  4759. // input for next layer
  4760. inpL = cur;
  4761. }
  4762. cur = inpL;
  4763. cur = build_norm(cur,
  4764. model.output_norm, NULL,
  4765. LLM_NORM_RMS, -1);
  4766. cb(cur, "result_norm", -1);
  4767. res->t_embd = cur;
  4768. // lm_head
  4769. cur = build_lora_mm(model.output, cur);
  4770. cb(cur, "result_output", -1);
  4771. res->t_logits = cur;
  4772. ggml_build_forward_expand(gf, cur);
  4773. }
  4774. };
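// Llama variant with interleaved sliding-window attention (iSWA): every n_no_rope_layer_step-th
// layer skips RoPE and instead applies the optional attention-scale input (temperature tuning),
// QK RMS norm is applied on the RoPE layers, and MoE layers combine sigmoid-gated routed experts
// with a shared expert (used e.g. by Llama 4 - see the Llama4TextL2Norm reference below)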
  4775. struct llm_build_llama_iswa : public llm_graph_context {
  4776. llm_build_llama_iswa(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  4777. const int64_t n_embd_head = hparams.n_embd_head_v;
  4778. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  4779. GGML_ASSERT(n_embd_head == hparams.n_rot);
  4780. ggml_tensor * cur;
  4781. ggml_tensor * inpL;
  4782. inpL = build_inp_embd(model.tok_embd);
  4783. // inp_pos - contains the positions
  4784. ggml_tensor * inp_pos = build_inp_pos();
  4785. // temperature tuning
  4786. ggml_tensor * inp_attn_scale = nullptr;
  4787. inp_attn_scale = build_inp_attn_scale();
  4788. auto * inp_attn = build_attn_inp_kv_unified_iswa();
  4789. const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
  4790. ggml_tensor * inp_out_ids = build_inp_out_ids();
  4791. for (int il = 0; il < n_layer; ++il) {
  4792. ggml_tensor * inpSA = inpL;
  4793. const bool use_rope = (il + 1) % hparams.n_no_rope_layer_step != 0;
  4794. // norm
  4795. cur = build_norm(inpL,
  4796. model.layers[il].attn_norm, NULL,
  4797. LLM_NORM_RMS, il);
  4798. cb(cur, "attn_norm", il);
  4799. // self-attention
  4800. {
  4801. // rope freq factors for llama3; may return nullptr for llama2 and other models
  4802. ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
  4803. // compute Q and K and RoPE them
  4804. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  4805. cb(Qcur, "Qcur", il);
  4806. if (model.layers[il].bq) {
  4807. Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
  4808. cb(Qcur, "Qcur", il);
  4809. }
  4810. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  4811. cb(Kcur, "Kcur", il);
  4812. if (model.layers[il].bk) {
  4813. Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
  4814. cb(Kcur, "Kcur", il);
  4815. }
  4816. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  4817. cb(Vcur, "Vcur", il);
  4818. if (model.layers[il].bv) {
  4819. Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
  4820. cb(Vcur, "Vcur", il);
  4821. }
  4822. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  4823. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  4824. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  4825. if (use_rope) {
  4826. Qcur = ggml_rope_ext(
  4827. ctx0, Qcur, inp_pos, rope_factors,
  4828. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  4829. ext_factor, attn_factor, beta_fast, beta_slow
  4830. );
  4831. Kcur = ggml_rope_ext(
  4832. ctx0, Kcur, inp_pos, rope_factors,
  4833. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  4834. ext_factor, attn_factor, beta_fast, beta_slow
  4835. );
  4836. } else if (inp_attn_scale) {
  4837. Qcur = ggml_mul(ctx0, Qcur, inp_attn_scale);
  4838. }
  4839. cb(Qcur, "Qcur", il);
  4840. cb(Kcur, "Kcur", il);
  4841. cb(Vcur, "Vcur", il);
  4842. if (use_rope && hparams.use_kq_norm) {
  4843. // Llama4TextL2Norm
  4844. Qcur = ggml_rms_norm(ctx0, Qcur, hparams.f_norm_rms_eps);
  4845. Kcur = ggml_rms_norm(ctx0, Kcur, hparams.f_norm_rms_eps);
  4846. cb(Qcur, "Qcur_normed", il);
  4847. cb(Kcur, "Kcur_normed", il);
  4848. }
  4849. cur = build_attn(inp_attn,
  4850. model.layers[il].wo, model.layers[il].bo,
  4851. Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
  4852. cb(cur, "attn_out", il);
  4853. }
  4854. if (il == n_layer - 1 && inp_out_ids) {
  4855. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  4856. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  4857. }
  4858. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  4859. cb(ffn_inp, "ffn_inp", il);
  4860. // feed-forward network (non-MoE)
  4861. if (model.layers[il].ffn_gate_inp == nullptr) {
  4862. cur = build_norm(ffn_inp,
  4863. model.layers[il].ffn_norm, NULL,
  4864. LLM_NORM_RMS, il);
  4865. cb(cur, "ffn_norm", il);
  4866. cur = build_ffn(cur,
  4867. model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
  4868. model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
  4869. model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
  4870. NULL,
  4871. LLM_FFN_SILU, LLM_FFN_PAR, il);
  4872. cb(cur, "ffn_out", il);
  4873. } else {
  4874. ggml_tensor * ffn_inp_normed = build_norm(ffn_inp,
  4875. model.layers[il].ffn_norm, NULL,
  4876. LLM_NORM_RMS, il);
4877. cb(ffn_inp_normed, "ffn_norm", il);
  4878. ggml_tensor * moe_out = build_moe_ffn(ffn_inp_normed,
  4879. model.layers[il].ffn_gate_inp,
  4880. model.layers[il].ffn_up_exps,
  4881. model.layers[il].ffn_gate_exps,
  4882. model.layers[il].ffn_down_exps,
  4883. nullptr,
  4884. n_expert, n_expert_used,
  4885. LLM_FFN_SILU, false,
  4886. false, 0.0,
  4887. LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID,
  4888. il);
  4889. // Shared experts
  4890. ggml_tensor * shexp_out = build_ffn(ffn_inp_normed,
  4891. model.layers[il].ffn_up_shexp, NULL, NULL,
  4892. model.layers[il].ffn_gate_shexp, NULL, NULL,
  4893. model.layers[il].ffn_down_shexp, NULL, NULL,
  4894. NULL,
  4895. LLM_FFN_SILU, LLM_FFN_PAR, il);
  4896. cb(shexp_out, "ffn_moe_shexp", il);
  4897. cur = ggml_add(ctx0, moe_out, shexp_out);
  4898. cb(cur, "ffn_moe_out_merged", il);
  4899. }
  4900. cur = ggml_add(ctx0, cur, ffn_inp);
  4901. cb(cur, "ffn_out", il);
  4902. cur = build_cvec(cur, il);
  4903. cb(cur, "l_out", il);
  4904. // input for next layer
  4905. inpL = cur;
  4906. }
  4907. cur = inpL;
  4908. cur = build_norm(cur,
  4909. model.output_norm, NULL,
  4910. LLM_NORM_RMS, -1);
  4911. cb(cur, "result_norm", -1);
  4912. res->t_embd = cur;
  4913. // lm_head
  4914. cur = build_lora_mm(model.output, cur);
  4915. cb(cur, "result_output", -1);
  4916. res->t_logits = cur;
  4917. ggml_build_forward_expand(gf, cur);
  4918. }
  4919. };
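// Deci/Nemotron builder: individual layers may be attention-free (n_head == 0), use only a
// "linear attention" projection (n_head_kv == 0), or skip the FFN entirely (n_ff == 0), as
// produced by NAS-pruned models such as Llama-3_1-Nemotron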
  4920. struct llm_build_deci : public llm_graph_context {
  4921. llm_build_deci(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  4922. const int64_t n_embd_head = hparams.n_embd_head_v;
  4923. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  4924. GGML_ASSERT(n_embd_head == hparams.n_rot);
  4925. ggml_tensor * cur;
  4926. ggml_tensor * inpL;
  4927. inpL = build_inp_embd(model.tok_embd);
  4928. // inp_pos - contains the positions
  4929. ggml_tensor * inp_pos = build_inp_pos();
  4930. auto * inp_attn = build_attn_inp_kv_unified();
  4931. const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
  4932. ggml_tensor * inp_out_ids = build_inp_out_ids();
  4933. for (int il = 0; il < n_layer; ++il) {
  4934. ggml_tensor * inpSA = inpL;
  4935. const int64_t n_head_kv = hparams.n_head_kv(il);
  4936. const int64_t n_head = hparams.n_head(il);
  4937. const int64_t n_ff = hparams.n_ff(il);
  4938. if (n_head == 0) {
  4939. // attention-free layer of Llama-3_1-Nemotron-51B
  4940. cur = inpL;
  4941. } else {
  4942. // norm
  4943. cur = build_norm(inpL,
  4944. model.layers[il].attn_norm, NULL,
  4945. LLM_NORM_RMS, il);
  4946. cb(cur, "attn_norm", il);
  4947. }
  4948. if (n_head > 0 && n_head_kv == 0) {
  4949. // "linear attention" of Llama-3_1-Nemotron-51B
  4950. cur = build_lora_mm(model.layers[il].wo, cur);
  4951. cb(cur, "wo", il);
  4952. } else if (n_head > 0) {
  4953. // self-attention
  4954. // rope freq factors for llama3; may return nullptr for llama2 and other models
  4955. ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
  4956. // compute Q and K and RoPE them
  4957. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  4958. cb(Qcur, "Qcur", il);
  4959. if (model.layers[il].bq) {
  4960. Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
  4961. cb(Qcur, "Qcur", il);
  4962. }
  4963. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  4964. cb(Kcur, "Kcur", il);
  4965. if (model.layers[il].bk) {
  4966. Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
  4967. cb(Kcur, "Kcur", il);
  4968. }
  4969. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  4970. cb(Vcur, "Vcur", il);
  4971. if (model.layers[il].bv) {
  4972. Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
  4973. cb(Vcur, "Vcur", il);
  4974. }
  4975. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  4976. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  4977. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  4978. Qcur = ggml_rope_ext(
  4979. ctx0, Qcur, inp_pos, rope_factors,
  4980. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  4981. ext_factor, attn_factor, beta_fast, beta_slow
  4982. );
  4983. Kcur = ggml_rope_ext(
  4984. ctx0, Kcur, inp_pos, rope_factors,
  4985. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  4986. ext_factor, attn_factor, beta_fast, beta_slow
  4987. );
  4988. cb(Qcur, "Qcur", il);
  4989. cb(Kcur, "Kcur", il);
  4990. cb(Vcur, "Vcur", il);
  4991. cur = build_attn(inp_attn,
  4992. model.layers[il].wo, model.layers[il].bo,
  4993. Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
  4994. }
  4995. if (il == n_layer - 1 && inp_out_ids) {
  4996. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  4997. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  4998. }
  4999. // FFN-free layer of Llama-3_1-Nemotron-Ultra-253B
  5000. if (n_ff == 0) {
  5001. continue;
  5002. }
  5003. // modified to support attention-free layer of Llama-3_1-Nemotron-51B
  5004. ggml_tensor * ffn_inp = cur;
  5005. if (n_head > 0) {
  5006. ffn_inp = ggml_add(ctx0, cur, inpSA);
  5007. cb(ffn_inp, "ffn_inp", il);
  5008. }
  5009. // feed-forward network
  5010. if (model.layers[il].ffn_gate_inp == nullptr) {
  5011. cur = build_norm(ffn_inp,
  5012. model.layers[il].ffn_norm, NULL,
  5013. LLM_NORM_RMS, il);
  5014. cb(cur, "ffn_norm", il);
  5015. cur = build_ffn(cur,
  5016. model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
  5017. model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
  5018. model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
  5019. NULL,
  5020. LLM_FFN_SILU, LLM_FFN_PAR, il);
  5021. cb(cur, "ffn_out", il);
  5022. }
  5023. cur = ggml_add(ctx0, cur, ffn_inp);
  5024. cb(cur, "ffn_out", il);
  5025. cur = build_cvec(cur, il);
  5026. cb(cur, "l_out", il);
  5027. // input for next layer
  5028. inpL = cur;
  5029. }
  5030. cur = inpL;
  5031. cur = build_norm(cur,
  5032. model.output_norm, NULL,
  5033. LLM_NORM_RMS, -1);
  5034. cb(cur, "result_norm", -1);
  5035. res->t_embd = cur;
  5036. // lm_head
  5037. cur = build_lora_mm(model.output, cur);
  5038. cb(cur, "result_output", -1);
  5039. res->t_logits = cur;
  5040. ggml_build_forward_expand(gf, cur);
  5041. }
  5042. };
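// Baichuan: the 7B variant applies RoPE, while the 13B variant applies no rotary embedding here
// (position information is presumably injected through the ALiBi bias inside the attention)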
  5043. struct llm_build_baichuan : public llm_graph_context {
  5044. llm_build_baichuan(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  5045. const int64_t n_embd_head = hparams.n_embd_head_v;
  5046. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  5047. GGML_ASSERT(n_embd_head == hparams.n_rot);
  5048. ggml_tensor * cur;
  5049. ggml_tensor * inpL;
  5050. inpL = build_inp_embd(model.tok_embd);
  5051. // inp_pos - contains the positions
  5052. ggml_tensor * inp_pos = model.type == LLM_TYPE_7B ? build_inp_pos() : nullptr;
  5053. auto * inp_attn = build_attn_inp_kv_unified();
  5054. ggml_tensor * inp_out_ids = build_inp_out_ids();
  5055. for (int il = 0; il < n_layer; ++il) {
  5056. ggml_tensor * inpSA = inpL;
  5057. cur = build_norm(inpL,
  5058. model.layers[il].attn_norm, NULL,
  5059. LLM_NORM_RMS, il);
  5060. cb(cur, "attn_norm", il);
  5061. // self-attention
  5062. {
  5063. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  5064. cb(Qcur, "Qcur", il);
  5065. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  5066. cb(Kcur, "Kcur", il);
  5067. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  5068. cb(Vcur, "Vcur", il);
  5069. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  5070. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  5071. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  5072. switch (model.type) {
  5073. case LLM_TYPE_7B:
  5074. Qcur = ggml_rope_ext(
  5075. ctx0, Qcur, inp_pos, nullptr,
  5076. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  5077. ext_factor, attn_factor, beta_fast, beta_slow
  5078. );
  5079. Kcur = ggml_rope_ext(
  5080. ctx0, Kcur, inp_pos, nullptr,
  5081. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  5082. ext_factor, attn_factor, beta_fast, beta_slow
  5083. );
  5084. break;
  5085. case LLM_TYPE_13B:
  5086. break;
  5087. default:
  5088. GGML_ABORT("fatal error");
  5089. }
  5090. cb(Qcur, "Qcur", il);
  5091. cb(Kcur, "Kcur", il);
  5092. cb(Vcur, "Vcur", il);
  5093. cur = build_attn(inp_attn,
  5094. model.layers[il].wo, NULL,
  5095. Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  5096. }
  5097. if (il == n_layer - 1 && inp_out_ids) {
  5098. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  5099. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  5100. }
  5101. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  5102. cb(ffn_inp, "ffn_inp", il);
  5103. // feed-forward network
  5104. {
  5105. cur = build_norm(ffn_inp,
  5106. model.layers[il].ffn_norm, NULL,
  5107. LLM_NORM_RMS, il);
  5108. cb(cur, "ffn_norm", il);
  5109. cur = build_ffn(cur,
  5110. model.layers[il].ffn_up, NULL, NULL,
  5111. model.layers[il].ffn_gate, NULL, NULL,
  5112. model.layers[il].ffn_down, NULL, NULL,
  5113. NULL,
  5114. LLM_FFN_SILU, LLM_FFN_PAR, il);
  5115. cb(cur, "ffn_out", il);
  5116. }
  5117. cur = ggml_add(ctx0, cur, ffn_inp);
  5118. cur = build_cvec(cur, il);
  5119. cb(cur, "l_out", il);
  5120. // input for next layer
  5121. inpL = cur;
  5122. }
  5123. cur = inpL;
  5124. cur = build_norm(cur,
  5125. model.output_norm, NULL,
  5126. LLM_NORM_RMS, -1);
  5127. cb(cur, "result_norm", -1);
  5128. res->t_embd = cur;
  5129. // lm_head
  5130. cur = build_lora_mm(model.output, cur);
  5131. cb(cur, "result_output", -1);
  5132. res->t_logits = cur;
  5133. ggml_build_forward_expand(gf, cur);
  5134. }
  5135. };
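// XVERSE: a plain Llama-style decoder - RoPE plus a SiLU gated FFN, with no attention biases,
// no rope frequency factors and no MoE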
  5136. struct llm_build_xverse : public llm_graph_context {
  5137. llm_build_xverse(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  5138. const int64_t n_embd_head = hparams.n_embd_head_v;
  5139. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  5140. GGML_ASSERT(n_embd_head == hparams.n_rot);
  5141. ggml_tensor * cur;
  5142. ggml_tensor * inpL;
  5143. inpL = build_inp_embd(model.tok_embd);
  5144. // inp_pos - contains the positions
  5145. ggml_tensor * inp_pos = build_inp_pos();
  5146. auto * inp_attn = build_attn_inp_kv_unified();
  5147. ggml_tensor * inp_out_ids = build_inp_out_ids();
  5148. for (int il = 0; il < n_layer; ++il) {
  5149. ggml_tensor * inpSA = inpL;
  5150. cur = build_norm(inpL,
  5151. model.layers[il].attn_norm, NULL,
  5152. LLM_NORM_RMS, il);
  5153. cb(cur, "attn_norm", il);
  5154. // self-attention
  5155. {
  5156. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  5157. cb(Qcur, "Qcur", il);
  5158. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  5159. cb(Kcur, "Kcur", il);
  5160. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  5161. cb(Vcur, "Vcur", il);
  5162. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  5163. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  5164. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  5165. Qcur = ggml_rope_ext(
  5166. ctx0, Qcur, inp_pos, nullptr,
  5167. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  5168. ext_factor, attn_factor, beta_fast, beta_slow
  5169. );
  5170. Kcur = ggml_rope_ext(
  5171. ctx0, Kcur, inp_pos, nullptr,
  5172. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  5173. ext_factor, attn_factor, beta_fast, beta_slow
  5174. );
  5175. cb(Qcur, "Qcur", il);
  5176. cb(Kcur, "Kcur", il);
  5177. cb(Vcur, "Vcur", il);
  5178. cur = build_attn(inp_attn,
  5179. model.layers[il].wo, NULL,
  5180. Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  5181. }
  5182. if (il == n_layer - 1 && inp_out_ids) {
  5183. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  5184. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  5185. }
  5186. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  5187. cb(ffn_inp, "ffn_inp", il);
  5188. // feed-forward network
  5189. {
  5190. cur = build_norm(ffn_inp,
  5191. model.layers[il].ffn_norm, NULL,
  5192. LLM_NORM_RMS, il);
  5193. cb(cur, "ffn_norm", il);
  5194. cur = build_ffn(cur,
  5195. model.layers[il].ffn_up, NULL, NULL,
  5196. model.layers[il].ffn_gate, NULL, NULL,
  5197. model.layers[il].ffn_down, NULL, NULL,
  5198. NULL,
  5199. LLM_FFN_SILU, LLM_FFN_PAR, il);
  5200. cb(cur, "ffn_out", il);
  5201. }
  5202. cur = ggml_add(ctx0, cur, ffn_inp);
  5203. cur = build_cvec(cur, il);
  5204. cb(cur, "l_out", il);
  5205. // input for next layer
  5206. inpL = cur;
  5207. }
  5208. cur = inpL;
  5209. cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
  5210. cb(cur, "result_norm", -1);
  5211. res->t_embd = cur;
  5212. // lm_head
  5213. cur = build_lora_mm(model.output, cur);
  5214. cb(cur, "result_output", -1);
  5215. res->t_logits = cur;
  5216. ggml_build_forward_expand(gf, cur);
  5217. }
  5218. };
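// Falcon: fused QKV projection split via views, parallel attention/FFN (the FFN reads the
// attention norm rather than the attention output; both results are added to the layer input),
// and a second attention norm for the 40B variant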
  5219. struct llm_build_falcon : public llm_graph_context {
  5220. llm_build_falcon(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  5221. const int64_t n_embd_head = hparams.n_embd_head_v;
  5222. const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
  5223. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  5224. GGML_ASSERT(n_embd_head == hparams.n_rot);
  5225. ggml_tensor * cur;
  5226. ggml_tensor * inpL;
  5227. inpL = build_inp_embd(model.tok_embd);
  5228. // inp_pos - contains the positions
  5229. ggml_tensor * inp_pos = build_inp_pos();
  5230. auto * inp_attn = build_attn_inp_kv_unified();
  5231. ggml_tensor * inp_out_ids = build_inp_out_ids();
  5232. for (int il = 0; il < n_layer; ++il) {
  5233. ggml_tensor * attn_norm;
  5234. attn_norm = build_norm(inpL,
  5235. model.layers[il].attn_norm,
  5236. model.layers[il].attn_norm_b,
  5237. LLM_NORM, il);
  5238. cb(attn_norm, "attn_norm", il);
  5239. // self-attention
  5240. {
  5241. if (model.layers[il].attn_norm_2) {
  5242. // Falcon-40B
  5243. cur = build_norm(inpL,
  5244. model.layers[il].attn_norm_2,
  5245. model.layers[il].attn_norm_2_b,
  5246. LLM_NORM, il);
  5247. cb(cur, "attn_norm_2", il);
  5248. } else {
  5249. cur = attn_norm;
  5250. }
  5251. cur = build_lora_mm(model.layers[il].wqkv, cur);
  5252. cb(cur, "wqkv", il);
  5253. ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
  5254. ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
  5255. ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
  5256. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  5257. // using mode = 2 for neox mode
  5258. Qcur = ggml_rope_ext(
  5259. ctx0, Qcur, inp_pos, nullptr,
  5260. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  5261. ext_factor, attn_factor, beta_fast, beta_slow
  5262. );
  5263. Kcur = ggml_rope_ext(
  5264. ctx0, Kcur, inp_pos, nullptr,
  5265. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  5266. ext_factor, attn_factor, beta_fast, beta_slow
  5267. );
  5268. cb(Qcur, "Qcur", il);
  5269. cb(Kcur, "Kcur", il);
  5270. cb(Vcur, "Vcur", il);
  5271. cur = build_attn(inp_attn,
  5272. model.layers[il].wo, NULL,
  5273. Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  5274. }
  5275. if (il == n_layer - 1 && inp_out_ids) {
  5276. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  5277. inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
  5278. attn_norm = ggml_get_rows(ctx0, attn_norm, inp_out_ids);
  5279. }
  5280. ggml_tensor * ffn_inp = cur;
  5281. // feed forward
  5282. {
  5283. cur = build_ffn(attn_norm, // !! use the attn norm, not the result
  5284. model.layers[il].ffn_up, NULL, NULL,
  5285. NULL, NULL, NULL,
  5286. model.layers[il].ffn_down, NULL, NULL,
  5287. NULL,
  5288. LLM_FFN_GELU, LLM_FFN_SEQ, il);
  5289. cb(cur, "ffn_out", il);
  5290. }
  5291. cur = ggml_add(ctx0, cur, ffn_inp);
  5292. cur = ggml_add(ctx0, cur, inpL);
  5293. cur = build_cvec(cur, il);
  5294. cb(cur, "l_out", il);
  5295. // input for next layer
  5296. inpL = cur;
  5297. }
  5298. cur = inpL;
  5299. // norm
  5300. cur = build_norm(cur,
  5301. model.output_norm,
  5302. model.output_norm_b,
  5303. LLM_NORM, -1);
  5304. cb(cur, "result_norm", -1);
  5305. res->t_embd = cur;
  5306. cur = build_lora_mm(model.output, cur);
  5307. cb(cur, "result_output", -1);
  5308. res->t_logits = cur;
  5309. ggml_build_forward_expand(gf, cur);
  5310. }
  5311. };
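// Grok-1: token embeddings are scaled by embedding_multiplier_scale, the attention and MoE
// outputs have optional extra RMS norms, experts use GELU, and the final logits are scaled by
// output_multiplier_scale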
  5312. struct llm_build_grok : public llm_graph_context {
  5313. llm_build_grok(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  5314. const int64_t n_embd_head = hparams.n_embd_head_v;
  5315. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  5316. GGML_ASSERT(n_embd_head == hparams.n_rot);
  5317. ggml_tensor * cur;
  5318. ggml_tensor * inpL;
  5319. inpL = build_inp_embd(model.tok_embd);
  5320. // multiply by embedding_multiplier_scale of 78.38367176906169
  5321. inpL = ggml_scale(ctx0, inpL, 78.38367176906169f);
  5322. // inp_pos - contains the positions
  5323. ggml_tensor * inp_pos = build_inp_pos();
  5324. auto * inp_attn = build_attn_inp_kv_unified();
  5325. ggml_tensor * inp_out_ids = build_inp_out_ids();
  5326. for (int il = 0; il < n_layer; ++il) {
  5327. ggml_tensor * inpSA = inpL;
  5328. // norm
  5329. cur = build_norm(inpL,
  5330. model.layers[il].attn_norm, NULL,
  5331. LLM_NORM_RMS, il);
  5332. cb(cur, "attn_norm", il);
  5333. // self-attention
  5334. {
  5335. // compute Q and K and RoPE them
  5336. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  5337. cb(Qcur, "Qcur", il);
  5338. if (model.layers[il].bq) {
  5339. Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
  5340. cb(Qcur, "Qcur", il);
  5341. }
  5342. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  5343. cb(Kcur, "Kcur", il);
  5344. if (model.layers[il].bk) {
  5345. Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
  5346. cb(Kcur, "Kcur", il);
  5347. }
  5348. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  5349. cb(Vcur, "Vcur", il);
  5350. if (model.layers[il].bv) {
  5351. Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
  5352. cb(Vcur, "Vcur", il);
  5353. }
  5354. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  5355. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  5356. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  5357. Qcur = ggml_rope_ext(
  5358. ctx0, Qcur, inp_pos, nullptr,
  5359. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  5360. ext_factor, attn_factor, beta_fast, beta_slow
  5361. );
  5362. Kcur = ggml_rope_ext(
  5363. ctx0, Kcur, inp_pos, nullptr,
  5364. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  5365. ext_factor, attn_factor, beta_fast, beta_slow
  5366. );
  5367. cb(Qcur, "Qcur", il);
  5368. cb(Kcur, "Kcur", il);
  5369. cb(Vcur, "Vcur", il);
  5370. cur = build_attn(inp_attn,
  5371. model.layers[il].wo, model.layers[il].bo,
  5372. Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
  5373. }
  5374. if (il == n_layer - 1 && inp_out_ids) {
  5375. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  5376. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  5377. }
  5378. // Grok
  5379. // if attn_out_norm is present then apply it before adding the input
  5380. if (model.layers[il].attn_out_norm) {
  5381. cur = build_norm(cur,
  5382. model.layers[il].attn_out_norm, NULL,
  5383. LLM_NORM_RMS, il);
  5384. cb(cur, "attn_out_norm", il);
  5385. }
  5386. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  5387. cb(ffn_inp, "ffn_inp", il);
  5388. // feed-forward network
  5389. // MoE branch
  5390. cur = build_norm(ffn_inp,
  5391. model.layers[il].ffn_norm, NULL,
  5392. LLM_NORM_RMS, il);
  5393. cb(cur, "ffn_norm", il);
  5394. cur = build_moe_ffn(cur,
  5395. model.layers[il].ffn_gate_inp,
  5396. model.layers[il].ffn_up_exps,
  5397. model.layers[il].ffn_gate_exps,
  5398. model.layers[il].ffn_down_exps,
  5399. nullptr,
  5400. n_expert, n_expert_used,
  5401. LLM_FFN_GELU, true,
  5402. false, 0.0,
  5403. LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
  5404. il);
  5405. cb(cur, "ffn_moe_out", il);
  5406. // Grok
  5407. // if layer_out_norm is present then apply it before adding the input
  5408. // Idea: maybe ffn_out_norm is a better name
  5409. if (model.layers[il].layer_out_norm) {
  5410. cur = build_norm(cur,
  5411. model.layers[il].layer_out_norm, NULL,
  5412. LLM_NORM_RMS, il);
  5413. cb(cur, "layer_out_norm", il);
  5414. }
  5415. cur = ggml_add(ctx0, cur, ffn_inp);
  5416. cb(cur, "ffn_out", il);
  5417. cur = build_cvec(cur, il);
  5418. cb(cur, "l_out", il);
  5419. // input for next layer
  5420. inpL = cur;
  5421. }
  5422. cur = inpL;
  5423. cur = build_norm(cur,
  5424. model.output_norm, NULL,
  5425. LLM_NORM_RMS, -1);
  5426. cb(cur, "result_norm", -1);
  5427. res->t_embd = cur;
  5428. // lm_head
  5429. cur = build_lora_mm(model.output, cur);
  5430. // Grok
  5431. // multiply logits by output_multiplier_scale of 0.5773502691896257
  5432. cur = ggml_scale(ctx0, cur, 0.5773502691896257f);
  5433. cb(cur, "result_output", -1);
  5434. res->t_logits = cur;
  5435. ggml_build_forward_expand(gf, cur);
  5436. }
  5437. };
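// DBRX: fused QKV projection clamped to +/- f_clamp_kqv, non-RMS layer norms, and a SiLU MoE FFN
// on every layer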
  5438. struct llm_build_dbrx : public llm_graph_context {
  5439. llm_build_dbrx(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  5440. const int64_t n_embd_head = hparams.n_embd_head_v;
  5441. const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
  5442. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  5443. GGML_ASSERT(n_embd_head == hparams.n_rot);
  5444. ggml_tensor * cur;
  5445. ggml_tensor * inpL;
  5446. inpL = build_inp_embd(model.tok_embd);
  5447. // inp_pos - contains the positions
  5448. ggml_tensor * inp_pos = build_inp_pos();
  5449. auto * inp_attn = build_attn_inp_kv_unified();
  5450. ggml_tensor * inp_out_ids = build_inp_out_ids();
  5451. for (int il = 0; il < n_layer; ++il) {
  5452. ggml_tensor * inpSA = inpL;
  5453. // norm
  5454. cur = build_norm(inpL,
  5455. model.layers[il].attn_norm, NULL,
  5456. LLM_NORM, il);
  5457. cb(cur, "attn_norm", il);
  5458. // self-attention
  5459. {
  5460. ggml_tensor * Qcur = nullptr;
  5461. ggml_tensor * Kcur = nullptr;
  5462. ggml_tensor * Vcur = nullptr;
  5463. cur = build_lora_mm(model.layers[il].wqkv, cur);
  5464. cb(cur, "wqkv", il);
  5465. cur = ggml_clamp(ctx0, cur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
  5466. cb(cur, "wqkv_clamped", il);
  5467. Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
  5468. Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
  5469. Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
  5470. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  5471. Qcur = ggml_rope_ext(
  5472. ctx0, Qcur, inp_pos, nullptr,
  5473. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  5474. ext_factor, attn_factor, beta_fast, beta_slow
  5475. );
  5476. Kcur = ggml_rope_ext(
  5477. ctx0, Kcur, inp_pos, nullptr,
  5478. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  5479. ext_factor, attn_factor, beta_fast, beta_slow
  5480. );
  5481. cb(Qcur, "Qcur", il);
  5482. cb(Kcur, "Kcur", il);
  5483. cb(Vcur, "Vcur", il);
  5484. cur = build_attn(inp_attn,
  5485. model.layers[il].wo, NULL,
  5486. Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  5487. }
  5488. if (il == n_layer - 1 && inp_out_ids) {
  5489. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  5490. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  5491. }
  5492. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  5493. cb(ffn_inp, "ffn_inp", il);
  5494. // feed-forward network
  5495. // MoE branch
  5496. cur = build_norm(ffn_inp,
  5497. model.layers[il].attn_out_norm, NULL,
  5498. LLM_NORM, il);
  5499. cb(cur, "attn_out_norm", il);
  5500. cur = build_moe_ffn(cur,
  5501. model.layers[il].ffn_gate_inp,
  5502. model.layers[il].ffn_up_exps,
  5503. model.layers[il].ffn_gate_exps,
  5504. model.layers[il].ffn_down_exps,
  5505. nullptr,
  5506. n_expert, n_expert_used,
  5507. LLM_FFN_SILU, true,
  5508. false, 0.0,
  5509. LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
  5510. il);
  5511. cb(cur, "ffn_moe_out", il);
  5512. cur = ggml_add(ctx0, cur, ffn_inp);
  5513. cb(cur, "ffn_out", il);
  5514. cur = build_cvec(cur, il);
  5515. cb(cur, "l_out", il);
  5516. // input for next layer
  5517. inpL = cur;
  5518. }
  5519. cur = inpL;
  5520. cur = build_norm(cur,
  5521. model.output_norm, NULL,
  5522. LLM_NORM, -1);
  5523. cb(cur, "result_norm", -1);
  5524. res->t_embd = cur;
  5525. // lm_head
  5526. cur = build_lora_mm(model.output, cur);
  5527. cb(cur, "result_output", -1);
  5528. res->t_logits = cur;
  5529. ggml_build_forward_expand(gf, cur);
  5530. }
  5531. };
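// StarCoder: learned absolute position embeddings added to the token embeddings, fused QKV with
// bias, sequential GELU FFN, and non-RMS layer norms with bias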
  5532. struct llm_build_starcoder : public llm_graph_context {
  5533. llm_build_starcoder(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  5534. const int64_t n_embd_head = hparams.n_embd_head_v;
  5535. const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
  5536. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  5537. ggml_tensor * cur;
  5538. ggml_tensor * inpL;
  5539. inpL = build_inp_embd(model.tok_embd);
  5540. // inp_pos - contains the positions
  5541. ggml_tensor * inp_pos = build_inp_pos();
  5542. auto * inp_attn = build_attn_inp_kv_unified();
  5543. ggml_tensor * pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos);
  5544. cb(pos, "pos_embd", -1);
  5545. inpL = ggml_add(ctx0, inpL, pos);
  5546. cb(inpL, "inpL", -1);
  5547. ggml_tensor * inp_out_ids = build_inp_out_ids();
  5548. for (int il = 0; il < n_layer; ++il) {
  5549. cur = build_norm(inpL,
  5550. model.layers[il].attn_norm,
  5551. model.layers[il].attn_norm_b,
  5552. LLM_NORM, il);
  5553. cb(cur, "attn_norm", il);
  5554. // self-attention
  5555. {
  5556. cur = build_lora_mm(model.layers[il].wqkv, cur);
  5557. cb(cur, "wqkv", il);
  5558. cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
  5559. cb(cur, "bqkv", il);
  5560. ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
  5561. ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
  5562. ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
  5563. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  5564. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  5565. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  5566. cb(Qcur, "Qcur", il);
  5567. cb(Kcur, "Kcur", il);
  5568. cb(Vcur, "Vcur", il);
  5569. cur = build_attn(inp_attn,
  5570. model.layers[il].wo, model.layers[il].bo,
  5571. Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  5572. }
  5573. if (il == n_layer - 1 && inp_out_ids) {
  5574. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  5575. inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
  5576. }
  5577. // add the input
  5578. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
  5579. cb(ffn_inp, "ffn_inp", il);
  5580. // FF
  5581. {
  5582. cur = build_norm(ffn_inp,
  5583. model.layers[il].ffn_norm,
  5584. model.layers[il].ffn_norm_b,
  5585. LLM_NORM, il);
  5586. cb(cur, "ffn_norm", il);
  5587. cur = build_ffn(cur,
  5588. model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
  5589. NULL, NULL, NULL,
  5590. model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
  5591. NULL,
  5592. LLM_FFN_GELU, LLM_FFN_SEQ, il);
  5593. cb(cur, "ffn_out", il);
  5594. }
  5595. cur = ggml_add(ctx0, cur, ffn_inp);
  5596. cur = build_cvec(cur, il);
  5597. cb(cur, "l_out", il);
  5598. // input for next layer
  5599. inpL = cur;
  5600. }
  5601. cur = build_norm(inpL,
  5602. model.output_norm,
  5603. model.output_norm_b,
  5604. LLM_NORM, -1);
  5605. cb(cur, "result_norm", -1);
  5606. res->t_embd = cur;
  5607. cur = build_lora_mm(model.output, cur);
  5608. cb(cur, "result_output", -1);
  5609. res->t_logits = cur;
  5610. ggml_build_forward_expand(gf, cur);
  5611. }
  5612. };
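// Refact: RMS-norm decoder with a SiLU gated FFN; no RoPE is applied here (positions are
// presumably handled through an ALiBi-style bias inside the attention)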
  5613. struct llm_build_refact : public llm_graph_context {
  5614. llm_build_refact(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  5615. const int64_t n_embd_head = hparams.n_embd_head_v;
  5616. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  5617. ggml_tensor * cur;
  5618. ggml_tensor * inpL;
  5619. inpL = build_inp_embd(model.tok_embd);
  5620. auto * inp_attn = build_attn_inp_kv_unified();
  5621. ggml_tensor * inp_out_ids = build_inp_out_ids();
  5622. for (int il = 0; il < n_layer; ++il) {
  5623. ggml_tensor * inpSA = inpL;
  5624. cur = build_norm(inpL,
  5625. model.layers[il].attn_norm, NULL,
  5626. LLM_NORM_RMS, il);
  5627. cb(cur, "attn_norm", il);
  5628. // self-attention
  5629. {
  5630. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  5631. cb(Qcur, "Qcur", il);
  5632. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  5633. cb(Kcur, "Kcur", il);
  5634. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  5635. cb(Vcur, "Vcur", il);
  5636. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  5637. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  5638. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  5639. cb(Qcur, "Qcur", il);
  5640. cb(Kcur, "Kcur", il);
  5641. cb(Vcur, "Vcur", il);
  5642. cur = build_attn(inp_attn,
  5643. model.layers[il].wo, NULL,
  5644. Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  5645. }
  5646. if (il == n_layer - 1 && inp_out_ids) {
  5647. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  5648. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  5649. }
  5650. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  5651. cb(ffn_inp, "ffn_inp", il);
  5652. // feed-forward network
  5653. {
  5654. cur = build_norm(ffn_inp,
  5655. model.layers[il].ffn_norm, NULL,
  5656. LLM_NORM_RMS, il);
  5657. cb(cur, "ffn_norm", il);
  5658. cur = build_ffn(cur,
  5659. model.layers[il].ffn_up, NULL, NULL,
  5660. model.layers[il].ffn_gate, NULL, NULL,
  5661. model.layers[il].ffn_down, NULL, NULL,
  5662. NULL,
  5663. LLM_FFN_SILU, LLM_FFN_PAR, il);
  5664. cb(cur, "ffn_out", il);
  5665. }
  5666. cur = ggml_add(ctx0, cur, ffn_inp);
  5667. cur = build_cvec(cur, il);
  5668. cb(cur, "l_out", il);
  5669. // input for next layer
  5670. inpL = cur;
  5671. }
  5672. cur = inpL;
  5673. cur = build_norm(cur,
  5674. model.output_norm, NULL,
  5675. LLM_NORM_RMS, -1);
  5676. cb(cur, "result_norm", -1);
  5677. res->t_embd = cur;
  5678. // lm_head
  5679. cur = build_lora_mm(model.output, cur);
  5680. cb(cur, "result_output", -1);
  5681. res->t_logits = cur;
  5682. ggml_build_forward_expand(gf, cur);
  5683. }
  5684. };
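// BERT family (BERT, NOMIC_BERT(_MOE), JINA_BERT_V2): encoder without a KV cache
// (build_attn_inp_no_cache), token-type and (for BERT) absolute position embeddings, optional
// Q/K norms, RoPE only for the NOMIC variants, and an architecture-dependent FFN (sequential
// GELU, gated GELU/GEGLU, SiLU or MoE); produces embeddings only - there is no lm_head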
  5685. struct llm_build_bert : public llm_graph_context {
  5686. llm_build_bert(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  5687. const int64_t n_embd_head = hparams.n_embd_head_v;
  5688. const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
  5689. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  5690. ggml_tensor * cur;
  5691. ggml_tensor * inpL;
  5692. ggml_tensor * inp_pos = nullptr;
  5693. if (model.arch != LLM_ARCH_JINA_BERT_V2) {
  5694. inp_pos = build_inp_pos();
  5695. }
  5696. // construct input embeddings (token, type, position)
  5697. inpL = build_inp_embd(model.tok_embd);
  5698. // token types are hardcoded to zero ("Sentence A")
  5699. if (model.type_embd) {
  5700. ggml_tensor * type_row0 = ggml_view_1d(ctx0, model.type_embd, n_embd, 0);
  5701. inpL = ggml_add(ctx0, inpL, type_row0);
  5702. }
  5703. if (model.arch == LLM_ARCH_BERT) {
  5704. inpL = ggml_add(ctx0, ggml_get_rows(ctx0, model.pos_embd, inp_pos), inpL);
  5705. }
  5706. cb(inpL, "inp_embd", -1);
  5707. // embed layer norm
  5708. inpL = build_norm(inpL, model.tok_norm, model.tok_norm_b, LLM_NORM, -1);
  5709. cb(inpL, "inp_norm", -1);
  5710. auto * inp_attn = build_attn_inp_no_cache();
  5711. ggml_tensor * inp_out_ids = build_inp_out_ids();
  5712. for (int il = 0; il < n_layer; ++il) {
  5713. ggml_tensor * cur = inpL;
  5714. {
  5715. ggml_tensor * Qcur;
  5716. ggml_tensor * Kcur;
  5717. ggml_tensor * Vcur;
  5718. // self-attention
  5719. if (model.layers[il].wqkv) {
  5720. cur = build_lora_mm(model.layers[il].wqkv, cur);
  5721. cb(cur, "wqkv", il);
  5722. if (model.layers[il].bqkv) {
  5723. cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
  5724. cb(cur, "bqkv", il);
  5725. }
  5726. Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
  5727. Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
  5728. Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
  5729. } else {
  5730. Qcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wq, cur), model.layers[il].bq);
  5731. Kcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wk, cur), model.layers[il].bk);
  5732. Vcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wv, cur), model.layers[il].bv);
  5733. }
  5734. if (model.layers[il].attn_q_norm) {
  5735. Qcur = build_norm(Qcur,
  5736. model.layers[il].attn_q_norm,
  5737. model.layers[il].attn_q_norm_b,
  5738. LLM_NORM, il);
  5739. }
  5740. if (model.layers[il].attn_k_norm) {
  5741. Kcur = build_norm(Kcur,
  5742. model.layers[il].attn_k_norm,
  5743. model.layers[il].attn_k_norm_b,
  5744. LLM_NORM, il);
  5745. }
  5746. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  5747. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  5748. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  5749. // RoPE
  5750. if (model.arch == LLM_ARCH_NOMIC_BERT || model.arch == LLM_ARCH_NOMIC_BERT_MOE) {
  5751. Qcur = ggml_rope_ext(
  5752. ctx0, Qcur, inp_pos, nullptr,
  5753. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  5754. ext_factor, attn_factor, beta_fast, beta_slow
  5755. );
  5756. Kcur = ggml_rope_ext(
  5757. ctx0, Kcur, inp_pos, nullptr,
  5758. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  5759. ext_factor, attn_factor, beta_fast, beta_slow
  5760. );
  5761. }
  5762. cb(Qcur, "Qcur", il);
  5763. cb(Kcur, "Kcur", il);
  5764. cb(Vcur, "Vcur", il);
  5765. cur = build_attn(inp_attn,
  5766. model.layers[il].wo, model.layers[il].bo,
  5767. Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  5768. cb(cur, "kqv_out", il);
  5769. }
  5770. if (il == n_layer - 1 && inp_out_ids) {
  5771. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  5772. inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
  5773. }
  5774. // re-add the layer input
  5775. cur = ggml_add(ctx0, cur, inpL);
  5776. // attention layer norm
  5777. cur = build_norm(cur, model.layers[il].attn_out_norm, model.layers[il].attn_out_norm_b, LLM_NORM, il);
  5778. if (model.layers[il].attn_norm_2 != nullptr) {
  5779. cur = ggml_add(ctx0, cur, inpL); // re-add the layer input
  5780. cur = build_norm(cur, model.layers[il].attn_norm_2, model.layers[il].attn_norm_2_b, LLM_NORM, il);
  5781. }
  5782. ggml_tensor * ffn_inp = cur;
  5783. cb(ffn_inp, "ffn_inp", il);
  5784. // feed-forward network
  5785. if (hparams.moe_every_n_layers > 0 && il % hparams.moe_every_n_layers == 1) {
  5786. // MoE branch
  5787. cur = build_moe_ffn(cur,
  5788. model.layers[il].ffn_gate_inp,
  5789. model.layers[il].ffn_up_exps,
  5790. nullptr,
  5791. model.layers[il].ffn_down_exps,
  5792. nullptr,
  5793. hparams.n_expert,
  5794. hparams.n_expert_used,
  5795. LLM_FFN_GELU,
  5796. false, false,
  5797. 0.0f,
  5798. LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, il);
  5799. cb(cur, "ffn_moe_out", il);
  5800. } else if (model.arch == LLM_ARCH_BERT || model.arch == LLM_ARCH_NOMIC_BERT_MOE) {
  5801. cur = build_ffn(cur,
  5802. model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
  5803. NULL, NULL, NULL,
  5804. model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
  5805. NULL,
  5806. LLM_FFN_GELU, LLM_FFN_SEQ, il);
  5807. cb(cur, "ffn_out", il);
  5808. } else if (model.arch == LLM_ARCH_JINA_BERT_V2) {
  5809. cur = build_ffn(cur,
  5810. model.layers[il].ffn_up, NULL, NULL,
  5811. model.layers[il].ffn_gate, NULL, NULL,
  5812. model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
  5813. NULL,
  5814. model.layers[il].ffn_gate ? LLM_FFN_GELU : LLM_FFN_GEGLU, LLM_FFN_PAR, il);
  5815. cb(cur, "ffn_out", il);
  5816. } else {
  5817. cur = build_ffn(cur,
  5818. model.layers[il].ffn_up, NULL, NULL,
  5819. model.layers[il].ffn_gate, NULL, NULL,
  5820. model.layers[il].ffn_down, NULL, NULL,
  5821. NULL,
  5822. LLM_FFN_SILU, LLM_FFN_PAR, il);
  5823. cb(cur, "ffn_out", il);
  5824. }
5825. // residual: the attention output bypasses the intermediate (FFN) layer
  5826. cur = ggml_add(ctx0, cur, ffn_inp);
  5827. // output layer norm
  5828. cur = build_norm(cur, model.layers[il].layer_out_norm, model.layers[il].layer_out_norm_b, LLM_NORM, il);
  5829. // input for next layer
  5830. inpL = cur;
  5831. }
  5832. cur = inpL;
  5833. cb(cur, "result_embd", -1);
  5834. res->t_embd = cur;
  5835. ggml_build_forward_expand(gf, cur);
  5836. }
  5837. };
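// NeoBERT: pre-norm encoder variant with a fused QKV projection, RoPE and a SwiGLU FFN;
// like the BERT builder above it outputs embeddings only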
  5838. struct llm_build_neo_bert : public llm_graph_context {
  5839. llm_build_neo_bert(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  5840. const int64_t n_embd_head = hparams.n_embd_head_v;
  5841. const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
  5842. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  5843. ggml_tensor * cur;
  5844. ggml_tensor * inpL;
  5845. ggml_tensor * inp_pos = build_inp_pos();
  5846. // construct input embeddings (token, type, position)
  5847. inpL = build_inp_embd(model.tok_embd);
  5848. cb(inpL, "inp_embd", -1);
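// encoder-style model: attention input is built without a KV cache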
  5849. auto * inp_attn = build_attn_inp_no_cache();
  5850. ggml_tensor * inp_out_ids = build_inp_out_ids();
  5851. for (int il = 0; il < n_layer; ++il) {
  5852. ggml_tensor * cur = inpL;
  5853. // pre-norm
  5854. cur = build_norm(inpL,
  5855. model.layers[il].attn_norm, NULL,
  5856. LLM_NORM_RMS, il);
  5857. {
  5858. ggml_tensor * Qcur;
  5859. ggml_tensor * Kcur;
  5860. ggml_tensor * Vcur;
  5861. // self-attention
  5862. cur = build_lora_mm(model.layers[il].wqkv, cur);
  5863. cb(cur, "wqkv", il);
  5864. Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
  5865. Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
  5866. Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
  5867. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  5868. // RoPE
  5869. Qcur = ggml_rope_ext(
  5870. ctx0, Qcur, inp_pos, nullptr,
  5871. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  5872. ext_factor, attn_factor, beta_fast, beta_slow
  5873. );
  5874. Kcur = ggml_rope_ext(
  5875. ctx0, Kcur, inp_pos, nullptr,
  5876. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  5877. ext_factor, attn_factor, beta_fast, beta_slow
  5878. );
  5879. cb(Qcur, "Qcur", il);
  5880. cb(Kcur, "Kcur", il);
  5881. cb(Vcur, "Vcur", il);
  5882. cur = build_attn(inp_attn,
  5883. model.layers[il].wo, nullptr,
  5884. Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  5885. cb(cur, "kqv_out", il);
  5886. }
  5887. if (il == n_layer - 1 && inp_out_ids) {
  5888. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  5889. inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
  5890. }
  5891. // re-add the layer input
  5892. cur = ggml_add(ctx0, cur, inpL);
  5893. ggml_tensor * ffn_inp = cur;
  5894. cb(ffn_inp, "ffn_inp", il);
  5895. // pre-norm
  5896. cur = build_norm(ffn_inp,
  5897. model.layers[il].ffn_norm, NULL,
  5898. LLM_NORM_RMS, il);
  5899. cb(cur, "ffn_norm", il);
  5900. // feed-forward network
  5901. cur = build_ffn(cur,
  5902. model.layers[il].ffn_up,
  5903. NULL, NULL, NULL, NULL, NULL,
  5904. model.layers[il].ffn_down,
  5905. NULL, NULL, NULL,
  5906. LLM_FFN_SWIGLU, LLM_FFN_SEQ, il);
// residual connection: the attention output (ffn_inp) bypasses the FFN
  5908. cur = ggml_add(ctx0, cur, ffn_inp);
  5909. // input for next layer
  5910. inpL = cur;
  5911. }
  5912. cur = inpL;
  5913. cur = build_norm(cur,
  5914. model.output_norm_enc, NULL,
  5915. LLM_NORM_RMS, -1);
  5916. cb(cur, "result_embd", -1);
  5917. res->t_embd = cur;
  5918. ggml_build_forward_expand(gf, cur);
  5919. }
  5920. };
  5921. struct llm_build_bloom : public llm_graph_context {
  5922. llm_build_bloom(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  5923. const int64_t n_embd_head = hparams.n_embd_head_v;
  5924. const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
  5925. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  5926. ggml_tensor * cur;
  5927. ggml_tensor * inpL;
  5928. inpL = build_inp_embd(model.tok_embd);
  5929. auto * inp_attn = build_attn_inp_kv_unified();
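// BLOOM normalizes the token embeddings before the first block (word_embeddings_layernorm in the reference implementation)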
  5930. inpL = build_norm(inpL,
  5931. model.tok_norm,
  5932. model.tok_norm_b,
  5933. LLM_NORM, -1);
  5934. cb(inpL, "inp_norm", -1);
  5935. ggml_tensor * inp_out_ids = build_inp_out_ids();
  5936. for (int il = 0; il < n_layer; ++il) {
  5937. cur = build_norm(inpL,
  5938. model.layers[il].attn_norm,
  5939. model.layers[il].attn_norm_b,
  5940. LLM_NORM, il);
  5941. cb(cur, "attn_norm", il);
  5942. // self-attention
  5943. {
  5944. cur = build_lora_mm(model.layers[il].wqkv, cur);
  5945. cb(cur, "wqkv", il);
  5946. cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
  5947. cb(cur, "bqkv", il);
  5948. ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
  5949. ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
  5950. ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
  5951. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  5952. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  5953. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  5954. cb(Qcur, "Qcur", il);
  5955. cb(Kcur, "Kcur", il);
  5956. cb(Vcur, "Vcur", il);
  5957. cur = build_attn(inp_attn,
  5958. model.layers[il].wo, model.layers[il].bo,
  5959. Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  5960. }
  5961. if (il == n_layer - 1 && inp_out_ids) {
  5962. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  5963. inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
  5964. }
  5965. // Add the input
  5966. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
  5967. cb(ffn_inp, "ffn_inp", il);
  5968. // FF
  5969. {
  5970. cur = build_norm(ffn_inp,
  5971. model.layers[il].ffn_norm,
  5972. model.layers[il].ffn_norm_b,
  5973. LLM_NORM, il);
  5974. cb(cur, "ffn_norm", il);
  5975. cur = build_ffn(cur,
  5976. model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
  5977. NULL, NULL, NULL,
  5978. model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
  5979. NULL,
  5980. LLM_FFN_GELU, LLM_FFN_SEQ, il);
  5981. cb(cur, "ffn_out", il);
  5982. }
  5983. cur = ggml_add(ctx0, cur, ffn_inp);
  5984. cur = build_cvec(cur, il);
  5985. cb(cur, "l_out", il);
  5986. // input for next layer
  5987. inpL = cur;
  5988. }
  5989. cur = build_norm(inpL,
  5990. model.output_norm,
  5991. model.output_norm_b,
  5992. LLM_NORM, -1);
  5993. cb(cur, "result_norm", -1);
  5994. res->t_embd = cur;
  5995. cur = build_lora_mm(model.output, cur);
  5996. cb(cur, "result_output", -1);
  5997. res->t_logits = cur;
  5998. ggml_build_forward_expand(gf, cur);
  5999. }
  6000. };
  6001. struct llm_build_mpt : public llm_graph_context {
  6002. llm_build_mpt(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  6003. const int64_t n_embd_head = hparams.n_embd_head_v;
  6004. const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
  6005. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  6006. ggml_tensor * cur;
  6007. ggml_tensor * pos;
  6008. ggml_tensor * inpL;
  6009. inpL = build_inp_embd(model.tok_embd);
  6010. auto * inp_attn = build_attn_inp_kv_unified();
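// learned position embeddings are optional for MPT: they are added only when the model provides a pos_embd tensor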
  6011. if (model.pos_embd) {
  6012. // inp_pos - contains the positions
  6013. ggml_tensor * inp_pos = build_inp_pos();
  6014. pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos);
  6015. cb(pos, "pos_embd", -1);
  6016. inpL = ggml_add(ctx0, inpL, pos);
  6017. cb(inpL, "inpL", -1);
  6018. }
  6019. ggml_tensor * inp_out_ids = build_inp_out_ids();
  6020. for (int il = 0; il < n_layer; ++il) {
  6021. ggml_tensor * attn_norm;
  6022. attn_norm = build_norm(inpL,
  6023. model.layers[il].attn_norm,
  6024. model.layers[il].attn_norm_b,
  6025. LLM_NORM, il);
  6026. cb(attn_norm, "attn_norm", il);
  6027. // self-attention
  6028. {
  6029. cur = attn_norm;
  6030. cur = build_lora_mm(model.layers[il].wqkv, cur);
  6031. cb(cur, "wqkv", il);
if (model.layers[il].bqkv) {
  6033. cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
  6034. cb(cur, "bqkv", il);
  6035. }
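// optionally clamp the fused QKV activations to [-f_clamp_kqv, f_clamp_kqv] for numerical stability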
  6036. if (hparams.f_clamp_kqv > 0.0f) {
  6037. cur = ggml_clamp(ctx0, cur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
  6038. cb(cur, "wqkv_clamped", il);
  6039. }
  6040. ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd));
  6041. ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd));
  6042. ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
  6043. cb(Qcur, "Qcur", il);
  6044. cb(Kcur, "Kcur", il);
  6045. cb(Vcur, "Vcur", il);
  6046. // Q/K Layernorm
  6047. if (model.layers[il].attn_q_norm) {
  6048. Qcur = build_norm(Qcur,
  6049. model.layers[il].attn_q_norm,
  6050. model.layers[il].attn_q_norm_b,
  6051. LLM_NORM, il);
  6052. cb(Qcur, "Qcur", il);
  6053. Kcur = build_norm(Kcur,
  6054. model.layers[il].attn_k_norm,
  6055. model.layers[il].attn_k_norm_b,
  6056. LLM_NORM, il);
  6057. cb(Kcur, "Kcur", il);
  6058. } else {
  6059. Qcur = ggml_cont(ctx0, Qcur);
  6060. cb(Qcur, "Qcur", il);
  6061. Kcur = ggml_cont(ctx0, Kcur);
  6062. cb(Kcur, "Kcur", il);
  6063. }
  6064. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  6065. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  6066. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  6067. cb(Qcur, "Qcur", il);
  6068. cb(Kcur, "Kcur", il);
  6069. cb(Vcur, "Vcur", il);
  6070. cur = build_attn(inp_attn,
  6071. model.layers[il].wo, model.layers[il].bo,
  6072. Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  6073. }
  6074. if (il == n_layer - 1 && inp_out_ids) {
  6075. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  6076. inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
  6077. }
  6078. // Add the input
  6079. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
  6080. cb(ffn_inp, "ffn_inp", il);
  6081. // feed forward
  6082. {
  6083. cur = build_norm(ffn_inp,
  6084. model.layers[il].ffn_norm,
  6085. model.layers[il].ffn_norm_b,
  6086. LLM_NORM, il);
  6087. cb(cur, "ffn_norm", il);
  6088. cur = build_ffn(cur,
  6089. model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
  6090. NULL, NULL, NULL,
  6091. model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
  6092. model.layers[il].ffn_act,
  6093. LLM_FFN_GELU, LLM_FFN_SEQ, il);
  6094. cb(cur, "ffn_out", il);
  6095. }
  6096. cur = ggml_add(ctx0, cur, ffn_inp);
  6097. cur = build_cvec(cur, il);
  6098. cb(cur, "l_out", il);
  6099. // input for next layer
  6100. inpL = cur;
  6101. }
  6102. cur = inpL;
  6103. cur = build_norm(cur,
  6104. model.output_norm,
  6105. model.output_norm_b,
  6106. LLM_NORM, -1);
  6107. cb(cur, "result_norm", -1);
  6108. res->t_embd = cur;
  6109. cur = build_lora_mm(model.output, cur);
  6110. cb(cur, "result_output", -1);
  6111. res->t_logits = cur;
  6112. ggml_build_forward_expand(gf, cur);
  6113. }
  6114. };
  6115. struct llm_build_stablelm : public llm_graph_context {
  6116. llm_build_stablelm(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  6117. const int64_t n_embd_head = hparams.n_embd_head_v;
  6118. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  6119. ggml_tensor * cur;
  6120. ggml_tensor * inpL;
  6121. inpL = build_inp_embd(model.tok_embd);
  6122. // inp_pos - contains the positions
  6123. ggml_tensor * inp_pos = build_inp_pos();
  6124. auto * inp_attn = build_attn_inp_kv_unified();
  6125. ggml_tensor * inp_out_ids = build_inp_out_ids();
  6126. for (int il = 0; il < n_layer; ++il) {
  6127. // norm
  6128. cur = build_norm(inpL,
  6129. model.layers[il].attn_norm,
  6130. model.layers[il].attn_norm_b,
  6131. LLM_NORM, il);
  6132. cb(cur, "attn_norm", il);
  6133. ggml_tensor * inpSA = cur;
  6134. // self-attention
  6135. {
  6136. // compute Q and K and RoPE them
  6137. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  6138. cb(Qcur, "Qcur", il);
  6139. if (model.layers[il].bq) {
  6140. Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
  6141. cb(Qcur, "Qcur", il);
  6142. }
  6143. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  6144. cb(Kcur, "Kcur", il);
  6145. if (model.layers[il].bk) {
  6146. Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
  6147. cb(Kcur, "Kcur", il);
  6148. }
  6149. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  6150. cb(Vcur, "Vcur", il);
  6151. if (model.layers[il].bv) {
  6152. Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
  6153. cb(Vcur, "Vcur", il);
  6154. }
  6155. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  6156. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  6157. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  6158. if (model.layers[il].attn_q_norm) {
  6159. Qcur = build_norm(Qcur,
  6160. model.layers[il].attn_q_norm,
  6161. NULL,
  6162. LLM_NORM, il);
  6163. cb(Qcur, "Qcur", il);
  6164. }
  6165. if (model.layers[il].attn_k_norm) {
  6166. Kcur = build_norm(Kcur,
  6167. model.layers[il].attn_k_norm,
  6168. NULL,
  6169. LLM_NORM, il);
  6170. cb(Kcur, "Kcur", il);
  6171. }
  6172. Qcur = ggml_rope_ext(
  6173. ctx0, Qcur, inp_pos, nullptr,
  6174. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  6175. ext_factor, attn_factor, beta_fast, beta_slow
  6176. );
  6177. Kcur = ggml_rope_ext(
  6178. ctx0, Kcur, inp_pos, nullptr,
  6179. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  6180. ext_factor, attn_factor, beta_fast, beta_slow
  6181. );
  6182. cb(Qcur, "Qcur", il);
  6183. cb(Kcur, "Kcur", il);
  6184. cb(Vcur, "Vcur", il);
  6185. cur = build_attn(inp_attn,
  6186. model.layers[il].wo, NULL,
  6187. Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  6188. }
  6189. if (il == n_layer - 1 && inp_out_ids) {
  6190. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  6191. inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
  6192. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  6193. }
  6194. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
  6195. cb(ffn_inp, "ffn_inp", il);
  6196. // feed-forward network
  6197. {
  6198. if (model.layers[il].ffn_norm) {
  6199. cur = build_norm(ffn_inp,
  6200. model.layers[il].ffn_norm,
  6201. model.layers[il].ffn_norm_b,
  6202. LLM_NORM, il);
  6203. cb(cur, "ffn_norm", il);
  6204. } else {
  6205. // parallel residual
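// no FFN norm: the FFN reuses the attention-normalized input, so attention and FFN branch off the same normed hidden state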
  6206. cur = inpSA;
  6207. }
  6208. cur = build_ffn(cur,
  6209. model.layers[il].ffn_up, NULL, NULL,
  6210. model.layers[il].ffn_gate, NULL, NULL,
  6211. model.layers[il].ffn_down, NULL, NULL,
  6212. NULL,
  6213. LLM_FFN_SILU, LLM_FFN_PAR, il);
  6214. cb(cur, "ffn_out", il);
  6215. }
  6216. cur = ggml_add(ctx0, cur, ffn_inp);
  6217. cur = build_cvec(cur, il);
  6218. cb(cur, "l_out", il);
  6219. // input for next layer
  6220. inpL = cur;
  6221. }
  6222. cur = inpL;
  6223. cur = build_norm(cur,
  6224. model.output_norm,
  6225. model.output_norm_b,
  6226. LLM_NORM, -1);
  6227. cb(cur, "result_norm", -1);
  6228. res->t_embd = cur;
  6229. // lm_head
  6230. cur = build_lora_mm(model.output, cur);
  6231. cb(cur, "result_output", -1);
  6232. res->t_logits = cur;
  6233. ggml_build_forward_expand(gf, cur);
  6234. }
  6235. };
  6236. struct llm_build_qwen : public llm_graph_context {
  6237. llm_build_qwen(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  6238. const int64_t n_embd_head = hparams.n_embd_head_v;
  6239. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  6240. ggml_tensor * cur;
  6241. ggml_tensor * inpL;
  6242. inpL = build_inp_embd(model.tok_embd);
  6243. // inp_pos - contains the positions
  6244. ggml_tensor * inp_pos = build_inp_pos();
  6245. auto * inp_attn = build_attn_inp_kv_unified();
  6246. ggml_tensor * inp_out_ids = build_inp_out_ids();
  6247. for (int il = 0; il < n_layer; ++il) {
  6248. ggml_tensor * inpSA = inpL;
  6249. cur = build_norm(inpL,
  6250. model.layers[il].attn_norm, NULL,
  6251. LLM_NORM_RMS, il);
  6252. cb(cur, "attn_norm", il);
  6253. // self-attention
  6254. {
  6255. cur = build_lora_mm(model.layers[il].wqkv, cur);
  6256. cb(cur, "wqkv", il);
  6257. cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
  6258. cb(cur, "bqkv", il);
  6259. ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
  6260. ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
  6261. ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 2*sizeof(float)*(n_embd)));
  6262. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
// NeoX-style RoPE (rotation variant selected via rope_type)
  6264. Qcur = ggml_rope_ext(
  6265. ctx0, Qcur, inp_pos, nullptr,
  6266. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  6267. ext_factor, attn_factor, beta_fast, beta_slow
  6268. );
  6269. Kcur = ggml_rope_ext(
  6270. ctx0, Kcur, inp_pos, nullptr,
  6271. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  6272. ext_factor, attn_factor, beta_fast, beta_slow
  6273. );
  6274. cb(Qcur, "Qcur", il);
  6275. cb(Kcur, "Kcur", il);
  6276. cb(Vcur, "Vcur", il);
  6277. cur = build_attn(inp_attn,
  6278. model.layers[il].wo, NULL,
  6279. Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  6280. }
  6281. if (il == n_layer - 1 && inp_out_ids) {
  6282. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  6283. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  6284. }
  6285. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  6286. cb(ffn_inp, "ffn_inp", il);
// feed-forward network
  6288. {
  6289. cur = build_norm(ffn_inp,
  6290. model.layers[il].ffn_norm, NULL,
  6291. LLM_NORM_RMS, il);
  6292. cb(cur, "ffn_norm", il);
  6293. cur = build_ffn(cur,
  6294. model.layers[il].ffn_up, NULL, NULL,
  6295. model.layers[il].ffn_gate, NULL, NULL,
  6296. model.layers[il].ffn_down, NULL, NULL,
  6297. NULL,
  6298. LLM_FFN_SILU, LLM_FFN_PAR, il);
  6299. cb(cur, "ffn_out", il);
  6300. }
  6301. cur = ggml_add(ctx0, cur, ffn_inp);
  6302. cur = build_cvec(cur, il);
  6303. cb(cur, "l_out", il);
  6304. // input for next layer
  6305. inpL = cur;
  6306. }
  6307. cur = inpL;
  6308. cur = build_norm(cur,
  6309. model.output_norm, NULL,
  6310. LLM_NORM_RMS, -1);
  6311. cb(cur, "result_norm", -1);
  6312. res->t_embd = cur;
  6313. // lm_head
  6314. cur = build_lora_mm(model.output, cur);
  6315. cb(cur, "result_output", -1);
  6316. res->t_logits = cur;
  6317. ggml_build_forward_expand(gf, cur);
  6318. }
  6319. };
  6320. struct llm_build_qwen2 : public llm_graph_context {
  6321. llm_build_qwen2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  6322. const int64_t n_embd_head = hparams.n_embd_head_v;
  6323. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  6324. GGML_ASSERT(n_embd_head == hparams.n_rot);
  6325. ggml_tensor * cur;
  6326. ggml_tensor * inpL;
  6327. inpL = build_inp_embd(model.tok_embd);
  6328. // inp_pos - contains the positions
  6329. ggml_tensor * inp_pos = build_inp_pos();
  6330. auto * inp_attn = build_attn_inp_kv_unified();
  6331. ggml_tensor * inp_out_ids = build_inp_out_ids();
  6332. for (int il = 0; il < n_layer; ++il) {
  6333. ggml_tensor * inpSA = inpL;
  6334. // norm
  6335. cur = build_norm(inpL,
  6336. model.layers[il].attn_norm, NULL,
  6337. LLM_NORM_RMS, il);
  6338. cb(cur, "attn_norm", il);
  6339. // self-attention
  6340. {
  6341. // compute Q and K and RoPE them
  6342. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  6343. Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
  6344. cb(Qcur, "Qcur", il);
  6345. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  6346. Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
  6347. cb(Kcur, "Kcur", il);
  6348. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  6349. Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
  6350. cb(Vcur, "Vcur", il);
  6351. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  6352. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  6353. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  6354. Qcur = ggml_rope_ext(
  6355. ctx0, Qcur, inp_pos, nullptr,
  6356. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  6357. ext_factor, attn_factor, beta_fast, beta_slow
  6358. );
  6359. Kcur = ggml_rope_ext(
  6360. ctx0, Kcur, inp_pos, nullptr,
  6361. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  6362. ext_factor, attn_factor, beta_fast, beta_slow
  6363. );
  6364. cb(Qcur, "Qcur", il);
  6365. cb(Kcur, "Kcur", il);
  6366. cb(Vcur, "Vcur", il);
  6367. cur = build_attn(inp_attn,
  6368. model.layers[il].wo, model.layers[il].bo,
  6369. Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  6370. }
  6371. if (il == n_layer - 1 && inp_out_ids) {
  6372. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  6373. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  6374. }
  6375. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  6376. cb(ffn_inp, "ffn_inp", il);
  6377. // feed-forward network
  6378. cur = build_norm(ffn_inp,
  6379. model.layers[il].ffn_norm, NULL,
  6380. LLM_NORM_RMS, il);
  6381. cb(cur, "ffn_norm", il);
  6382. cur = build_ffn(cur,
  6383. model.layers[il].ffn_up, NULL, NULL,
  6384. model.layers[il].ffn_gate, NULL, NULL,
  6385. model.layers[il].ffn_down, NULL, NULL,
  6386. NULL,
  6387. LLM_FFN_SILU, LLM_FFN_PAR, il);
  6388. cb(cur, "ffn_out", il);
  6389. cur = ggml_add(ctx0, cur, ffn_inp);
  6390. cur = build_cvec(cur, il);
  6391. cb(cur, "l_out", il);
  6392. // input for next layer
  6393. inpL = cur;
  6394. }
  6395. cur = inpL;
  6396. cur = build_norm(cur,
  6397. model.output_norm, NULL,
  6398. LLM_NORM_RMS, -1);
  6399. cb(cur, "result_norm", -1);
  6400. res->t_embd = cur;
  6401. // lm_head
  6402. cur = build_lora_mm(model.output, cur);
  6403. if (model.output_b != nullptr) {
  6404. cur = ggml_add(ctx0, cur, model.output_b);
  6405. }
  6406. cb(cur, "result_output", -1);
  6407. res->t_logits = cur;
  6408. ggml_build_forward_expand(gf, cur);
  6409. }
  6410. };
  6411. struct llm_build_dream : public llm_graph_context {
  6412. llm_build_dream(const llama_model & model, const llm_graph_params & params) :
  6413. llm_graph_context(params) {
// copied from qwen2
  6415. const int64_t n_embd_head = hparams.n_embd_head_v;
  6416. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  6417. GGML_ASSERT(n_embd_head == hparams.n_rot);
  6418. ggml_tensor * cur;
  6419. ggml_tensor * inpL;
  6420. inpL = build_inp_embd(model.tok_embd);
  6421. // inp_pos - contains the positions
  6422. ggml_tensor * inp_pos = build_inp_pos();
  6423. auto * inp_attn = build_attn_inp_no_cache();
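// like the encoder-style models, attention is built without a KV cache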
  6424. ggml_tensor * inp_out_ids = build_inp_out_ids();
  6425. for (int il = 0; il < n_layer; ++il) {
  6426. ggml_tensor * inpSA = inpL;
  6427. // norm
  6428. cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
  6429. cb(cur, "attn_norm", il);
  6430. // self-attention
  6431. {
  6432. // compute Q and K and RoPE them
  6433. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  6434. Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
  6435. cb(Qcur, "Qcur", il);
  6436. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  6437. Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
  6438. cb(Kcur, "Kcur", il);
  6439. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  6440. Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
  6441. cb(Vcur, "Vcur", il);
  6442. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  6443. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  6444. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  6445. Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  6446. ext_factor, attn_factor, beta_fast, beta_slow);
  6447. Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  6448. ext_factor, attn_factor, beta_fast, beta_slow);
  6449. cb(Qcur, "Qcur", il);
  6450. cb(Kcur, "Kcur", il);
  6451. cb(Vcur, "Vcur", il);
  6452. cur = build_attn(inp_attn, model.layers[il].wo, model.layers[il].bo, Qcur, Kcur, Vcur, nullptr,
  6453. nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
  6454. }
  6455. if (il == n_layer - 1 && inp_out_ids) {
  6456. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  6457. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  6458. }
  6459. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  6460. cb(ffn_inp, "ffn_inp", il);
  6461. // feed-forward network
  6462. cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il);
  6463. cb(cur, "ffn_norm", il);
  6464. cur = build_ffn(cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL,
  6465. model.layers[il].ffn_down, NULL, NULL, NULL, LLM_FFN_SILU, LLM_FFN_PAR, il);
  6466. cb(cur, "ffn_out", il);
  6467. cur = ggml_add(ctx0, cur, ffn_inp);
  6468. cur = build_cvec(cur, il);
  6469. cb(cur, "l_out", il);
  6470. // input for next layer
  6471. inpL = cur;
  6472. }
  6473. cur = inpL;
  6474. cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
  6475. cb(cur, "result_norm", -1);
  6476. res->t_embd = cur;
  6477. // lm_head
  6478. cur = build_lora_mm(model.output, cur);
  6479. cb(cur, "result_output", -1);
  6480. res->t_logits = cur;
  6481. ggml_build_forward_expand(gf, cur);
  6482. }
  6483. };
  6484. struct llm_build_qwen2vl : public llm_graph_context {
  6485. llm_build_qwen2vl(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  6486. const int64_t n_embd_head = hparams.n_embd_head_v;
  6487. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  6488. GGML_ASSERT(n_embd_head == hparams.n_rot);
  6489. ggml_tensor * cur;
  6490. ggml_tensor * inpL;
  6491. inpL = build_inp_embd(model.tok_embd);
  6492. // inp_pos - contains the positions
  6493. ggml_tensor * inp_pos = build_inp_pos();
  6494. auto * inp_attn = build_attn_inp_kv_unified();
  6495. int sections[4];
  6496. std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections);
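// Qwen2-VL multi-section RoPE (M-RoPE): ggml_rope_multi rotates separate groups of dimensions according to the four section sizes copied above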
  6497. ggml_tensor * inp_out_ids = build_inp_out_ids();
  6498. for (int il = 0; il < n_layer; ++il) {
  6499. ggml_tensor * inpSA = inpL;
  6500. // norm
  6501. cur = build_norm(inpL,
  6502. model.layers[il].attn_norm, NULL,
  6503. LLM_NORM_RMS, il);
  6504. cb(cur, "attn_norm", il);
  6505. // self-attention
  6506. {
  6507. // compute Q and K and RoPE them
  6508. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  6509. Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
  6510. cb(Qcur, "Qcur", il);
  6511. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  6512. Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
  6513. cb(Kcur, "Kcur", il);
  6514. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  6515. Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
  6516. cb(Vcur, "Vcur", il);
  6517. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  6518. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  6519. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  6520. Qcur = ggml_rope_multi(
  6521. ctx0, Qcur, inp_pos, nullptr,
  6522. n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
  6523. ext_factor, attn_factor, beta_fast, beta_slow
  6524. );
  6525. Kcur = ggml_rope_multi(
  6526. ctx0, Kcur, inp_pos, nullptr,
  6527. n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
  6528. ext_factor, attn_factor, beta_fast, beta_slow
  6529. );
  6530. cb(Qcur, "Qcur", il);
  6531. cb(Kcur, "Kcur", il);
  6532. cb(Vcur, "Vcur", il);
  6533. cur = build_attn(inp_attn,
  6534. model.layers[il].wo, model.layers[il].bo,
  6535. Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  6536. }
  6537. if (il == n_layer - 1 && inp_out_ids) {
  6538. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  6539. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  6540. }
  6541. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  6542. cb(ffn_inp, "ffn_inp", il);
  6543. // feed-forward network
  6544. cur = build_norm(ffn_inp,
  6545. model.layers[il].ffn_norm, NULL,
  6546. LLM_NORM_RMS, il);
  6547. cb(cur, "ffn_norm", il);
  6548. cur = build_ffn(cur,
  6549. model.layers[il].ffn_up, NULL, NULL,
  6550. model.layers[il].ffn_gate, NULL, NULL,
  6551. model.layers[il].ffn_down, NULL, NULL,
  6552. NULL,
  6553. LLM_FFN_SILU, LLM_FFN_PAR, il);
  6554. cb(cur, "ffn_out", il);
  6555. cur = ggml_add(ctx0, cur, ffn_inp);
  6556. cur = build_cvec(cur, il);
  6557. cb(cur, "l_out", il);
  6558. // input for next layer
  6559. inpL = cur;
  6560. }
  6561. cur = inpL;
  6562. cur = build_norm(cur,
  6563. model.output_norm, NULL,
  6564. LLM_NORM_RMS, -1);
  6565. cb(cur, "result_norm", -1);
  6566. res->t_embd = cur;
  6567. // lm_head
  6568. cur = build_lora_mm(model.output, cur);
  6569. cb(cur, "result_output", -1);
  6570. res->t_logits = cur;
  6571. ggml_build_forward_expand(gf, cur);
  6572. }
  6573. };
  6574. struct llm_build_qwen2moe : public llm_graph_context {
  6575. llm_build_qwen2moe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  6576. const int64_t n_embd_head = hparams.n_embd_head_v;
  6577. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  6578. GGML_ASSERT(n_embd_head == hparams.n_rot);
  6579. ggml_tensor * cur;
  6580. ggml_tensor * inpL;
  6581. inpL = build_inp_embd(model.tok_embd);
  6582. // inp_pos - contains the positions
  6583. ggml_tensor * inp_pos = build_inp_pos();
  6584. auto * inp_attn = build_attn_inp_kv_unified();
  6585. ggml_tensor * inp_out_ids = build_inp_out_ids();
  6586. for (int il = 0; il < n_layer; ++il) {
  6587. ggml_tensor * inpSA = inpL;
  6588. // norm
  6589. cur = build_norm(inpL,
  6590. model.layers[il].attn_norm, NULL,
  6591. LLM_NORM_RMS, il);
  6592. cb(cur, "attn_norm", il);
// self-attention
  6594. {
  6595. // compute Q and K and RoPE them
  6596. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  6597. cb(Qcur, "Qcur", il);
  6598. if (model.layers[il].bq) {
  6599. Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
  6600. cb(Qcur, "Qcur", il);
  6601. }
  6602. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  6603. cb(Kcur, "Kcur", il);
  6604. if (model.layers[il].bk) {
  6605. Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
  6606. cb(Kcur, "Kcur", il);
  6607. }
  6608. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  6609. cb(Vcur, "Vcur", il);
  6610. if (model.layers[il].bv) {
  6611. Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
  6612. cb(Vcur, "Vcur", il);
  6613. }
  6614. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  6615. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  6616. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  6617. Qcur = ggml_rope_ext(
  6618. ctx0, Qcur, inp_pos, nullptr,
  6619. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  6620. ext_factor, attn_factor, beta_fast, beta_slow
  6621. );
  6622. Kcur = ggml_rope_ext(
  6623. ctx0, Kcur, inp_pos, nullptr,
  6624. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  6625. ext_factor, attn_factor, beta_fast, beta_slow
  6626. );
  6627. cb(Qcur, "Qcur", il);
  6628. cb(Kcur, "Kcur", il);
  6629. cb(Vcur, "Vcur", il);
  6630. cur = build_attn(inp_attn,
  6631. model.layers[il].wo, model.layers[il].bo,
  6632. Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  6633. }
  6634. if (il == n_layer - 1 && inp_out_ids) {
  6635. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  6636. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  6637. }
  6638. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  6639. cb(ffn_inp, "ffn_inp", il);
  6640. // MoE branch
  6641. cur = build_norm(ffn_inp,
  6642. model.layers[il].ffn_norm, NULL,
  6643. LLM_NORM_RMS, il);
  6644. cb(cur, "ffn_norm", il);
  6645. ggml_tensor * moe_out =
  6646. build_moe_ffn(cur,
  6647. model.layers[il].ffn_gate_inp,
  6648. model.layers[il].ffn_up_exps,
  6649. model.layers[il].ffn_gate_exps,
  6650. model.layers[il].ffn_down_exps,
  6651. nullptr,
  6652. n_expert, n_expert_used,
  6653. LLM_FFN_SILU, false,
  6654. false, 0.0,
  6655. LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
  6656. il);
  6657. cb(moe_out, "ffn_moe_out", il);
  6658. // FFN shared expert
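// besides the routed experts, an always-active shared expert runs in parallel; its output is multiplied element-wise by a sigmoid gate and added to the MoE output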
  6659. {
  6660. ggml_tensor * cur_gate_inp = build_lora_mm(model.layers[il].ffn_gate_inp_shexp, cur);
  6661. cb(cur_gate_inp, "ffn_shexp_gate_inp", il);
// sigmoid gate: silu(x) / x == sigmoid(x)
  6663. ggml_tensor * cur_gate = ggml_div(ctx0, ggml_silu(ctx0, cur_gate_inp), cur_gate_inp);
  6664. cb(cur_gate, "ffn_shexp_gate", il);
  6665. ggml_tensor * cur_ffn = build_ffn(cur,
  6666. model.layers[il].ffn_up_shexp, NULL, NULL,
  6667. model.layers[il].ffn_gate_shexp, NULL, NULL,
  6668. model.layers[il].ffn_down_shexp, NULL, NULL,
  6669. NULL,
  6670. LLM_FFN_SILU, LLM_FFN_PAR, il);
  6671. cb(cur_ffn, "ffn_shexp", il);
  6672. ggml_tensor * ffn_shexp_out = ggml_mul(ctx0, cur_ffn, cur_gate);
  6673. cb(ffn_shexp_out, "ffn_shexp_out", il);
  6674. moe_out = ggml_add(ctx0, moe_out, ffn_shexp_out);
  6675. cb(moe_out, "ffn_out", il);
  6676. cur = moe_out;
  6677. }
  6678. cur = ggml_add(ctx0, cur, ffn_inp);
  6679. cur = build_cvec(cur, il);
  6680. cb(cur, "l_out", il);
  6681. // input for next layer
  6682. inpL = cur;
  6683. }
  6684. cur = inpL;
  6685. cur = build_norm(cur,
  6686. model.output_norm, NULL,
  6687. LLM_NORM_RMS, -1);
  6688. cb(cur, "result_norm", -1);
  6689. res->t_embd = cur;
  6690. // lm_head
  6691. cur = build_lora_mm(model.output, cur);
  6692. cb(cur, "result_output", -1);
  6693. res->t_logits = cur;
  6694. ggml_build_forward_expand(gf, cur);
  6695. }
  6696. };
  6697. struct llm_build_qwen3 : public llm_graph_context {
  6698. llm_build_qwen3(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  6699. const int64_t n_embd_head = hparams.n_embd_head_v;
  6700. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  6701. GGML_ASSERT(n_embd_head == hparams.n_rot);
  6702. ggml_tensor * cur;
  6703. ggml_tensor * inpL;
  6704. inpL = build_inp_embd(model.tok_embd);
  6705. // inp_pos - contains the positions
  6706. ggml_tensor * inp_pos = build_inp_pos();
  6707. auto * inp_attn = build_attn_inp_kv_unified();
  6708. ggml_tensor * inp_out_ids = build_inp_out_ids();
  6709. for (int il = 0; il < n_layer; ++il) {
  6710. ggml_tensor * inpSA = inpL;
  6711. // norm
  6712. cur = build_norm(inpL,
  6713. model.layers[il].attn_norm, NULL,
  6714. LLM_NORM_RMS, il);
  6715. cb(cur, "attn_norm", il);
  6716. // self-attention
  6717. {
  6718. // compute Q and K and RoPE them
  6719. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  6720. cb(Qcur, "Qcur", il);
  6721. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  6722. cb(Kcur, "Kcur", il);
  6723. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  6724. cb(Vcur, "Vcur", il);
  6725. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  6726. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  6727. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
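// Qwen3 applies RMSNorm to the Q and K heads before RoPE (and, unlike Qwen2, adds no Q/K/V biases)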
  6728. Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
  6729. cb(Qcur, "Qcur_normed", il);
  6730. Qcur = ggml_rope_ext(
  6731. ctx0, Qcur, inp_pos, nullptr,
  6732. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  6733. ext_factor, attn_factor, beta_fast, beta_slow
  6734. );
  6735. Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
  6736. cb(Kcur, "Kcur_normed", il);
  6737. Kcur = ggml_rope_ext(
  6738. ctx0, Kcur, inp_pos, nullptr,
  6739. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  6740. ext_factor, attn_factor, beta_fast, beta_slow
  6741. );
  6742. cb(Qcur, "Qcur", il);
  6743. cb(Kcur, "Kcur", il);
  6744. cb(Vcur, "Vcur", il);
  6745. cur = build_attn(inp_attn,
  6746. model.layers[il].wo, model.layers[il].bo,
  6747. Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  6748. }
  6749. if (il == n_layer - 1 && inp_out_ids) {
  6750. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  6751. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  6752. }
  6753. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  6754. cb(ffn_inp, "ffn_inp", il);
  6755. // feed-forward network
  6756. cur = build_norm(ffn_inp,
  6757. model.layers[il].ffn_norm, NULL,
  6758. LLM_NORM_RMS, il);
  6759. cb(cur, "ffn_norm", il);
  6760. cur = build_ffn(cur,
  6761. model.layers[il].ffn_up, NULL, NULL,
  6762. model.layers[il].ffn_gate, NULL, NULL,
  6763. model.layers[il].ffn_down, NULL, NULL,
  6764. NULL,
  6765. LLM_FFN_SILU, LLM_FFN_PAR, il);
  6766. cb(cur, "ffn_out", il);
  6767. cur = ggml_add(ctx0, cur, ffn_inp);
  6768. cur = build_cvec(cur, il);
  6769. cb(cur, "l_out", il);
  6770. // input for next layer
  6771. inpL = cur;
  6772. }
  6773. cur = inpL;
  6774. cur = build_norm(cur,
  6775. model.output_norm, NULL,
  6776. LLM_NORM_RMS, -1);
  6777. cb(cur, "result_norm", -1);
  6778. res->t_embd = cur;
  6779. // lm_head
  6780. cur = build_lora_mm(model.output, cur);
  6781. cb(cur, "result_output", -1);
  6782. res->t_logits = cur;
  6783. ggml_build_forward_expand(gf, cur);
  6784. }
  6785. };
  6786. struct llm_build_qwen3moe : public llm_graph_context {
  6787. llm_build_qwen3moe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  6788. const int64_t n_embd_head = hparams.n_embd_head_v;
  6789. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  6790. GGML_ASSERT(n_embd_head == hparams.n_rot);
  6791. ggml_tensor * cur;
  6792. ggml_tensor * inpL;
  6793. inpL = build_inp_embd(model.tok_embd);
  6794. // inp_pos - contains the positions
  6795. ggml_tensor * inp_pos = build_inp_pos();
  6796. auto * inp_attn = build_attn_inp_kv_unified();
  6797. ggml_tensor * inp_out_ids = build_inp_out_ids();
  6798. for (int il = 0; il < n_layer; ++il) {
  6799. ggml_tensor * inpSA = inpL;
  6800. // norm
  6801. cur = build_norm(inpL,
  6802. model.layers[il].attn_norm, NULL,
  6803. LLM_NORM_RMS, il);
  6804. cb(cur, "attn_norm", il);
// self-attention
  6806. {
  6807. // compute Q and K and RoPE them
  6808. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  6809. cb(Qcur, "Qcur", il);
  6810. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  6811. cb(Kcur, "Kcur", il);
  6812. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  6813. cb(Vcur, "Vcur", il);
  6814. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  6815. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  6816. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  6817. Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
  6818. cb(Qcur, "Qcur_normed", il);
  6819. Qcur = ggml_rope_ext(
  6820. ctx0, Qcur, inp_pos, nullptr,
  6821. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  6822. ext_factor, attn_factor, beta_fast, beta_slow
  6823. );
  6824. Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
  6825. cb(Kcur, "Kcur_normed", il);
  6826. Kcur = ggml_rope_ext(
  6827. ctx0, Kcur, inp_pos, nullptr,
  6828. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  6829. ext_factor, attn_factor, beta_fast, beta_slow
  6830. );
  6831. cb(Qcur, "Qcur", il);
  6832. cb(Kcur, "Kcur", il);
  6833. cb(Vcur, "Vcur", il);
  6834. cur = build_attn(inp_attn,
  6835. model.layers[il].wo, model.layers[il].bo,
  6836. Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  6837. }
  6838. if (il == n_layer - 1 && inp_out_ids) {
  6839. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  6840. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  6841. }
  6842. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  6843. cb(ffn_inp, "ffn_inp", il);
  6844. // MoE branch
  6845. cur = build_norm(ffn_inp,
  6846. model.layers[il].ffn_norm, NULL,
  6847. LLM_NORM_RMS, il);
  6848. cb(cur, "ffn_norm", il);
  6849. ggml_tensor * moe_out =
  6850. build_moe_ffn(cur,
  6851. model.layers[il].ffn_gate_inp,
  6852. model.layers[il].ffn_up_exps,
  6853. model.layers[il].ffn_gate_exps,
  6854. model.layers[il].ffn_down_exps,
  6855. nullptr,
  6856. n_expert, n_expert_used,
  6857. LLM_FFN_SILU, true,
  6858. false, 0.0,
  6859. LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
  6860. il);
  6861. cb(moe_out, "ffn_moe_out", il);
  6862. cur = moe_out;
  6863. cur = ggml_add(ctx0, cur, ffn_inp);
  6864. cur = build_cvec(cur, il);
  6865. cb(cur, "l_out", il);
  6866. // input for next layer
  6867. inpL = cur;
  6868. }
  6869. cur = inpL;
  6870. cur = build_norm(cur,
  6871. model.output_norm, NULL,
  6872. LLM_NORM_RMS, -1);
  6873. cb(cur, "result_norm", -1);
  6874. res->t_embd = cur;
  6875. // lm_head
  6876. cur = build_lora_mm(model.output, cur);
  6877. cb(cur, "result_output", -1);
  6878. res->t_logits = cur;
  6879. ggml_build_forward_expand(gf, cur);
  6880. }
  6881. };
  6882. struct llm_build_phi2 : public llm_graph_context {
  6883. llm_build_phi2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  6884. const int64_t n_embd_head = hparams.n_embd_head_v;
  6885. const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
  6886. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  6887. ggml_tensor * cur;
  6888. ggml_tensor * attn_norm_output;
  6889. ggml_tensor * ffn_output;
  6890. ggml_tensor * inpL;
  6891. inpL = build_inp_embd(model.tok_embd);
  6892. // inp_pos - contains the positions
  6893. ggml_tensor * inp_pos = build_inp_pos();
  6894. auto * inp_attn = build_attn_inp_kv_unified();
  6895. ggml_tensor * inp_out_ids = build_inp_out_ids();
  6896. for (int il = 0; il < n_layer; ++il) {
  6897. attn_norm_output = build_norm(inpL,
  6898. model.layers[il].attn_norm,
  6899. model.layers[il].attn_norm_b,
  6900. LLM_NORM, il);
  6901. cb(attn_norm_output, "attn_norm", il);
  6902. // self-attention
  6903. {
  6904. ggml_tensor * Qcur = nullptr;
  6905. ggml_tensor * Kcur = nullptr;
  6906. ggml_tensor * Vcur = nullptr;
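// phi-2 checkpoints may pack the attention weights as a single fused QKV matrix or as separate Q/K/V projections; both layouts are handled here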
  6907. if (model.layers[il].wqkv) {
  6908. cur = build_lora_mm(model.layers[il].wqkv, attn_norm_output);
  6909. cb(cur, "wqkv", il);
  6910. cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
  6911. cb(cur, "bqkv", il);
  6912. Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
  6913. Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
  6914. Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
  6915. } else {
  6916. Qcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wq, attn_norm_output), model.layers[il].bq);
  6917. Kcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wk, attn_norm_output), model.layers[il].bk);
  6918. Vcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wv, attn_norm_output), model.layers[il].bv);
  6919. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  6920. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  6921. }
  6922. cb(Qcur, "Qcur", il);
  6923. cb(Kcur, "Kcur", il);
  6924. cb(Vcur, "Vcur", il);
  6925. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  6926. Qcur = ggml_rope_ext(
  6927. ctx0, Qcur, inp_pos, nullptr,
  6928. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  6929. ext_factor, attn_factor, beta_fast, beta_slow
  6930. );
  6931. Kcur = ggml_rope_ext(
  6932. ctx0, Kcur, inp_pos, nullptr,
  6933. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  6934. ext_factor, attn_factor, beta_fast, beta_slow
  6935. );
  6936. cb(Qcur, "Qcur", il);
  6937. cb(Kcur, "Kcur", il);
  6938. cb(Vcur, "Vcur", il);
  6939. // with phi2, we scale the Q to avoid precision issues
  6940. // ref: https://github.com/ml-explore/mlx-examples/blob/08e862336ade809bc37d1035f94b359e7d1a5152/phi2/phi2.py#L64-L66
  6941. Qcur = ggml_scale(ctx0, Qcur, 1.0f/sqrtf(float(n_embd_head)));
  6942. cur = build_attn(inp_attn,
  6943. model.layers[il].wo, model.layers[il].bo,
  6944. Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
  6945. }
  6946. if (il == n_layer - 1 && inp_out_ids) {
  6947. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  6948. inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
  6949. attn_norm_output = ggml_get_rows(ctx0, attn_norm_output, inp_out_ids);
  6950. }
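// parallel block: the FFN reads attn_norm_output (not the attention output); attention output, FFN output and the residual are summed below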
  6951. // FF
  6952. {
  6953. ffn_output = build_ffn(attn_norm_output,
  6954. model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
  6955. NULL, NULL, NULL,
  6956. model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
  6957. NULL,
  6958. LLM_FFN_GELU, LLM_FFN_SEQ, il);
  6959. cb(ffn_output, "ffn_out", il);
  6960. }
  6961. cur = ggml_add(ctx0, cur, ffn_output);
  6962. cur = ggml_add(ctx0, cur, inpL);
  6963. cur = build_cvec(cur, il);
  6964. cb(cur, "l_out", il);
  6965. // input for next layer
  6966. inpL = cur;
  6967. }
  6968. cur = build_norm(inpL,
  6969. model.output_norm,
  6970. model.output_norm_b,
  6971. LLM_NORM, -1);
  6972. cb(cur, "result_norm", -1);
  6973. res->t_embd = cur;
  6974. cur = build_lora_mm(model.output, cur);
  6975. cb(cur, "result_output_no_bias", -1);
  6976. cur = ggml_add(ctx0, cur, model.output_b);
  6977. cb(cur, "result_output", -1);
  6978. res->t_logits = cur;
  6979. ggml_build_forward_expand(gf, cur);
  6980. }
  6981. };
  6982. template<bool iswa>
  6983. struct llm_build_phi3 : public llm_graph_context {
  6984. llm_build_phi3(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  6985. const int64_t n_embd_head = hparams.n_embd_head_v;
  6986. const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
  6987. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  6988. ggml_tensor * cur;
  6989. ggml_tensor * inpL;
  6990. inpL = build_inp_embd(model.tok_embd);
  6991. // inp_pos - contains the positions
  6992. ggml_tensor * inp_pos = build_inp_pos();
  6993. using inp_attn_type = std::conditional_t<iswa, llm_graph_input_attn_kv_unified_iswa, llm_graph_input_attn_kv_unified>;
  6994. inp_attn_type * inp_attn = nullptr;
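// when iswa is set, the interleaved sliding-window (SWA) variant of the unified KV cache is used; otherwise the standard unified cache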
  6995. if constexpr (iswa) {
  6996. inp_attn = build_attn_inp_kv_unified_iswa();
  6997. } else {
  6998. inp_attn = build_attn_inp_kv_unified();
  6999. }
  7000. ggml_tensor * inp_out_ids = build_inp_out_ids();
  7001. for (int il = 0; il < n_layer; ++il) {
  7002. auto * residual = inpL;
  7003. // self-attention
  7004. {
  7005. // rope freq factors for 128k context
  7006. ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
ggml_tensor * attn_norm_output = build_norm(inpL,
  7008. model.layers[il].attn_norm,
  7009. model.layers[il].attn_norm_b,
  7010. LLM_NORM_RMS, il);
  7011. cb(attn_norm_output, "attn_norm", il);
  7012. ggml_tensor * Qcur = nullptr;
  7013. ggml_tensor * Kcur = nullptr;
  7014. ggml_tensor * Vcur = nullptr;
  7015. if (model.layers[il].wqkv) {
  7016. cur = build_lora_mm(model.layers[il].wqkv, attn_norm_output);
  7017. cb(cur, "wqkv", il);
  7018. Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head * sizeof(float), cur->nb[1], 0 * sizeof(float) * (n_embd));
  7019. Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head * sizeof(float), cur->nb[1], 1 * sizeof(float) * (n_embd));
  7020. Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1 * sizeof(float) * (n_embd + n_embd_gqa)));
  7021. } else {
  7022. Qcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wq, attn_norm_output), model.layers[il].bq);
  7023. Kcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wk, attn_norm_output), model.layers[il].bk);
  7024. Vcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wv, attn_norm_output), model.layers[il].bv);
  7025. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  7026. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  7027. }
  7028. cb(Qcur, "Qcur", il);
  7029. cb(Kcur, "Kcur", il);
  7030. cb(Vcur, "Vcur", il);
  7031. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  7032. Qcur = ggml_rope_ext(
  7033. ctx0, Qcur, inp_pos, rope_factors,
  7034. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  7035. ext_factor, attn_factor, beta_fast, beta_slow
  7036. );
  7037. Kcur = ggml_rope_ext(
  7038. ctx0, Kcur, inp_pos, rope_factors,
  7039. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  7040. ext_factor, attn_factor, beta_fast, beta_slow
  7041. );
  7042. cb(Qcur, "Qcur", il);
  7043. cb(Kcur, "Kcur", il);
  7044. cb(Vcur, "Vcur", il);
  7045. Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head)));
  7046. cb(Qcur, "Qcur", il);
  7047. cur = build_attn(inp_attn,
  7048. model.layers[il].wo, model.layers[il].bo,
  7049. Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
  7050. }
  7051. if (il == n_layer - 1 && inp_out_ids) {
  7052. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  7053. residual = ggml_get_rows(ctx0, residual, inp_out_ids);
  7054. }
  7055. cur = ggml_add(ctx0, cur, residual);
  7056. residual = cur;
  7057. cur = build_norm(cur,
  7058. model.layers[il].ffn_norm, model.layers[il].ffn_norm_b,
  7059. LLM_NORM_RMS, il);
  7060. cb(cur, "ffn_norm", il);
  7061. // feed-forward network
  7062. if (model.layers[il].ffn_gate_inp == nullptr) {
  7063. cur = build_ffn(cur,
  7064. model.layers[il].ffn_up, NULL, NULL,
  7065. NULL, NULL, NULL,
  7066. model.layers[il].ffn_down, NULL, NULL,
  7067. NULL,
  7068. LLM_FFN_SWIGLU, LLM_FFN_SEQ, il);
  7069. cb(cur, "ffn_out", il);
  7070. } else {
  7071. // MoE branch
  7072. cur = build_moe_ffn(cur,
  7073. model.layers[il].ffn_gate_inp,
  7074. model.layers[il].ffn_up_exps,
  7075. model.layers[il].ffn_gate_exps,
  7076. model.layers[il].ffn_down_exps,
  7077. nullptr,
  7078. n_expert, n_expert_used,
  7079. LLM_FFN_SILU, true,
  7080. false, 0.0,
  7081. LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
  7082. il);
  7083. cb(cur, "ffn_moe_out", il);
  7084. }
  7085. cur = ggml_add(ctx0, residual, cur);
  7086. cur = build_cvec(cur, il);
  7087. cb(cur, "l_out", il);
  7088. // input for next layer
  7089. inpL = cur;
  7090. }
  7091. cur = build_norm(inpL,
  7092. model.output_norm,
  7093. model.output_norm_b,
  7094. LLM_NORM_RMS, -1);
  7095. cb(cur, "result_norm", -1);
  7096. res->t_embd = cur;
  7097. cur = build_lora_mm(model.output, cur);
  7098. if (model.output_b != nullptr) {
  7099. cb(cur, "result_output_no_bias", -1);
  7100. cur = ggml_add(ctx0, cur, model.output_b);
  7101. }
  7102. cb(cur, "result_output", -1);
  7103. res->t_logits = cur;
  7104. ggml_build_forward_expand(gf, cur);
  7105. }
  7106. };
  7107. struct llm_build_plamo : public llm_graph_context {
  7108. llm_build_plamo(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  7109. const int64_t n_embd_head = hparams.n_embd_head_v;
  7110. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  7111. GGML_ASSERT(n_embd_head == hparams.n_rot);
  7112. ggml_tensor * cur;
  7113. ggml_tensor * inpL;
  7114. inpL = build_inp_embd(model.tok_embd);
  7115. // inp_pos - contains the positions
  7116. ggml_tensor * inp_pos = build_inp_pos();
  7117. auto * inp_attn = build_attn_inp_kv_unified();
  7118. ggml_tensor * inp_out_ids = build_inp_out_ids();
  7119. for (int il = 0; il < n_layer; ++il) {
  7120. // norm
  7121. cur = build_norm(inpL,
  7122. model.layers[il].attn_norm, NULL,
  7123. LLM_NORM_RMS, il);
  7124. cb(cur, "attn_norm", il);
  7125. ggml_tensor * sa_inp = cur;
  7126. // self-attention
  7127. {
  7128. // compute Q and K and RoPE them
  7129. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  7130. cb(Qcur, "Qcur", il);
  7131. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  7132. cb(Kcur, "Kcur", il);
  7133. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  7134. cb(Vcur, "Vcur", il);
  7135. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  7136. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  7137. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  7138. Qcur = ggml_rope_ext(
  7139. ctx0, Qcur, inp_pos, nullptr,
  7140. n_embd_head, rope_type, n_ctx_orig, freq_base, freq_scale,
  7141. ext_factor, attn_factor, beta_fast, beta_slow
  7142. );
  7143. Kcur = ggml_rope_ext(
  7144. ctx0, Kcur, inp_pos, nullptr,
  7145. n_embd_head, rope_type, n_ctx_orig, freq_base, freq_scale,
  7146. ext_factor, attn_factor, beta_fast, beta_slow
  7147. );
  7148. cb(Qcur, "Qcur", il);
  7149. cb(Kcur, "Kcur", il);
  7150. cb(Vcur, "Vcur", il);
  7151. cur = build_attn(inp_attn,
  7152. model.layers[il].wo, NULL,
  7153. Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  7154. }
  7155. if (il == n_layer - 1 && inp_out_ids) {
  7156. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  7157. sa_inp = ggml_get_rows(ctx0, sa_inp, inp_out_ids);
  7158. inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
  7159. }
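// PLaMo parallel layout: the FFN consumes the same normed input as attention (sa_inp); attention output, FFN output and the residual are summed below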
  7160. ggml_tensor * sa_out = cur;
  7161. cur = sa_inp;
  7162. // feed-forward network
  7163. {
  7164. cur = build_ffn(cur,
  7165. model.layers[il].ffn_up, NULL, NULL,
  7166. model.layers[il].ffn_gate, NULL, NULL,
  7167. model.layers[il].ffn_down, NULL, NULL,
  7168. NULL,
  7169. LLM_FFN_SILU, LLM_FFN_PAR, il);
  7170. cb(cur, "ffn_out", il);
  7171. }
  7172. cur = ggml_add(ctx0, cur, sa_out);
  7173. cur = ggml_add(ctx0, cur, inpL);
  7174. cur = build_cvec(cur, il);
  7175. cb(cur, "l_out", il);
  7176. // input for next layer
  7177. inpL = cur;
  7178. }
  7179. cur = inpL;
  7180. cur = build_norm(cur,
  7181. model.output_norm, NULL,
  7182. LLM_NORM_RMS, -1);
  7183. cb(cur, "result_norm", -1);
  7184. res->t_embd = cur;
  7185. // lm_head
  7186. cur = build_lora_mm(model.output, cur);
  7187. cb(cur, "result_output", -1);
  7188. res->t_logits = cur;
  7189. ggml_build_forward_expand(gf, cur);
  7190. }
  7191. };
  7192. struct llm_build_gpt2 : public llm_graph_context {
  7193. llm_build_gpt2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  7194. const int64_t n_embd_head = hparams.n_embd_head_v;
  7195. const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
  7196. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  7197. ggml_tensor * cur;
  7198. ggml_tensor * pos;
  7199. ggml_tensor * inpL;
  7200. inpL = build_inp_embd(model.tok_embd);
  7201. // inp_pos - contains the positions
  7202. ggml_tensor * inp_pos = build_inp_pos();
  7203. auto * inp_attn = build_attn_inp_kv_unified();
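// GPT-2 uses learned absolute position embeddings, added to the token embeddings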
  7204. pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos);
  7205. cb(pos, "pos_embd", -1);
  7206. inpL = ggml_add(ctx0, inpL, pos);
  7207. cb(inpL, "inpL", -1);
  7208. ggml_tensor * inp_out_ids = build_inp_out_ids();
  7209. for (int il = 0; il < n_layer; ++il) {
  7210. cur = build_norm(inpL,
  7211. model.layers[il].attn_norm,
  7212. model.layers[il].attn_norm_b,
  7213. LLM_NORM, il);
  7214. cb(cur, "attn_norm", il);
  7215. // self-attention
  7216. {
  7217. cur = build_lora_mm(model.layers[il].wqkv, cur);
  7218. cb(cur, "wqkv", il);
  7219. cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
  7220. cb(cur, "bqkv", il);
  7221. ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
  7222. ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
  7223. ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
  7224. cb(Qcur, "Qcur", il);
  7225. cb(Kcur, "Kcur", il);
  7226. cb(Vcur, "Vcur", il);
  7227. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  7228. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  7229. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  7230. cur = build_attn(inp_attn,
  7231. model.layers[il].wo, model.layers[il].bo,
  7232. Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  7233. }
  7234. if (il == n_layer - 1 && inp_out_ids) {
  7235. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  7236. inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
  7237. }
  7238. // add the input
  7239. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
  7240. cb(ffn_inp, "ffn_inp", il);
  7241. // FF
  7242. {
  7243. cur = build_norm(ffn_inp,
  7244. model.layers[il].ffn_norm,
  7245. model.layers[il].ffn_norm_b,
  7246. LLM_NORM, il);
  7247. cb(cur, "ffn_norm", il);
  7248. cur = build_ffn(cur,
  7249. model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
  7250. NULL, NULL, NULL,
  7251. model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
  7252. NULL,
  7253. LLM_FFN_GELU, LLM_FFN_SEQ, il);
  7254. cb(cur, "ffn_out", il);
  7255. }
  7256. cur = ggml_add(ctx0, cur, ffn_inp);
  7257. cur = build_cvec(cur, il);
  7258. cb(cur, "l_out", il);
  7259. // input for next layer
  7260. inpL = cur;
  7261. }
  7262. cur = build_norm(inpL,
  7263. model.output_norm,
  7264. model.output_norm_b,
  7265. LLM_NORM, -1);
  7266. cb(cur, "result_norm", -1);
  7267. res->t_embd = cur;
  7268. cur = build_lora_mm(model.output, cur);
  7269. cb(cur, "result_output", -1);
  7270. res->t_logits = cur;
  7271. ggml_build_forward_expand(gf, cur);
  7272. }
  7273. };
  7274. struct llm_build_codeshell : public llm_graph_context {
  7275. llm_build_codeshell(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  7276. const int64_t n_embd_head = hparams.n_embd_head_v;
  7277. const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
  7278. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  7279. GGML_ASSERT(n_embd_head == hparams.n_rot);
  7280. ggml_tensor * cur;
  7281. ggml_tensor * inpL;
  7282. inpL = build_inp_embd(model.tok_embd);
  7283. // inp_pos - contains the positions
  7284. ggml_tensor * inp_pos = build_inp_pos();
  7285. auto * inp_attn = build_attn_inp_kv_unified();
  7286. ggml_tensor * inp_out_ids = build_inp_out_ids();
  7287. for (int il = 0; il < n_layer; ++il) {
  7288. cur = build_norm(inpL,
  7289. model.layers[il].attn_norm,
  7290. model.layers[il].attn_norm_b,
  7291. LLM_NORM, il);
  7292. cb(cur, "attn_norm", il);
  7293. // self-attention
  7294. {
  7295. cur = build_lora_mm(model.layers[il].wqkv, cur);
  7296. cb(cur, "wqkv", il);
  7297. cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
  7298. cb(cur, "bqkv", il);
  7299. ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
  7300. ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
  7301. ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
  7302. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  7303. Qcur = ggml_rope_ext(
  7304. ctx0, Qcur, inp_pos, nullptr,
  7305. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  7306. ext_factor, attn_factor, beta_fast, beta_slow
  7307. );
  7308. Kcur = ggml_rope_ext(
  7309. ctx0, Kcur, inp_pos, nullptr,
  7310. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  7311. ext_factor, attn_factor, beta_fast, beta_slow
  7312. );
  7313. cb(Qcur, "Qcur", il);
  7314. cb(Kcur, "Kcur", il);
  7315. cb(Vcur, "Vcur", il);
  7316. cur = build_attn(inp_attn,
  7317. model.layers[il].wo, model.layers[il].bo,
  7318. Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  7319. }
  7320. if (il == n_layer - 1 && inp_out_ids) {
  7321. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  7322. inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
  7323. }
  7324. // add the input
  7325. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
  7326. cb(ffn_inp, "ffn_inp", il);
  7327. // FF
  7328. {
  7329. cur = build_norm(ffn_inp,
  7330. model.layers[il].ffn_norm,
  7331. model.layers[il].ffn_norm_b,
  7332. LLM_NORM, il);
  7333. cb(cur, "ffn_norm", il);
  7334. cur = build_ffn(cur,
  7335. model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
  7336. NULL, NULL, NULL,
  7337. model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
  7338. NULL,
  7339. LLM_FFN_GELU, LLM_FFN_SEQ, il);
  7340. cb(cur, "ffn_out", il);
  7341. }
  7342. cur = ggml_add(ctx0, cur, ffn_inp);
  7343. cur = build_cvec(cur, il);
  7344. cb(cur, "l_out", il);
  7345. // input for next layer
  7346. inpL = cur;
  7347. }
  7348. cur = build_norm(inpL,
  7349. model.output_norm,
  7350. model.output_norm_b,
  7351. LLM_NORM, -1);
  7352. cb(cur, "result_norm", -1);
  7353. res->t_embd = cur;
  7354. cur = build_lora_mm(model.output, cur);
  7355. cb(cur, "result_output", -1);
  7356. res->t_logits = cur;
  7357. ggml_build_forward_expand(gf, cur);
  7358. }
  7359. };
  7360. struct llm_build_orion : public llm_graph_context {
  7361. llm_build_orion(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  7362. const int64_t n_embd_head = hparams.n_embd_head_v;
  7363. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  7364. GGML_ASSERT(n_embd_head == hparams.n_rot);
  7365. ggml_tensor * cur;
  7366. ggml_tensor * inpL;
  7367. inpL = build_inp_embd(model.tok_embd);
  7368. // inp_pos - contains the positions
  7369. ggml_tensor * inp_pos = build_inp_pos();
  7370. auto * inp_attn = build_attn_inp_kv_unified();
  7371. ggml_tensor * inp_out_ids = build_inp_out_ids();
  7372. for (int il = 0; il < n_layer; ++il) {
  7373. ggml_tensor * inpSA = inpL;
  7374. // norm
  7375. cur = build_norm(inpL,
  7376. model.layers[il].attn_norm, model.layers[il].attn_norm_b,
  7377. LLM_NORM, il);
  7378. cb(cur, "attn_norm", il);
  7379. // self-attention
  7380. {
  7381. // compute Q and K and RoPE them
  7382. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  7383. cb(Qcur, "Qcur", il);
  7384. // if (model.layers[il].bq) {
  7385. // Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
  7386. // cb(Qcur, "Qcur", il);
  7387. // }
  7388. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  7389. cb(Kcur, "Kcur", il);
  7390. // if (model.layers[il].bk) {
  7391. // Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
  7392. // cb(Kcur, "Kcur", il);
  7393. // }
  7394. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  7395. cb(Vcur, "Vcur", il);
  7396. // if (model.layers[il].bv) {
  7397. // Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
  7398. // cb(Vcur, "Vcur", il);
  7399. // }
  7400. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  7401. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  7402. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  7403. Qcur = ggml_rope_ext(
  7404. ctx0, Qcur, inp_pos, nullptr,
  7405. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  7406. ext_factor, attn_factor, beta_fast, beta_slow
  7407. );
  7408. Kcur = ggml_rope_ext(
  7409. ctx0, Kcur, inp_pos, nullptr,
  7410. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  7411. ext_factor, attn_factor, beta_fast, beta_slow
  7412. );
  7413. cb(Qcur, "Qcur", il);
  7414. cb(Kcur, "Kcur", il);
  7415. cb(Vcur, "Vcur", il);
  7416. cur = build_attn(inp_attn,
  7417. model.layers[il].wo, NULL,
  7418. Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  7419. }
  7420. if (il == n_layer - 1 && inp_out_ids) {
  7421. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  7422. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  7423. }
  7424. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  7425. cb(ffn_inp, "ffn_inp", il);
  7426. // feed-forward network
  7427. cur = build_norm(ffn_inp,
  7428. model.layers[il].ffn_norm, model.layers[il].ffn_norm_b,
  7429. LLM_NORM, il);
  7430. cb(cur, "ffn_norm", il);
  7431. cur = build_ffn(cur,
  7432. model.layers[il].ffn_up, NULL, NULL,
  7433. model.layers[il].ffn_gate, NULL, NULL,
  7434. model.layers[il].ffn_down, NULL, NULL,
  7435. NULL,
  7436. LLM_FFN_SILU, LLM_FFN_PAR, il);
  7437. cb(cur, "ffn_out", il);
  7438. cur = ggml_add(ctx0, cur, ffn_inp);
  7439. cur = build_cvec(cur, il);
  7440. cb(cur, "l_out", il);
  7441. // input for next layer
  7442. inpL = cur;
  7443. }
  7444. cur = inpL;
  7445. cur = build_norm(cur,
  7446. model.output_norm, model.output_norm_b,
  7447. LLM_NORM, -1);
  7448. cb(cur, "result_norm", -1);
  7449. res->t_embd = cur;
  7450. // lm_head
  7451. cur = build_lora_mm(model.output, cur);
  7452. cb(cur, "result_output", -1);
  7453. res->t_logits = cur;
  7454. ggml_build_forward_expand(gf, cur);
  7455. }
  7456. };
  7457. struct llm_build_internlm2 : public llm_graph_context {
  7458. llm_build_internlm2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  7459. const int64_t n_embd_head = hparams.n_embd_head_v;
  7460. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  7461. GGML_ASSERT(n_embd_head == hparams.n_rot);
  7462. ggml_tensor * cur;
  7463. ggml_tensor * inpL;
  7464. inpL = build_inp_embd(model.tok_embd);
  7465. // inp_pos - contains the positions
  7466. ggml_tensor * inp_pos = build_inp_pos();
  7467. auto * inp_attn = build_attn_inp_kv_unified();
  7468. ggml_tensor * inp_out_ids = build_inp_out_ids();
  7469. for (int il = 0; il < n_layer; ++il) {
  7470. ggml_tensor * inpSA = inpL;
  7471. // norm
  7472. cur = build_norm(inpL,
  7473. model.layers[il].attn_norm, NULL,
  7474. LLM_NORM_RMS, il);
  7475. cb(cur, "attn_norm", il);
  7476. // self-attention
  7477. {
  7478. // compute Q and K and RoPE them
  7479. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  7480. cb(Qcur, "Qcur", il);
  7481. if (model.layers[il].bq) {
  7482. Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
  7483. cb(Qcur, "Qcur", il);
  7484. }
  7485. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  7486. cb(Kcur, "Kcur", il);
  7487. if (model.layers[il].bk) {
  7488. Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
  7489. cb(Kcur, "Kcur", il);
  7490. }
  7491. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  7492. cb(Vcur, "Vcur", il);
  7493. if (model.layers[il].bv) {
  7494. Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
  7495. cb(Vcur, "Vcur", il);
  7496. }
  7497. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  7498. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  7499. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  7500. Qcur = ggml_rope_ext(
  7501. ctx0, Qcur, inp_pos, nullptr,
  7502. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  7503. ext_factor, attn_factor, beta_fast, beta_slow
  7504. );
  7505. Kcur = ggml_rope_ext(
  7506. ctx0, Kcur, inp_pos, nullptr,
  7507. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  7508. ext_factor, attn_factor, beta_fast, beta_slow
  7509. );
  7510. cb(Qcur, "Qcur", il);
  7511. cb(Kcur, "Kcur", il);
  7512. cb(Vcur, "Vcur", il);
  7513. cur = build_attn(inp_attn,
  7514. model.layers[il].wo, model.layers[il].bo,
  7515. Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  7516. }
  7517. if (il == n_layer - 1 && inp_out_ids) {
  7518. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  7519. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  7520. }
  7521. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  7522. cb(ffn_inp, "ffn_inp", il);
  7523. // feed-forward network
  7524. cur = build_norm(ffn_inp,
  7525. model.layers[il].ffn_norm, NULL,
  7526. LLM_NORM_RMS, il);
  7527. cb(cur, "ffn_norm", il);
  7528. cur = build_ffn(cur,
  7529. model.layers[il].ffn_up, NULL, NULL,
  7530. model.layers[il].ffn_gate, NULL, NULL,
  7531. model.layers[il].ffn_down, NULL, NULL,
  7532. NULL,
  7533. LLM_FFN_SILU, LLM_FFN_PAR, il);
  7534. cb(cur, "ffn_out", il);
  7535. cur = ggml_add(ctx0, cur, ffn_inp);
  7536. cur = build_cvec(cur, il);
  7537. cb(cur, "l_out", il);
  7538. // input for next layer
  7539. inpL = cur;
  7540. }
  7541. cur = inpL;
  7542. cur = build_norm(cur,
  7543. model.output_norm, NULL,
  7544. LLM_NORM_RMS, -1);
  7545. cb(cur, "result_norm", -1);
  7546. res->t_embd = cur;
  7547. // lm_head
  7548. cur = build_lora_mm(model.output, cur);
  7549. cb(cur, "result_output", -1);
  7550. res->t_logits = cur;
  7551. ggml_build_forward_expand(gf, cur);
  7552. }
  7553. };
  7554. struct llm_build_minicpm3 : public llm_graph_context {
  7555. llm_build_minicpm3(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  7556. //TODO: if the model varies, these parameters need to be read from the model
  7557. const int64_t n_embd_base = 256;
  7558. const float scale_embd = 12.0f;
  7559. const float scale_depth = 1.4f;
  7560. const float kq_scale = 1.0f / sqrtf(float(hparams.n_embd_head_k));
  7561. const uint32_t n_embd_head_qk_rope = hparams.n_rot;
  7562. const uint32_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
  7563. const uint32_t kv_lora_rank = hparams.n_lora_kv;
  7564. ggml_tensor * cur;
  7565. ggml_tensor * inpL;
  7566. inpL = build_inp_embd(model.tok_embd);
  7567. // scale the input embeddings
  7568. inpL = ggml_scale(ctx0, inpL, scale_embd);
  7569. cb(inpL, "inp_scaled", -1);
  7570. // inp_pos - contains the positions
  7571. ggml_tensor * inp_pos = build_inp_pos();
  7572. auto * inp_attn = build_attn_inp_kv_unified();
  7573. ggml_tensor * inp_out_ids = build_inp_out_ids();
  7574. for (int il = 0; il < n_layer; ++il) {
  7575. ggml_tensor * inpSA = inpL;
  7576. ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
  7577. // norm
  7578. cur = build_norm(inpL,
  7579. model.layers[il].attn_norm, NULL,
  7580. LLM_NORM_RMS, il);
  7581. cb(cur, "attn_norm", il);
// self-attention
  7583. {
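// MLA-style low-rank attention: Q and KV are first projected down to compact latents (q_lora_rank, kv_lora_rank), RMS-normalized, up-projected, and split into RoPE ("pe") and non-RoPE ("nope") parts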
  7584. ggml_tensor * q = NULL;
  7585. // {n_embd, q_lora_rank} * {n_embd, n_tokens} -> {q_lora_rank, n_tokens}
  7586. q = ggml_mul_mat(ctx0, model.layers[il].wq_a, cur);
  7587. cb(q, "q", il);
  7588. q = build_norm(q,
  7589. model.layers[il].attn_q_a_norm, NULL,
  7590. LLM_NORM_RMS, il);
  7591. cb(q, "q", il);
  7592. // {q_lora_rank, n_head * hparams.n_embd_head_k} * {q_lora_rank, n_tokens} -> {n_head * hparams.n_embd_head_k, n_tokens}
  7593. q = ggml_mul_mat(ctx0, model.layers[il].wq_b, q);
  7594. cb(q, "q", il);
  7595. // split into {n_head * n_embd_head_qk_nope, n_tokens}
  7596. ggml_tensor * q_nope = ggml_view_3d(ctx0, q, n_embd_head_qk_nope, n_head, n_tokens,
  7597. ggml_row_size(q->type, hparams.n_embd_head_k),
  7598. ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
  7599. 0);
  7600. cb(q_nope, "q_nope", il);
  7601. // and {n_head * n_embd_head_qk_rope, n_tokens}
  7602. ggml_tensor * q_pe = ggml_view_3d(ctx0, q, n_embd_head_qk_rope, n_head, n_tokens,
  7603. ggml_row_size(q->type, hparams.n_embd_head_k),
  7604. ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
  7605. ggml_row_size(q->type, n_embd_head_qk_nope));
  7606. cb(q_pe, "q_pe", il);
  7607. // {n_embd, kv_lora_rank + n_embd_head_qk_rope} * {n_embd, n_tokens} -> {kv_lora_rank + n_embd_head_qk_rope, n_tokens}
ggml_tensor * kv_pe_compressed = ggml_mul_mat(ctx0, model.layers[il].wkv_a_mqa, cur);
cb(kv_pe_compressed, "kv_pe_compressed", il);
// split into {kv_lora_rank, n_tokens}
ggml_tensor * kv_compressed = ggml_view_2d(ctx0, kv_pe_compressed, kv_lora_rank, n_tokens,
kv_pe_compressed->nb[1],
0);
cb(kv_compressed, "kv_compressed", il);
// and {n_embd_head_qk_rope, n_tokens}
ggml_tensor * k_pe = ggml_view_3d(ctx0, kv_pe_compressed, n_embd_head_qk_rope, 1, n_tokens,
kv_pe_compressed->nb[1],
kv_pe_compressed->nb[1],
ggml_row_size(kv_pe_compressed->type, kv_lora_rank));
  7620. cb(k_pe, "k_pe", il);
  7621. kv_compressed = build_norm(kv_compressed,
  7622. model.layers[il].attn_kv_a_norm, NULL,
  7623. LLM_NORM_RMS, il);
  7624. cb(kv_compressed, "kv_compressed", il);
  7625. // {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)} * {kv_lora_rank, n_tokens} -> {n_head * (n_embd_head_qk_nope + n_embd_head_v), n_tokens}
  7626. ggml_tensor * kv = ggml_mul_mat(ctx0, model.layers[il].wkv_b, kv_compressed);
  7627. cb(kv, "kv", il);
  7628. // split into {n_head * n_embd_head_qk_nope, n_tokens}
  7629. ggml_tensor * k_nope = ggml_view_3d(ctx0, kv, n_embd_head_qk_nope, n_head, n_tokens,
  7630. ggml_row_size(kv->type, n_embd_head_qk_nope + hparams.n_embd_head_v),
  7631. ggml_row_size(kv->type, n_head * (n_embd_head_qk_nope + hparams.n_embd_head_v)),
  7632. 0);
  7633. cb(k_nope, "k_nope", il);
  7634. // and {n_head * n_embd_head_v, n_tokens}
  7635. ggml_tensor * v_states = ggml_view_3d(ctx0, kv, hparams.n_embd_head_v, n_head, n_tokens,
  7636. ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)),
  7637. ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)*n_head),
  7638. ggml_row_size(kv->type, (n_embd_head_qk_nope)));
  7639. cb(v_states, "v_states", il);
  7640. v_states = ggml_cont(ctx0, v_states);
  7641. cb(v_states, "v_states", il);
  7642. q_pe = ggml_rope_ext(
  7643. ctx0, q_pe, inp_pos, rope_factors,
  7644. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  7645. ext_factor, attn_factor, beta_fast, beta_slow
  7646. );
  7647. cb(q_pe, "q_pe", il);
  7648. // shared RoPE key
  7649. k_pe = ggml_rope_ext(
  7650. ctx0, k_pe, inp_pos, rope_factors,
  7651. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  7652. ext_factor, attn_factor, beta_fast, beta_slow
  7653. );
  7654. cb(k_pe, "k_pe", il);
  7655. ggml_tensor * q_states = ggml_concat(ctx0, q_nope, q_pe, 0);
  7656. cb(q_states, "q_states", il);
  7657. ggml_tensor * k_states = ggml_concat(ctx0, k_nope, ggml_repeat(ctx0, k_pe, q_pe), 0);
  7658. cb(k_states, "k_states", il);
  7659. cur = build_attn(inp_attn,
  7660. model.layers[il].wo, NULL,
  7661. q_states, k_states, v_states, nullptr, nullptr, kq_scale, il);
  7662. }
  7663. if (il == n_layer - 1 && inp_out_ids) {
  7664. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  7665. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  7666. }
  7667. // scale_res - scale the hidden states for residual connection
  7668. const float scale_res = scale_depth/sqrtf(float(n_layer)); // TODO: is this correct?
  7669. cur = ggml_scale(ctx0, cur, scale_res);
  7670. cb(cur, "hidden_scaled", il);
  7671. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  7672. cb(ffn_inp, "ffn_inp", il);
  7673. // feed-forward network
  7674. {
  7675. cur = build_norm(ffn_inp,
  7676. model.layers[il].ffn_norm, NULL,
  7677. LLM_NORM_RMS, il);
  7678. cb(cur, "ffn_norm", il);
  7679. cur = build_ffn(cur,
  7680. model.layers[il].ffn_up, NULL, NULL,
  7681. model.layers[il].ffn_gate, NULL, NULL,
  7682. model.layers[il].ffn_down, NULL, NULL,
  7683. NULL,
  7684. LLM_FFN_SILU, LLM_FFN_PAR, il);
  7685. cb(cur, "ffn_out", il);
  7686. }
  7687. // scale the hidden states for residual connection
  7688. cur = ggml_scale(ctx0, cur, scale_res);
  7689. cb(cur, "hidden_scaled_ffn", il);
  7690. cur = ggml_add(ctx0, cur, ffn_inp);
  7691. cur = build_cvec(cur, il);
  7692. cb(cur, "l_out", il);
  7693. // input for next layer
  7694. inpL = cur;
  7695. }
  7696. cur = inpL;
  7697. cur = build_norm(cur,
  7698. model.output_norm, NULL,
  7699. LLM_NORM_RMS, -1);
  7700. cb(cur, "result_norm", -1);
  7701. res->t_embd = cur;
  7702. // lm_head scaling
  7703. const float scale_lmhead = float(n_embd_base)/float(n_embd);
  7704. cur = ggml_scale(ctx0, cur, scale_lmhead);
  7705. cb(cur, "lmhead_scaling", -1);
  7706. // lm_head
  7707. cur = build_lora_mm(model.output, cur);
  7708. cb(cur, "result_output", -1);
  7709. res->t_logits = cur;
  7710. ggml_build_forward_expand(gf, cur);
  7711. }
  7712. };
  7713. struct llm_build_gemma : public llm_graph_context {
  7714. llm_build_gemma(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  7715. const int64_t n_embd_head = hparams.n_embd_head_v;
  7716. ggml_tensor * cur;
  7717. ggml_tensor * inpL;
  7718. inpL = build_inp_embd(model.tok_embd);
  7719. inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd));
  7720. cb(inpL, "inp_scaled", -1);
  7721. // inp_pos - contains the positions
  7722. ggml_tensor * inp_pos = build_inp_pos();
  7723. auto * inp_attn = build_attn_inp_kv_unified();
  7724. ggml_tensor * inp_out_ids = build_inp_out_ids();
  7725. for (int il = 0; il < n_layer; ++il) {
  7726. // norm
  7727. cur = build_norm(inpL,
  7728. model.layers[il].attn_norm, NULL,
  7729. LLM_NORM_RMS, il);
  7730. cb(cur, "attn_norm", il);
  7731. // self-attention
  7732. {
  7733. // compute Q and K and RoPE them
  7734. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  7735. cb(Qcur, "Qcur", il);
  7736. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  7737. cb(Kcur, "Kcur", il);
  7738. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  7739. cb(Vcur, "Vcur", il);
  7740. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  7741. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  7742. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  7743. Qcur = ggml_rope_ext(
  7744. ctx0, Qcur, inp_pos, nullptr,
  7745. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  7746. ext_factor, attn_factor, beta_fast, beta_slow);
  7747. Kcur = ggml_rope_ext(
  7748. ctx0, Kcur, inp_pos, nullptr,
  7749. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  7750. ext_factor, attn_factor, beta_fast, beta_slow);
  7751. cb(Qcur, "Qcur", il);
  7752. cb(Kcur, "Kcur", il);
  7753. cb(Vcur, "Vcur", il);
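// note: the 1/sqrt(n_embd_head) scaling is folded into Q here, so build_attn below is called with kq_scale = 1.0f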
  7754. Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head)));
  7755. cb(Qcur, "Qcur_scaled", il);
  7756. cur = build_attn(inp_attn,
  7757. model.layers[il].wo, NULL,
  7758. Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
  7759. }
  7760. if (il == n_layer - 1 && inp_out_ids) {
  7761. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  7762. inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
  7763. }
  7764. ggml_tensor * sa_out = ggml_add(ctx0, cur, inpL);
  7765. cb(sa_out, "sa_out", il);
  7766. cur = build_norm(sa_out,
  7767. model.layers[il].ffn_norm, NULL,
  7768. LLM_NORM_RMS, il);
  7769. cb(cur, "ffn_norm", il);
  7770. // feed-forward network
  7771. {
  7772. cur = build_ffn(cur,
  7773. model.layers[il].ffn_up, NULL, NULL,
  7774. model.layers[il].ffn_gate, NULL, NULL,
  7775. model.layers[il].ffn_down, NULL, NULL,
  7776. NULL,
  7777. LLM_FFN_GELU, LLM_FFN_PAR, il);
  7778. cb(cur, "ffn_out", il);
  7779. }
  7780. cur = ggml_add(ctx0, cur, sa_out);
  7781. cur = build_cvec(cur, il);
  7782. cb(cur, "l_out", il);
  7783. // input for next layer
  7784. inpL = cur;
  7785. }
  7786. cur = inpL;
  7787. cur = build_norm(cur,
  7788. model.output_norm, NULL,
  7789. LLM_NORM_RMS, -1);
  7790. cb(cur, "result_norm", -1);
  7791. res->t_embd = cur;
  7792. // lm_head
  7793. cur = build_lora_mm(model.output, cur);
  7794. cb(cur, "result_output", -1);
  7795. res->t_logits = cur;
  7796. ggml_build_forward_expand(gf, cur);
  7797. }
  7798. };
  7799. struct llm_build_gemma2_iswa : public llm_graph_context {
  7800. llm_build_gemma2_iswa(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  7801. const int64_t n_embd_head = hparams.n_embd_head_k;
  7802. ggml_tensor * cur;
  7803. ggml_tensor * inpL;
  7804. inpL = build_inp_embd(model.tok_embd);
  7805. inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd));
  7806. cb(inpL, "inp_scaled", -1);
  7807. // inp_pos - contains the positions
  7808. ggml_tensor * inp_pos = build_inp_pos();
  7809. auto * inp_attn = build_attn_inp_kv_unified_iswa();
  7810. ggml_tensor * inp_out_ids = build_inp_out_ids();
  7811. for (int il = 0; il < n_layer; ++il) {
  7812. // norm
  7813. cur = build_norm(inpL,
  7814. model.layers[il].attn_norm, NULL,
  7815. LLM_NORM_RMS, il);
  7816. cb(cur, "attn_norm", il);
  7817. // self-attention
  7818. {
  7819. // compute Q and K and RoPE them
  7820. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  7821. cb(Qcur, "Qcur", il);
  7822. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  7823. cb(Kcur, "Kcur", il);
  7824. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  7825. cb(Vcur, "Vcur", il);
  7826. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  7827. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  7828. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  7829. Qcur = ggml_rope_ext(
  7830. ctx0, Qcur, inp_pos, nullptr,
  7831. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  7832. ext_factor, attn_factor, beta_fast, beta_slow);
  7833. Kcur = ggml_rope_ext(
  7834. ctx0, Kcur, inp_pos, nullptr,
  7835. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  7836. ext_factor, attn_factor, beta_fast, beta_slow);
  7837. cb(Qcur, "Qcur", il);
  7838. cb(Kcur, "Kcur", il);
  7839. cb(Vcur, "Vcur", il);
  7840. Qcur = ggml_scale(ctx0, Qcur, hparams.f_attention_scale);
  7841. cur = build_attn(inp_attn,
  7842. model.layers[il].wo, NULL,
  7843. Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
  7844. }
  7845. if (il == n_layer - 1 && inp_out_ids) {
  7846. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  7847. inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
  7848. }
  7849. cur = build_norm(cur,
  7850. model.layers[il].attn_post_norm, NULL,
  7851. LLM_NORM_RMS, il);
  7852. cb(cur, "attn_post_norm", il);
  7853. ggml_tensor * sa_out = ggml_add(ctx0, cur, inpL);
  7854. cb(sa_out, "sa_out", il);
  7855. cur = build_norm(sa_out,
  7856. model.layers[il].ffn_norm, NULL,
  7857. LLM_NORM_RMS, il);
  7858. cb(cur, "ffn_norm", il);
  7859. // feed-forward network
  7860. {
  7861. cur = build_ffn(cur,
  7862. model.layers[il].ffn_up, NULL, NULL,
  7863. model.layers[il].ffn_gate, NULL, NULL,
  7864. model.layers[il].ffn_down, NULL, NULL,
  7865. NULL,
  7866. LLM_FFN_GELU, LLM_FFN_PAR, il);
  7867. cb(cur, "ffn_out", il);
  7868. }
  7869. cur = build_norm(cur,
  7870. model.layers[il].ffn_post_norm, NULL,
  7871. LLM_NORM_RMS, -1);
  7872. cb(cur, "ffn_post_norm", -1);
  7873. cur = ggml_add(ctx0, cur, sa_out);
  7874. cur = build_cvec(cur, il);
  7875. cb(cur, "l_out", il);
  7876. // input for next layer
  7877. inpL = cur;
  7878. }
  7879. cur = inpL;
  7880. cur = build_norm(cur,
  7881. model.output_norm, NULL,
  7882. LLM_NORM_RMS, -1);
  7883. cb(cur, "result_norm", -1);
  7884. res->t_embd = cur;
  7885. // lm_head
  7886. cur = build_lora_mm(model.output, cur);
  7887. // final logit soft-capping
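// i.e. logits = f_final_logit_softcapping * tanh(logits / f_final_logit_softcapping)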
  7888. cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_final_logit_softcapping);
  7889. cur = ggml_tanh(ctx0, cur);
  7890. cur = ggml_scale(ctx0, cur, hparams.f_final_logit_softcapping);
  7891. cb(cur, "result_output", -1);
  7892. res->t_logits = cur;
  7893. ggml_build_forward_expand(gf, cur);
  7894. }
  7895. };
  7896. struct llm_build_gemma3_iswa : public llm_graph_context {
  7897. llm_build_gemma3_iswa(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  7898. const int64_t n_embd_head = hparams.n_embd_head_k;
  7899. ggml_tensor * cur;
  7900. ggml_tensor * inpL;
  7901. inpL = build_inp_embd(model.tok_embd);
// important: do not normalize weights for raw embeddings input (i.e. encoded image embeddings)
  7903. if (ubatch.token) {
  7904. inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd));
  7905. cb(inpL, "inp_scaled", -1);
  7906. }
  7907. // inp_pos - contains the positions
  7908. ggml_tensor * inp_pos = build_inp_pos();
  7909. // TODO: is causal == true correct? might need some changes
  7910. auto * inp_attn = build_attn_inp_kv_unified_iswa();
  7911. ggml_tensor * inp_out_ids = build_inp_out_ids();
  7912. for (int il = 0; il < n_layer; ++il) {
  7913. const float freq_base_l = model.get_rope_freq_base (cparams, il);
  7914. const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
  7915. // norm
  7916. cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
  7917. cb(cur, "attn_norm", il);
  7918. // self-attention
  7919. {
  7920. // compute Q and K and RoPE them
  7921. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  7922. cb(Qcur, "Qcur", il);
  7923. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  7924. cb(Kcur, "Kcur", il);
  7925. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  7926. cb(Vcur, "Vcur", il);
  7927. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  7928. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  7929. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
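// QK-norm: RMS-normalize Q and K per head before applying RoPE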
  7930. Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
  7931. cb(Qcur, "Qcur_normed", il);
  7932. Qcur = ggml_rope_ext(
  7933. ctx0, Qcur, inp_pos, nullptr,
  7934. n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
  7935. ext_factor, attn_factor, beta_fast, beta_slow);
  7936. Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
  7937. cb(Kcur, "Kcur_normed", il);
  7938. Kcur = ggml_rope_ext(
  7939. ctx0, Kcur, inp_pos, nullptr,
  7940. n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
  7941. ext_factor, attn_factor, beta_fast, beta_slow);
  7942. cb(Qcur, "Qcur", il);
  7943. cb(Kcur, "Kcur", il);
  7944. cb(Vcur, "Vcur", il);
  7945. // ref: https://github.com/google/gemma_pytorch/blob/014acb7ac4563a5f77c76d7ff98f31b568c16508/gemma/model.py#L315
  7946. Qcur = ggml_scale(ctx0, Qcur, hparams.f_attention_scale);
  7947. cur = build_attn(inp_attn,
  7948. model.layers[il].wo, NULL,
  7949. Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
  7950. }
  7951. if (il == n_layer - 1 && inp_out_ids) {
  7952. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  7953. inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
  7954. }
  7955. cur = build_norm(cur,
  7956. model.layers[il].attn_post_norm, NULL,
  7957. LLM_NORM_RMS, il);
  7958. cb(cur, "attn_post_norm", il);
  7959. ggml_tensor * sa_out = ggml_add(ctx0, cur, inpL);
  7960. cb(sa_out, "sa_out", il);
  7961. cur = build_norm(sa_out,
  7962. model.layers[il].ffn_norm, NULL,
  7963. LLM_NORM_RMS, il);
  7964. cb(cur, "ffn_norm", il);
  7965. // feed-forward network
  7966. {
  7967. cur = build_ffn(cur,
  7968. model.layers[il].ffn_up, NULL, NULL,
  7969. model.layers[il].ffn_gate, NULL, NULL,
  7970. model.layers[il].ffn_down, NULL, NULL,
  7971. NULL,
  7972. LLM_FFN_GELU, LLM_FFN_PAR, il);
  7973. cb(cur, "ffn_out", il);
  7974. }
  7975. cur = build_norm(cur,
  7976. model.layers[il].ffn_post_norm, NULL,
  7977. LLM_NORM_RMS, -1);
  7978. cb(cur, "ffn_post_norm", -1);
  7979. cur = ggml_add(ctx0, cur, sa_out);
  7980. cur = build_cvec(cur, il);
  7981. cb(cur, "l_out", il);
  7982. // input for next layer
  7983. inpL = cur;
  7984. }
  7985. cur = inpL;
  7986. cur = build_norm(cur,
  7987. model.output_norm, NULL,
  7988. LLM_NORM_RMS, -1);
  7989. cb(cur, "result_norm", -1);
  7990. res->t_embd = cur;
  7991. // lm_head
  7992. cur = build_lora_mm(model.output, cur);
  7993. cb(cur, "result_output", -1);
  7994. res->t_logits = cur;
  7995. ggml_build_forward_expand(gf, cur);
  7996. }
  7997. };
  7998. struct llm_build_gemma3n_iswa : public llm_graph_context {
  7999. const llama_model & model;
  8000. const int64_t n_embd_head;
  8001. const int64_t n_embd_altup;
  8002. const int64_t n_altup;
  8003. const int i_altup_act;
  8004. const int n_layer_kv = 20; // number of layers having KV [KV_REUSE]
  8005. const int n_layer_sparsity = 10; // number of layers using activation sparsity
  8006. const float f_sparsity_std_mul = 1.6448533535003662f; // std_multiplier = normal_dist.icdf(0.95)
  8007. llm_build_gemma3n_iswa(const llama_model & model, const llm_graph_params & params)
  8008. : llm_graph_context(params),
  8009. model(model),
  8010. n_embd_head(model.hparams.n_embd_head_k),
  8011. n_embd_altup(model.hparams.n_embd_altup),
  8012. n_altup(model.hparams.n_altup),
  8013. i_altup_act(model.hparams.i_altup_act) {
  8014. ggml_tensor * cur;
  8015. ggml_tensor * inpL;
  8016. inpL = build_inp_embd(model.tok_embd);
// important: do not normalize weights for raw embeddings input (i.e. encoded image embeddings)
  8018. if (ubatch.token) {
  8019. inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd));
  8020. cb(inpL, "inp_scaled", -1);
  8021. }
  8022. // inp_pos - contains the positions
  8023. ggml_tensor * inp_pos = build_inp_pos();
  8024. // TODO: is causal == true correct? might need some changes
  8025. auto * inp_attn = build_attn_inp_kv_unified_iswa();
  8026. // inp_per_layer shape: [n_embd_altup, n_tokens, n_layer]
  8027. ggml_tensor * inp_per_layer = project_per_layer_inputs(inpL, get_per_layer_inputs());
  8028. // inpL now has only 1 altup, project it to the rest of the altups
// these "added" altups will be concatenated along the last dim of inpL
  8030. {
  8031. ggml_tensor * target_magnitude = calc_magnitude(inpL);
  8032. ggml_tensor * inp_repeated = ggml_repeat_4d(ctx0, inpL, n_embd, n_tokens, n_altup - 1, 1);
  8033. ggml_tensor * altup_added = ggml_mul_mat(ctx0, model.altup_proj, inp_repeated); // shape: [n_embd, n_tokens, n_altup - 1]
  8034. ggml_tensor * new_magnitude = calc_magnitude(altup_added);
  8035. altup_added = ggml_div(ctx0,
  8036. ggml_mul(ctx0, altup_added, target_magnitude),
  8037. new_magnitude);
  8038. inpL = ggml_concat(ctx0, inpL, altup_added, 2); // shape: [n_embd, n_tokens, n_altup]
  8039. cb(inpL, "inp_stacked", -1);
  8040. }
  8041. // inpL now has shape: [n_embd, n_tokens, n_altup]
  8042. // inp_per_layer now has shape: [n_embd_altup, n_tokens, n_layer]
  8043. for (int il = 0; il < n_layer; ++il) {
// this block is made to closely resemble Gemma3p5DecoderLayer in the python code
  8045. const bool has_kv = (il < n_layer_kv);
  8046. const float freq_base_l = model.get_rope_freq_base (cparams, il);
  8047. const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
  8048. ggml_tensor * cur = inpL; // [n_embd, n_tokens, n_altup]
  8049. ggml_tensor * predictions = altup_predict(cur, il); // [n_embd, n_tokens, n_altup]
  8050. // predicted value will go through self-attention and laurel
  8051. ggml_tensor * active_prediction = view_2d_slice(predictions, i_altup_act); // [n_embd, n_tokens]
  8052. cur = active_prediction;
  8053. cb(cur, "active_prediction", il);
  8054. // norm
  8055. cur = build_norm(cur, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
  8056. cb(cur, "attn_norm", il);
  8057. // laurel
  8058. ggml_tensor * laurel_out = laurel(cur, il); // [n_embd, n_tokens]
  8059. // self-attention
  8060. if (has_kv) {
  8061. // compute Q and K and RoPE them
  8062. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  8063. cb(Qcur, "Qcur", il);
  8064. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  8065. cb(Kcur, "Kcur", il);
  8066. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  8067. cb(Vcur, "Vcur", il);
  8068. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  8069. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  8070. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  8071. Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
  8072. Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
  8073. Vcur = ggml_rms_norm(ctx0, Vcur, hparams.f_norm_rms_eps);
  8074. cb(Qcur, "Qcur_normed", il);
  8075. cb(Kcur, "Kcur_normed", il);
  8076. cb(Vcur, "Vcur_normed", il);
  8077. Qcur = ggml_rope_ext(
  8078. ctx0, Qcur, inp_pos, nullptr,
  8079. n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
  8080. ext_factor, attn_factor, beta_fast, beta_slow);
  8081. Kcur = ggml_rope_ext(
  8082. ctx0, Kcur, inp_pos, nullptr,
  8083. n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
  8084. ext_factor, attn_factor, beta_fast, beta_slow);
  8085. cb(Qcur, "Qcur_pos", il);
  8086. cb(Kcur, "Kcur_pos", il);
  8087. cur = build_attn(inp_attn,
  8088. model.layers[il].wo, NULL,
  8089. Qcur, Kcur, Vcur, nullptr, nullptr, hparams.f_attention_scale, il);
  8090. } else {
  8091. // no KV layers
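// KV-sharing: only Q is computed here; passing null K/V to build_attn makes attention reuse K/V cached by an earlier layer (see n_layer_kv / [KV_REUSE] above)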
  8092. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  8093. cb(Qcur, "Qcur", il);
  8094. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  8095. Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
  8096. cb(Qcur, "Qcur_normed", il);
  8097. Qcur = ggml_rope_ext(
  8098. ctx0, Qcur, inp_pos, nullptr,
  8099. n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
  8100. ext_factor, attn_factor, beta_fast, beta_slow);
  8101. cb(Qcur, "Qcur_pos", il);
  8102. cur = build_attn(inp_attn,
  8103. model.layers[il].wo, NULL,
  8104. Qcur, nullptr, nullptr, nullptr, nullptr, hparams.f_attention_scale, il);
  8105. }
  8106. cur = build_norm(cur,
  8107. model.layers[il].attn_post_norm, NULL,
  8108. LLM_NORM_RMS, il);
  8109. cb(cur, "attn_post_norm", il);
  8110. cur = ggml_add(ctx0, cur, active_prediction); // [n_embd, n_tokens]
  8111. cb(cur, "attn_gated", il);
  8112. ggml_tensor * attn_laurel = ggml_scale(ctx0,
  8113. ggml_add(ctx0, cur, laurel_out),
  8114. 1.0f / sqrtf(2.0f)); // [n_embd, n_tokens]
  8115. cb(attn_laurel, "attn_laurel", il);
  8116. cur = build_norm(attn_laurel,
  8117. model.layers[il].ffn_norm, NULL,
  8118. LLM_NORM_RMS, il);
  8119. cb(cur, "ffn_norm", il);
  8120. // feed-forward network
  8121. {
  8122. ggml_tensor * up_proj = build_lora_mm(model.layers[il].ffn_up, cur);
  8123. ggml_tensor * gate_proj = build_lora_mm(model.layers[il].ffn_gate, cur);
  8124. if (il < n_layer_sparsity) {
  8125. // apply activation sparsity
  8126. gate_proj = gaussian_topk(gate_proj);
  8127. }
  8128. gate_proj = ggml_gelu(ctx0, gate_proj);
  8129. cur = ggml_mul(ctx0, up_proj, gate_proj);
  8130. cur = build_lora_mm(model.layers[il].ffn_down, cur);
  8131. cb(cur, "ffn_out", il);
  8132. }
  8133. cur = build_norm(cur,
  8134. model.layers[il].ffn_post_norm, NULL,
  8135. LLM_NORM_RMS, -1);
  8136. cb(cur, "ffn_post_norm", il);
  8137. ggml_tensor * attn_ffw_laurel_gated = ggml_add(ctx0, cur, attn_laurel); // [n_embd, n_tokens]
  8138. cb(attn_ffw_laurel_gated, "attn_ffw_laurel_gated", il);
  8139. ggml_tensor * corrected = altup_correct(predictions, attn_ffw_laurel_gated, il); // [n_embd, n_tokens, n_altup]
  8140. ggml_tensor * first_prediction; // [n_embd, n_tokens]
  8141. {
  8142. first_prediction = view_2d_slice(corrected, i_altup_act); // [n_embd, n_tokens]
  8143. first_prediction = ggml_mul(ctx0, first_prediction, model.layers[il].altup_correct_scale);
  8144. first_prediction = build_lora_mm(model.layers[il].per_layer_inp_gate, first_prediction);
  8145. first_prediction = ggml_gelu(ctx0, first_prediction); // [n_embd_altup, n_tokens]
  8146. cb(first_prediction, "first_prediction_gated", il);
  8147. ggml_tensor * inp_this_layer = view_2d_slice(inp_per_layer, il); // [n_embd_altup, n_tokens]
  8148. first_prediction = ggml_mul(ctx0, first_prediction, inp_this_layer); // [n_embd_altup, n_tokens]
  8149. cb(first_prediction, "first_prediction_scaled", il);
  8150. first_prediction = build_lora_mm(model.layers[il].per_layer_proj, first_prediction); // [n_embd, n_tokens]
  8151. first_prediction = build_norm(first_prediction,
  8152. model.layers[il].per_layer_post_norm, NULL,
  8153. LLM_NORM_RMS, il);
  8154. cb(first_prediction, "first_prediction_out", il);
  8155. }
  8156. // equivalent to python code: corrected_predictions[1:] += first_prediction
  8157. {
  8158. ggml_tensor * slice_first = view_2d_slice(corrected, 0);
  8159. ggml_tensor * slice_rest = ggml_view_3d(ctx0, corrected, n_embd, n_tokens, n_altup - 1,
  8160. ggml_row_size(corrected->type, n_embd),
  8161. ggml_row_size(corrected->type, n_embd*n_tokens),
  8162. n_embd*n_tokens*ggml_element_size(corrected));
  8163. ggml_tensor * tmp = ggml_add(ctx0, slice_rest, first_prediction); // [n_embd, n_tokens, n_altup - 1]
  8164. corrected = ggml_concat(ctx0, slice_first, tmp, 2); // [n_embd, n_tokens, n_altup]
  8165. }
  8166. cur = corrected; // [n_embd, n_tokens, n_altup]
  8167. cur = build_cvec(cur, il);
  8168. cb(cur, "l_out", il);
  8169. // input for next layer
  8170. inpL = cur;
  8171. }
  8172. cur = inpL; // [n_embd, n_tokens, n_altup]
  8173. // cur now has multiple altup(s), we want to merge them back to 1 altup
  8174. {
  8175. ggml_tensor * target_magnitude = calc_magnitude(view_2d_slice(cur, i_altup_act)); // [n_embd, n_tokens]
  8176. // do a view to skip the first slice (active altup)
  8177. ggml_tensor * alt_slice = ggml_view_3d(ctx0, cur, n_embd, n_tokens, n_altup - 1,
  8178. ggml_row_size(cur->type, n_embd),
  8179. ggml_row_size(cur->type, n_embd*n_tokens),
  8180. n_embd*n_tokens*ggml_element_size(cur));
  8181. ggml_tensor * altup_unembd = ggml_mul_mat(ctx0, model.altup_unembd_proj, alt_slice); // shape: [n_embd, n_tokens, n_altup - 1]
  8182. ggml_tensor * new_magnitude = calc_magnitude(altup_unembd);
  8183. altup_unembd = ggml_div(ctx0,
  8184. ggml_mul(ctx0, altup_unembd, target_magnitude),
  8185. new_magnitude);
  8186. cb(altup_unembd, "altup_unembd", -1);
  8187. // equivalent to torch.mean(hidden_states, dim=0)
  8188. cur = view_2d_slice(cur, 0); // [n_embd, n_tokens]
  8189. for (int i = 0; i < n_altup - 1; ++i) {
  8190. cur = ggml_add(ctx0, cur, view_2d_slice(altup_unembd, i));
  8191. }
  8192. cur = ggml_scale(ctx0, cur, 1.0f / float(n_altup)); // [n_embd, n_tokens]
  8193. cb(cur, "unembd_merged", -1);
  8194. }
  8195. // cur now has shape: [n_embd, n_tokens]
  8196. // TODO: move this to right after the last KV layer
  8197. {
  8198. // skip computing output for unused tokens
  8199. ggml_tensor * inp_out_ids = build_inp_out_ids();
  8200. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  8201. }
  8202. cur = build_norm(cur,
  8203. model.output_norm, NULL,
  8204. LLM_NORM_RMS, -1);
  8205. cb(cur, "result_norm", -1);
  8206. res->t_embd = cur;
  8207. cur = build_lora_mm(model.output, cur);
  8208. {
  8209. // final logit soft-capping
  8210. cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_final_logit_softcapping);
  8211. cur = ggml_tanh(ctx0, cur);
  8212. cur = ggml_scale(ctx0, cur, hparams.f_final_logit_softcapping);
  8213. }
  8214. cb(cur, "result_output", -1);
  8215. res->t_logits = cur;
  8216. ggml_build_forward_expand(gf, cur);
  8217. }
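// per-token L2 norm over the embedding dim: sqrt(sum_i x_i^2), with ne[0] reduced to 1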
  8218. ggml_tensor * calc_magnitude(ggml_tensor * x) {
  8219. return ggml_sqrt(ctx0, ggml_sum_rows(ctx0, ggml_sqr(ctx0, x)));
  8220. }
// get a 2D slice view from a 3D tensor; idx selects along the 3rd dim
  8222. ggml_tensor * view_2d_slice(ggml_tensor * x, int idx) {
  8223. GGML_ASSERT(idx < (int)x->ne[2]);
  8224. return ggml_view_2d(ctx0, x, x->ne[0], x->ne[1],
  8225. ggml_row_size(x->type, x->ne[0]),
  8226. idx * x->ne[0] * x->ne[1] * ggml_element_size(x));
  8227. }
  8228. // equivalent to get_per_layer_inputs() in python code
  8229. // output shape: [n_embd_altup, n_layer, n_tokens]
  8230. ggml_tensor * get_per_layer_inputs() {
  8231. auto inp = std::make_unique<llm_graph_input_embd>();
  8232. ggml_tensor * inp_per_layer;
  8233. if (ubatch.token) {
  8234. inp->tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ubatch.n_tokens);
  8235. ggml_set_input(inp->tokens);
  8236. res->t_tokens = inp->tokens;
  8237. inp_per_layer = ggml_get_rows(ctx0, model.tok_embd_per_layer, inp->tokens);
  8238. inp_per_layer = ggml_reshape_3d(ctx0, inp_per_layer, n_embd_altup, n_layer, n_tokens);
  8239. inp_per_layer = ggml_scale(ctx0, inp_per_layer, sqrtf((float)n_embd_altup));
  8240. cb(inp_per_layer, "inp_per_layer_selected", -1);
  8241. } else {
  8242. GGML_ABORT("TODO: support embd input");
  8243. }
  8244. res->add_input(std::move(inp));
  8245. return inp_per_layer;
  8246. }
  8247. // equivalent to project_per_layer_inputs() in python code
  8248. // this calculates the per-layer inputs, so the final tensor shape will have n_layer as the last dim
  8249. // output shape: [n_embd_altup, n_tokens, n_layer]
  8250. ggml_tensor * project_per_layer_inputs(ggml_tensor * inputs_embeds, ggml_tensor * inp_per_layer) {
  8251. const float per_layer_projection_scale = 1.0f / sqrtf((float)n_embd);
  8252. const float per_layer_input_scale = 1.0f / sqrtf(2.0f);
  8253. ggml_tensor * per_layer_proj = ggml_mul_mat(ctx0, model.per_layer_model_proj, inputs_embeds);
  8254. per_layer_proj = ggml_scale(ctx0, per_layer_proj, per_layer_projection_scale);
  8255. per_layer_proj = ggml_reshape_3d(ctx0, per_layer_proj, n_embd_altup, n_layer, n_tokens);
  8256. per_layer_proj = build_norm(per_layer_proj,
  8257. model.per_layer_proj_norm, NULL,
  8258. LLM_NORM_RMS, -1); // [n_embd_altup, n_layer, n_tokens]
  8259. cb(per_layer_proj, "per_layer_proj", -1);
  8260. inp_per_layer = ggml_add(ctx0, inp_per_layer, per_layer_proj);
  8261. inp_per_layer = ggml_scale(ctx0, inp_per_layer, per_layer_input_scale);
  8262. cb(inp_per_layer, "inp_per_layer", -1);
  8263. // permute to shape: [n_embd_altup, n_tokens, n_layer]
  8264. inp_per_layer = ggml_cont(ctx0, ggml_permute(ctx0, inp_per_layer, 0, 2, 1, 3));
  8265. return inp_per_layer;
  8266. }
// input cur shape: [n_embd, n_tokens]
// output shape: [n_embd, n_tokens]
  8269. ggml_tensor * laurel(ggml_tensor * cur, int il) {
  8270. ggml_tensor * tmp = cur;
  8271. tmp = build_lora_mm(model.layers[il].laurel_l, tmp);
  8272. tmp = build_lora_mm(model.layers[il].laurel_r, tmp);
  8273. tmp = build_norm(tmp, model.layers[il].laurel_post_norm, NULL, LLM_NORM_RMS, il);
  8274. tmp = ggml_add(ctx0, tmp, cur);
  8275. cb(tmp, "laurel_out", il);
  8276. return tmp;
  8277. }
  8278. // input x shape: [n_embd, n_tokens]
  8279. // output shape: [n_embd, n_tokens]
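// activation sparsity via a gaussian cutoff: values below mean + f_sparsity_std_mul*std (~95th percentile) are zeroed; only the positive excess above the cutoff is kept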
  8280. ggml_tensor * gaussian_topk(ggml_tensor * x) {
  8281. ggml_tensor * mean = ggml_mean(ctx0, x);
  8282. ggml_tensor * std = ggml_sqrt(ctx0, ggml_scale(ctx0,
  8283. ggml_sum_rows(ctx0, ggml_sqr(ctx0, ggml_sub(ctx0, x, mean))),
  8284. 1.0f / (float)(x->ne[0] - 1)
  8285. ));
  8286. ggml_tensor * cutoff_x = ggml_add(ctx0, mean, ggml_scale(ctx0, std, f_sparsity_std_mul));
  8287. return ggml_relu(ctx0, ggml_sub(ctx0, x, cutoff_x));
  8288. }
  8289. //
  8290. // altup functions
  8291. //
  8292. // equivalent to compute_router_modalities() in python code
  8293. // input x shape: [n_embd, n_tokens]
  8294. // output shape: [n_altup, n_tokens]
  8295. ggml_tensor * altup_compute_router_modalities(ggml_tensor * x, int il) {
  8296. ggml_tensor * router_inputs = build_norm(x,
  8297. model.layers[il].altup_router_norm, NULL,
  8298. LLM_NORM_RMS, il);
  8299. // router_input_scale
  8300. router_inputs = ggml_scale(ctx0, router_inputs, 1.0f / (float)n_embd);
  8301. ggml_tensor * output = ggml_mul_mat(ctx0, model.layers[il].altup_router, router_inputs);
  8302. return ggml_tanh(ctx0, output); // [n_altup, n_tokens]
  8303. }
  8304. // input cur shape: [n_embd, n_tokens, n_altup]
  8305. // output shape: [n_embd, n_tokens, n_altup]
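// each output altup slice is a router-weighted linear mix of the n_altup input slices, added back to the input as a residual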
  8306. ggml_tensor * altup_predict(ggml_tensor * cur, int il) {
  8307. ggml_tensor * activated = view_2d_slice(cur, i_altup_act); // [n_embd, n_tokens]
  8308. ggml_tensor * modalities = altup_compute_router_modalities(activated, il); // [n_altup, n_tokens]
  8309. cb(modalities, "modalities", il);
  8310. ggml_tensor * all_coefs = build_lora_mm(model.layers[il].altup_predict_coef, modalities);
  8311. cb(all_coefs, "all_coefs", il);
// the first dim now has n_altup^2 elements, so reshape it into [n_altup, n_altup] (giving a 3D tensor overall)
  8313. all_coefs = ggml_reshape_3d(ctx0, all_coefs, n_altup, n_altup, n_tokens);
  8314. // permute to [n_altup, n_embd, n_tokens]
  8315. ggml_tensor * cur_permuted = ggml_cont(ctx0, ggml_permute(ctx0, cur, 1, 2, 0, 3));
  8316. ggml_tensor * predictions = ggml_mul_mat(ctx0, cur_permuted, all_coefs); // [n_altup, n_embd, n_tokens]
  8317. // final shape must be the same as cur: [n_embd, n_tokens, n_altup]
  8318. predictions = ggml_cont(ctx0, ggml_permute(ctx0, predictions, 0, 2, 1, 3));
  8319. predictions = ggml_add(ctx0, predictions, cur);
  8320. cb(predictions, "predictions", il);
  8321. return predictions;
  8322. }
  8323. // input predictions shape: [n_embd, n_tokens, n_altup]
  8324. // input activated shape: [n_embd, n_tokens]
  8325. // output shape: [n_embd, n_tokens, n_altup]
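// corrected = predictions + (1 + router_coef) * innovation, where innovation = activated - active_prediction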
  8326. ggml_tensor * altup_correct(ggml_tensor * predictions, ggml_tensor * activated, int il) {
  8327. ggml_tensor * modalities = altup_compute_router_modalities(activated, il); // [n_altup, n_tokens]
  8328. cb(modalities, "modalities", il);
  8329. ggml_tensor * active_prediction = view_2d_slice(predictions, i_altup_act);
  8330. ggml_tensor * innovation = ggml_sub(ctx0, activated, active_prediction); // [n_embd, n_tokens]
  8331. cb(innovation, "innovation", il);
  8332. ggml_tensor * all_coefs = build_lora_mm(model.layers[il].altup_correct_coef, modalities); // [n_altup, n_tokens]
  8333. all_coefs = ggml_scale_bias(ctx0, all_coefs, 1.0f, 1.0f); // + 1.0
  8334. cb(all_coefs, "all_coefs", il);
  8335. all_coefs = ggml_cont(ctx0, ggml_transpose(ctx0, all_coefs)); // [n_tokens, n_altup]
  8336. all_coefs = ggml_reshape_3d(ctx0, all_coefs, 1, n_tokens, n_altup); // [1, n_tokens, n_altup]
  8337. innovation = ggml_repeat_4d(ctx0, innovation, n_embd, n_tokens, n_altup, 1);
  8338. ggml_tensor * corrected = ggml_mul(ctx0, innovation, all_coefs); // [n_embd, n_tokens, n_altup]
  8339. corrected = ggml_add(ctx0, corrected, predictions); // [n_embd, n_tokens, n_altup]
  8340. cb(corrected, "corrected", il);
  8341. return corrected;
  8342. }
  8343. };
  8344. // TODO: move up next to build_starcoder
  8345. struct llm_build_starcoder2 : public llm_graph_context {
  8346. llm_build_starcoder2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  8347. const int64_t n_embd_head = hparams.n_embd_head_v;
  8348. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  8349. GGML_ASSERT(n_embd_head == hparams.n_rot);
  8350. ggml_tensor * cur;
  8351. ggml_tensor * inpL;
  8352. inpL = build_inp_embd(model.tok_embd);
  8353. // inp_pos - contains the positions
  8354. ggml_tensor * inp_pos = build_inp_pos();
  8355. auto * inp_attn = build_attn_inp_kv_unified();
  8356. ggml_tensor * inp_out_ids = build_inp_out_ids();
  8357. for (int il = 0; il < n_layer; ++il) {
  8358. ggml_tensor * inpSA = inpL;
  8359. // norm
  8360. cur = build_norm(inpL,
  8361. model.layers[il].attn_norm, model.layers[il].attn_norm_b,
  8362. LLM_NORM, il);
  8363. cb(cur, "attn_norm", il);
  8364. // self-attention
  8365. {
  8366. // compute Q and K and RoPE them
  8367. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  8368. cb(Qcur, "Qcur", il);
  8369. if (model.layers[il].bq) {
  8370. Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
  8371. cb(Qcur, "Qcur", il);
  8372. }
  8373. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  8374. cb(Kcur, "Kcur", il);
  8375. if (model.layers[il].bk) {
  8376. Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
  8377. cb(Kcur, "Kcur", il);
  8378. }
  8379. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  8380. cb(Vcur, "Vcur", il);
  8381. if (model.layers[il].bv) {
  8382. Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
  8383. cb(Vcur, "Vcur", il);
  8384. }
  8385. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  8386. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  8387. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  8388. Qcur = ggml_rope_ext(
  8389. ctx0, Qcur, inp_pos, nullptr,
  8390. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  8391. ext_factor, attn_factor, beta_fast, beta_slow
  8392. );
  8393. Kcur = ggml_rope_ext(
  8394. ctx0, Kcur, inp_pos, nullptr,
  8395. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  8396. ext_factor, attn_factor, beta_fast, beta_slow
  8397. );
  8398. cb(Qcur, "Qcur", il);
  8399. cb(Kcur, "Kcur", il);
  8400. cb(Vcur, "Vcur", il);
  8401. cur = build_attn(inp_attn,
  8402. model.layers[il].wo, model.layers[il].bo,
  8403. Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  8404. }
  8405. if (il == n_layer - 1 && inp_out_ids) {
  8406. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  8407. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  8408. }
  8409. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  8410. cb(ffn_inp, "ffn_inp", il);
  8411. // feed-forward network
  8412. cur = build_norm(ffn_inp,
  8413. model.layers[il].ffn_norm, model.layers[il].ffn_norm_b,
  8414. LLM_NORM, il);
  8415. cb(cur, "ffn_norm", il);
  8416. cur = build_ffn(cur,
  8417. model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
  8418. NULL, NULL, NULL,
  8419. model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
  8420. NULL,
  8421. LLM_FFN_GELU, LLM_FFN_SEQ, il);
  8422. cb(cur, "ffn_out", il);
  8423. cur = ggml_add(ctx0, cur, ffn_inp);
  8424. cur = build_cvec(cur, il);
  8425. cb(cur, "l_out", il);
  8426. // input for next layer
  8427. inpL = cur;
  8428. }
  8429. cur = inpL;
  8430. cur = build_norm(cur,
  8431. model.output_norm, model.output_norm_b,
  8432. LLM_NORM, -1);
  8433. cb(cur, "result_norm", -1);
  8434. res->t_embd = cur;
  8435. // lm_head
  8436. cur = build_lora_mm(model.output, cur);
  8437. cb(cur, "result_output", -1);
  8438. res->t_logits = cur;
  8439. ggml_build_forward_expand(gf, cur);
  8440. }
  8441. };
  8442. struct llm_graph_context_mamba : public llm_graph_context {
  8443. llm_graph_context_mamba(const llm_graph_params & params) : llm_graph_context(params) {}
  8444. ggml_tensor * build_mamba_layer(
  8445. llm_graph_input_rs * inp,
  8446. ggml_tensor * cur,
  8447. const llama_model & model,
  8448. const llama_ubatch & ubatch,
  8449. int il) {
  8450. const auto * mctx_cur = inp->mctx;
  8451. const auto kv_head = mctx_cur->get_head();
  8452. const auto & layer = model.layers[il];
  8453. const int64_t d_conv = hparams.ssm_d_conv;
  8454. const int64_t d_inner = hparams.ssm_d_inner;
  8455. const int64_t d_state = hparams.ssm_d_state;
  8456. const int64_t dt_rank = hparams.ssm_dt_rank;
  8457. const int64_t n_head = d_inner;
  8458. const int64_t head_dim = 1;
  8459. const int64_t n_seqs = ubatch.n_seqs;
// Some variants of the Mamba arch (e.g. FalconMamba) apply layer norm on the B, C and Dt layers
  8461. const bool ssm_dt_b_c_rms = hparams.ssm_dt_b_c_rms;
  8462. const int64_t n_seq_tokens = ubatch.n_seq_tokens;
  8463. GGML_ASSERT(n_seqs != 0);
  8464. GGML_ASSERT(ubatch.equal_seqs());
  8465. GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs);
  8466. ggml_tensor * conv_states_all = mctx_cur->get_r_l(il);
  8467. ggml_tensor * ssm_states_all = mctx_cur->get_s_l(il);
  8468. ggml_tensor * conv = build_rs(inp, conv_states_all, hparams.n_embd_r(), n_seqs);
  8469. conv = ggml_reshape_3d(ctx0, conv, d_conv - 1, d_inner, n_seqs);
  8470. // {n_embd, n_tokens} => {n_embd, n_seq_tokens, n_seqs}
  8471. cur = ggml_reshape_3d(ctx0, cur, cur->ne[0], n_seq_tokens, n_seqs);
  8472. // {n_embd, 2*d_inner} @ {n_embd, n_seq_tokens, n_seqs} => {2*d_inner, n_seq_tokens, n_seqs}
  8473. ggml_tensor * xz = build_lora_mm(layer.ssm_in, cur);
  8474. // split the above in two
  8475. // => {d_inner, n_seq_tokens, n_seqs}
  8476. ggml_tensor * x = ggml_view_3d(ctx0, xz, d_inner, xz->ne[1], xz->ne[2], xz->nb[1], xz->nb[2], 0);
  8477. ggml_tensor * z = ggml_view_3d(ctx0, xz, d_inner, xz->ne[1], xz->ne[2], xz->nb[1], xz->nb[2], d_inner*ggml_element_size(xz));
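// x feeds the conv + SSM path below; z is the gating branch applied (via SiLU) to the SSM output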
  8478. // conv
  8479. {
  8480. // => {d_conv - 1 + n_seq_tokens, d_inner, n_seqs}
  8481. ggml_tensor * conv_x = ggml_concat(ctx0, conv, ggml_transpose(ctx0, x), 0);
  8482. // copy last (d_conv - 1) columns back into the state cache
  8483. ggml_tensor * last_conv = ggml_view_3d(ctx0, conv_x, d_conv - 1, d_inner, n_seqs, conv_x->nb[1], conv_x->nb[2], n_seq_tokens*(conv_x->nb[0]));
  8484. ggml_build_forward_expand(gf,
  8485. ggml_cpy(ctx0, last_conv,
  8486. ggml_view_1d(ctx0, conv_states_all,
  8487. (d_conv - 1)*(d_inner)*(n_seqs),
  8488. kv_head*(d_conv - 1)*(d_inner)*ggml_element_size(conv_states_all))));
  8489. // 1D convolution
  8490. // The equivalent is to make a self-overlapping view of conv_x
  8491. // over d_conv columns at each stride in the 3rd dimension,
  8492. // then element-wise multiply that with the conv1d weight,
  8493. // then sum the elements of each row,
  8494. // (the last two steps are a dot product over rows (also doable with mul_mat))
  8495. // then permute away the ne[0] dimension,
  8496. // and then you're left with the resulting x tensor.
  8497. // For simultaneous sequences, all sequences need to have the same length.
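// Rough scalar sketch of what ggml_ssm_conv computes (for intuition only):
//   for each sequence s, channel c and output position t:
//     x[s][c][t] = sum_{j=0}^{d_conv-1} conv_x[s][c][t + j] * ssm_conv1d[c][j]
// i.e. a depthwise (per-channel) causal convolution over the cached + new columns.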
  8498. x = ggml_ssm_conv(ctx0, conv_x, layer.ssm_conv1d);
  8499. // bias
  8500. x = ggml_add(ctx0, x, layer.ssm_conv1d_b);
  8501. x = ggml_silu(ctx0, x);
  8502. }
  8503. // ssm
  8504. {
  8505. // {d_inner, dt_rank + 2*d_state} @ {d_inner, n_seq_tokens, n_seqs} => {dt_rank + 2*d_state, n_seq_tokens, n_seqs}
  8506. ggml_tensor * x_db = build_lora_mm(layer.ssm_x, x);
  8507. // split
  8508. ggml_tensor * dt = ggml_view_3d(ctx0, x_db, dt_rank, n_seq_tokens, n_seqs, x_db->nb[1], x_db->nb[2], 0);
  8509. ggml_tensor * B = ggml_view_4d(ctx0, x_db, d_state, /* n_group */ 1, n_seq_tokens, n_seqs, d_state*x_db->nb[0], x_db->nb[1], x_db->nb[2], ggml_element_size(x_db)*dt_rank);
  8510. ggml_tensor * C = ggml_view_4d(ctx0, x_db, d_state, /* n_group */ 1, n_seq_tokens, n_seqs, d_state*x_db->nb[0], x_db->nb[1], x_db->nb[2], ggml_element_size(x_db)*(dt_rank+d_state));
  8511. // Some Mamba variants (e.g. FalconMamba, Jamba) apply RMS norm in B, C & Dt layers
  8512. if (ssm_dt_b_c_rms || (layer.ssm_dt_norm && layer.ssm_b_norm && layer.ssm_c_norm)) {
  8513. dt = build_norm(dt, layer.ssm_dt_norm, NULL, LLM_NORM_RMS, il);
  8514. B = build_norm(B, layer.ssm_b_norm, NULL, LLM_NORM_RMS, il);
  8515. C = build_norm(C, layer.ssm_c_norm, NULL, LLM_NORM_RMS, il);
  8516. }
  8517. // {dt_rank, d_inner} @ {dt_rank, n_seq_tokens, n_seqs} => {d_inner, n_seq_tokens, n_seqs}
  8518. dt = build_lora_mm(layer.ssm_dt, dt);
  8519. dt = ggml_add(ctx0, dt, layer.ssm_dt_b);
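// keep the un-reshaped {d_inner, n_seq_tokens, n_seqs} activations in cur:
// they are reused below for the D skip connection (y = y + x*D)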
  8520. cur = x;
  8521. x = ggml_reshape_4d(ctx0, x, head_dim, n_head, n_seq_tokens, n_seqs);
  8522. ggml_tensor * A = layer.ssm_a;
  8523. // use the states and the indices provided by build_recurrent_state
// (this is necessary in order to use the states properly before they are overwritten,
// while avoiding making unnecessary copies of the states)
  8526. auto get_ssm_rows = [&](ggml_context * ctx, ggml_tensor * states, ggml_tensor * ids) {
  8527. ggml_tensor * ssm = ggml_reshape_4d(ctx, states, d_state, head_dim, n_head, mctx_cur->get_size());
  8528. // Custom operator to optimize the parallel associative scan
// as described in Annex D of the Mamba paper.
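// Simplified per-channel recurrence evaluated by the scan (sketch for intuition only;
// ggml_ssm_scan also applies a softplus to dt internally):
//   h_t = exp(dt_t * A) * h_{t-1} + dt_t * B_t * x_t
//   y_t = C_t . h_t    (the D*x skip connection is added after the scan)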
  8530. // => {d_inner, n_seq_tokens, n_seqs} and {d_state, d_inner, n_seqs}
  8531. return ggml_ssm_scan(ctx, ssm, x, dt, A, B, C, ids);
  8532. };
  8533. ggml_tensor * y_ssm = build_rs(inp, ssm_states_all, hparams.n_embd_s(), ubatch.n_seqs, get_ssm_rows);
  8534. // store last states
  8535. ggml_build_forward_expand(gf,
  8536. ggml_cpy(ctx0,
  8537. ggml_view_1d(ctx0, y_ssm, d_state*d_inner*n_seqs, x->nb[3]*x->ne[3]),
  8538. ggml_view_1d(ctx0, ssm_states_all, d_state*d_inner*n_seqs, kv_head*d_state*d_inner*ggml_element_size(ssm_states_all))));
  8539. ggml_tensor * y = ggml_view_3d(ctx0, y_ssm, d_inner, n_seq_tokens, n_seqs, x->nb[2], x->nb[3], 0);
  8540. // TODO: skip computing output earlier for unused tokens
  8541. y = ggml_add(ctx0, y, ggml_mul(ctx0, cur, layer.ssm_d));
  8542. y = ggml_swiglu_split(ctx0, ggml_cont(ctx0, z), y);
  8543. // {d_inner, n_embd} @ {d_inner, n_seq_tokens, n_seqs} => {n_embd, n_seq_tokens, n_seqs}
  8544. cur = build_lora_mm(layer.ssm_out, y);
  8545. }
  8546. // {n_embd, n_seq_tokens, n_seqs} => {n_embd, n_tokens}
  8547. cur = ggml_reshape_2d(ctx0, cur, cur->ne[0], n_seq_tokens * n_seqs);
  8548. return cur;
  8549. }
  8550. ggml_tensor * build_mamba2_layer(
  8551. llm_graph_input_rs * inp,
  8552. ggml_tensor * cur,
  8553. const llama_model & model,
  8554. const llama_ubatch & ubatch,
  8555. int il) const {
  8556. const auto * mctx_cur = inp->mctx;
  8557. const auto kv_head = mctx_cur->get_head();
  8558. const int64_t d_conv = hparams.ssm_d_conv;
  8559. const int64_t d_inner = hparams.ssm_d_inner;
  8560. const int64_t d_state = hparams.ssm_d_state;
  8561. const int64_t n_head = hparams.ssm_dt_rank;
  8562. const int64_t head_dim = d_inner / n_head;
  8563. const int64_t n_group = hparams.ssm_n_group;
  8564. const int64_t n_seqs = ubatch.n_seqs;
  8565. const int64_t n_seq_tokens = ubatch.n_seq_tokens;
  8566. GGML_ASSERT(n_seqs != 0);
  8567. GGML_ASSERT(ubatch.equal_seqs());
  8568. GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs);
  8569. ggml_tensor * conv_states_all = mctx_cur->get_r_l(il);
  8570. ggml_tensor * ssm_states_all = mctx_cur->get_s_l(il);
  8571. ggml_tensor * conv = build_rs(inp, conv_states_all, hparams.n_embd_r(), n_seqs);
  8572. conv = ggml_reshape_3d(ctx0, conv, d_conv - 1, d_inner + 2*n_group*d_state, n_seqs);
  8573. // {n_embd, n_tokens} => {n_embd, n_seq_tokens, n_seqs}
  8574. cur = ggml_reshape_3d(ctx0, cur, cur->ne[0], n_seq_tokens, n_seqs);
  8575. // d_in_proj = 2 * self.d_inner + 2 * self.ngroups * self.d_state + self.nheads
  8576. // {n_embd, d_in_proj} @ {n_embd, n_seq_tokens, n_seqs} => {d_in_proj, n_seq_tokens, n_seqs}
  8577. ggml_tensor * zxBCdt = build_lora_mm(model.layers[il].ssm_in, cur);
  8578. // split the above in three
  8579. ggml_tensor * z = ggml_view_4d(ctx0, zxBCdt, head_dim, n_head, n_seq_tokens, n_seqs, head_dim*zxBCdt->nb[0], zxBCdt->nb[1], zxBCdt->nb[2], 0);
  8580. ggml_tensor * xBC = ggml_view_3d(ctx0, zxBCdt, d_inner + 2*n_group*d_state, n_seq_tokens, n_seqs, zxBCdt->nb[1], zxBCdt->nb[2], d_inner*ggml_element_size(zxBCdt));
  8581. ggml_tensor * dt = ggml_view_3d(ctx0, zxBCdt, n_head, n_seq_tokens, n_seqs, zxBCdt->nb[1], zxBCdt->nb[2], (2*d_inner + 2*n_group*d_state)*ggml_element_size(zxBCdt));
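// Offsets used by the three views above (in elements along ne[0]), assuming the
// in-projection packs [z | xBC | dt] as in the reference Mamba-2 implementation:
//   z   : [0, d_inner)
//   xBC : [d_inner, 2*d_inner + 2*n_group*d_state)
//   dt  : [2*d_inner + 2*n_group*d_state, d_in_proj)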
  8582. // conv
  8583. {
  8584. // => {d_conv - 1 + n_seq_tokens, d_inner + 2*n_group*d_state, n_seqs}
  8585. ggml_tensor * conv_x = ggml_concat(ctx0, conv, ggml_transpose(ctx0, xBC), 0);
  8586. // copy last (d_conv - 1) columns back into the state cache
  8587. ggml_tensor * last_conv = ggml_view_3d(ctx0, conv_x, d_conv - 1, d_inner + 2*n_group*d_state, n_seqs, conv_x->nb[1], conv_x->nb[2], n_seq_tokens*(conv_x->nb[0]));
  8588. ggml_build_forward_expand(gf,
  8589. ggml_cpy(ctx0, last_conv,
  8590. ggml_view_1d(ctx0, conv_states_all,
  8591. (d_conv - 1)*(d_inner + 2*n_group*d_state)*(n_seqs),
  8592. kv_head*(d_conv - 1)*(d_inner + 2*n_group*d_state)*ggml_element_size(conv_states_all))));
  8593. // 1D convolution
  8594. // The equivalent is to make a self-overlapping view of conv_x
  8595. // over d_conv columns at each stride in the 3rd dimension,
  8596. // then element-wise multiply that with the conv1d weight,
  8597. // then sum the elements of each row,
  8598. // (the last two steps are a dot product over rows (also doable with mul_mat))
  8599. // then permute away the ne[0] dimension,
  8600. // and then you're left with the resulting x tensor.
  8601. // For simultaneous sequences, all sequences need to have the same length.
  8602. xBC = ggml_ssm_conv(ctx0, conv_x, model.layers[il].ssm_conv1d);
  8603. // bias
  8604. xBC = ggml_add(ctx0, xBC, model.layers[il].ssm_conv1d_b);
  8605. xBC = ggml_silu(ctx0, xBC);
  8606. }
  8607. // ssm
  8608. {
// These correspond to V, K, Q in the SSM/attention duality
  8610. ggml_tensor * x = ggml_view_4d(ctx0, xBC, head_dim, n_head, n_seq_tokens, n_seqs, head_dim*xBC->nb[0], xBC->nb[1], xBC->nb[2], 0);
  8611. ggml_tensor * B = ggml_view_4d(ctx0, xBC, d_state, n_group, n_seq_tokens, n_seqs, d_state*xBC->nb[0], xBC->nb[1], xBC->nb[2], d_inner*ggml_element_size(xBC));
  8612. ggml_tensor * C = ggml_view_4d(ctx0, xBC, d_state, n_group, n_seq_tokens, n_seqs, d_state*xBC->nb[0], xBC->nb[1], xBC->nb[2], (d_inner + n_group*d_state)*ggml_element_size(xBC));
  8613. // {n_head, n_seq_tokens, n_seqs}
  8614. dt = ggml_add(ctx0, ggml_cont(ctx0, dt), model.layers[il].ssm_dt_b);
  8615. ggml_tensor * A = model.layers[il].ssm_a;
  8616. // use the states and the indices provided by build_recurrent_state
// (this is necessary in order to use the states properly before they are overwritten,
// while avoiding making unnecessary copies of the states)
  8619. auto get_ssm_rows = [&](ggml_context * ctx, ggml_tensor * states, ggml_tensor * ids) {
  8620. ggml_tensor * ssm = ggml_reshape_4d(ctx, states, d_state, head_dim, n_head, mctx_cur->get_size());
  8621. // TODO: use semistructured matrices to implement state-space duality
  8622. // => {d_inner, n_seq_tokens, n_seqs} and {d_state, d_inner, n_seqs}
  8623. return ggml_ssm_scan(ctx, ssm, x, dt, A, B, C, ids);
  8624. };
  8625. ggml_tensor * y_ssm = build_rs(inp, ssm_states_all, hparams.n_embd_s(), ubatch.n_seqs, get_ssm_rows);
  8626. // store last states
  8627. ggml_build_forward_expand(gf,
  8628. ggml_cpy(ctx0,
  8629. ggml_view_1d(ctx0, y_ssm, d_state*d_inner*n_seqs, ggml_nelements(x)*x->nb[0]),
  8630. ggml_view_1d(ctx0, ssm_states_all, d_state*d_inner*n_seqs, kv_head*d_state*d_inner*ggml_element_size(ssm_states_all))));
  8631. ggml_tensor * y = ggml_view_4d(ctx0, y_ssm, head_dim, n_head, n_seq_tokens, n_seqs, x->nb[1], n_head*x->nb[1], n_seq_tokens*n_head*x->nb[1], 0);
  8632. // TODO: skip computing output earlier for unused tokens
  8633. y = ggml_add(ctx0, y, ggml_mul(ctx0, x, model.layers[il].ssm_d));
  8634. y = ggml_swiglu_split(ctx0, ggml_cont(ctx0, z), y);
  8635. // grouped RMS norm
  8636. if (model.layers[il].ssm_norm) {
  8637. y = ggml_reshape_4d(ctx0, y, d_inner / n_group, n_group, n_seq_tokens, n_seqs);
  8638. y = build_norm(y, model.layers[il].ssm_norm, NULL, LLM_NORM_RMS, il);
  8639. }
  8640. y = ggml_reshape_3d(ctx0, y, d_inner, n_seq_tokens, n_seqs);
  8641. // {d_inner, n_embd} @ {d_inner, n_seq_tokens, n_seqs} => {n_embd, n_seq_tokens, n_seqs}
  8642. cur = build_lora_mm(model.layers[il].ssm_out, y);
  8643. }
  8644. // {n_embd, n_seq_tokens, n_seqs} => {n_embd, n_tokens}
  8645. cur = ggml_reshape_2d(ctx0, cur, cur->ne[0], n_seq_tokens * n_seqs);
  8646. cb(cur, "mamba_out", il);
  8647. return cur;
  8648. }
  8649. };
  8650. struct llm_build_mamba : public llm_graph_context_mamba {
  8651. llm_build_mamba(const llama_model & model, const llm_graph_params & params) : llm_graph_context_mamba(params) {
  8652. ggml_tensor * cur;
  8653. ggml_tensor * inpL;
  8654. // {n_embd, n_tokens}
  8655. inpL = build_inp_embd(model.tok_embd);
  8656. auto * rs_inp = build_rs_inp();
  8657. ggml_tensor * inp_out_ids = build_inp_out_ids();
  8658. for (int il = 0; il < n_layer; ++il) {
  8659. // norm
  8660. cur = build_norm(inpL,
  8661. model.layers[il].attn_norm, NULL,
  8662. LLM_NORM_RMS, il);
  8663. cb(cur, "attn_norm", il);
  8664. if (model.arch == LLM_ARCH_MAMBA2) {
  8665. cur = build_mamba2_layer(rs_inp, cur, model, ubatch, il);
  8666. } else {
  8667. cur = build_mamba_layer(rs_inp, cur, model, ubatch, il);
  8668. }
  8669. if (il == n_layer - 1 && inp_out_ids) {
  8670. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  8671. inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
  8672. }
  8673. // residual
  8674. cur = ggml_add(ctx0, cur, inpL);
  8675. cur = build_cvec(cur, il);
  8676. cb(cur, "l_out", il);
  8677. // input for next layer
  8678. inpL = cur;
  8679. }
  8680. // final rmsnorm
  8681. cur = build_norm(inpL, model.output_norm, NULL, LLM_NORM_RMS, -1);
  8682. cb(cur, "result_norm", -1);
  8683. res->t_embd = cur;
  8684. // lm_head
  8685. cur = build_lora_mm(model.output, cur);
  8686. cb(cur, "result_output", -1);
  8687. res->t_logits = cur;
  8688. ggml_build_forward_expand(gf, cur);
  8689. }
  8690. };
  8691. struct llm_build_jamba : public llm_graph_context_mamba {
  8692. llm_build_jamba(const llama_model & model, const llm_graph_params & params) : llm_graph_context_mamba(params) {
  8693. const int64_t n_embd_head = hparams.n_embd_head_v;
  8694. ggml_tensor * cur;
  8695. ggml_tensor * inpL;
  8696. // {n_embd, n_tokens}
  8697. inpL = build_inp_embd(model.tok_embd);
  8698. auto * inp_hybrid = build_inp_mem_hybrid();
  8699. ggml_tensor * inp_out_ids = build_inp_out_ids();
  8700. for (int il = 0; il < n_layer; ++il) {
  8701. const int64_t n_head_kv = hparams.n_head_kv(il);
  8702. cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
  8703. cb(cur, "attn_norm", il);
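// layers without KV heads are the recurrent (Mamba) layers of the hybrid stack;
// the remaining layers use regular attention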
  8704. if (n_head_kv == 0) {
  8705. cur = build_mamba_layer(inp_hybrid->get_recr(), cur, model, ubatch, il);
  8706. } else {
  8707. // Attention
  8708. struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  8709. struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  8710. struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  8711. cb(Qcur, "Qcur", il);
  8712. cb(Kcur, "Kcur", il);
  8713. cb(Vcur, "Vcur", il);
  8714. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  8715. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  8716. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  8717. cb(Qcur, "Qcur", il);
  8718. cb(Kcur, "Kcur", il);
  8719. cb(Vcur, "Vcur", il);
  8720. // No RoPE :)
  8721. cur = build_attn(inp_hybrid->get_attn(), model.layers[il].wo, NULL, Qcur, Kcur, Vcur, NULL, NULL, 1.0f/sqrtf(float(n_embd_head)), il);
  8722. }
  8723. if (il == n_layer - 1 && inp_out_ids) {
  8724. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  8725. inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
  8726. }
  8727. // residual
struct ggml_tensor * ffn_inp = ggml_add(ctx0, inpL, cur);
cb(ffn_inp, "ffn_inp", il);
  8730. cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il);
  8731. cb(cur, "ffn_norm", il);
  8732. // feed-forward network
  8733. if (model.layers[il].ffn_gate_inp == nullptr) {
  8734. // FFN
  8735. cur = build_ffn(cur,
  8736. model.layers[il].ffn_up, NULL, NULL,
  8737. model.layers[il].ffn_gate, NULL, NULL,
  8738. model.layers[il].ffn_down, NULL, NULL,
  8739. NULL,
  8740. LLM_FFN_SILU, LLM_FFN_PAR, il);
  8741. cb(cur, "ffn_out", il);
  8742. } else {
  8743. // MoE branch
  8744. cur = build_moe_ffn(cur,
  8745. model.layers[il].ffn_gate_inp,
  8746. model.layers[il].ffn_up_exps,
  8747. model.layers[il].ffn_gate_exps,
  8748. model.layers[il].ffn_down_exps,
  8749. nullptr,
  8750. n_expert, n_expert_used,
  8751. LLM_FFN_SILU, false,
  8752. false, 0.0,
  8753. LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
  8754. il);
  8755. cb(cur, "ffn_moe_out", il);
  8756. }
  8757. // residual
  8758. cur = ggml_add(ctx0, ffn_inp, cur);
  8759. cur = build_cvec(cur, il);
  8760. cb(cur, "l_out", il);
  8761. // input for next layer
  8762. inpL = cur;
  8763. }
  8764. // final rmsnorm
  8765. cur = build_norm(inpL, model.output_norm, NULL, LLM_NORM_RMS, -1);
  8766. cb(cur, "result_norm", -1);
  8767. res->t_embd = cur;
  8768. // lm_head
  8769. cur = build_lora_mm(model.output, cur);
  8770. cb(cur, "result_output", -1);
  8771. res->t_logits = cur;
  8772. ggml_build_forward_expand(gf, cur);
  8773. }
  8774. };
  8775. struct llm_build_command_r : public llm_graph_context {
  8776. llm_build_command_r(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  8777. const int64_t n_embd_head = hparams.n_embd_head_v;
  8778. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  8779. const float f_logit_scale = hparams.f_logit_scale;
  8780. ggml_tensor * cur;
  8781. ggml_tensor * inpL;
  8782. inpL = build_inp_embd(model.tok_embd);
  8783. // inp_pos - contains the positions
  8784. ggml_tensor * inp_pos = build_inp_pos();
  8785. auto * inp_attn = build_attn_inp_kv_unified();
  8786. ggml_tensor * inp_out_ids = build_inp_out_ids();
  8787. for (int il = 0; il < n_layer; ++il) {
  8788. // norm
  8789. cur = build_norm(inpL,
  8790. model.layers[il].attn_norm, NULL,
  8791. LLM_NORM, il);
  8792. cb(cur, "attn_norm", il);
  8793. ggml_tensor * ffn_inp = cur;
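// note: Command R uses parallel attention/FFN blocks: both branches consume the same
// normalized input (ffn_inp) and their outputs are summed with the residual below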
  8794. // self-attention
  8795. {
  8796. // compute Q and K and RoPE them
  8797. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  8798. cb(Qcur, "Qcur", il);
  8799. if (model.layers[il].bq) {
  8800. Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
  8801. cb(Qcur, "Qcur", il);
  8802. }
  8803. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  8804. cb(Kcur, "Kcur", il);
  8805. if (model.layers[il].bk) {
  8806. Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
  8807. cb(Kcur, "Kcur", il);
  8808. }
  8809. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  8810. cb(Vcur, "Vcur", il);
  8811. if (model.layers[il].bv) {
  8812. Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
  8813. cb(Vcur, "Vcur", il);
  8814. }
  8815. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  8816. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  8817. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  8818. if (model.layers[il].attn_q_norm) {
  8819. Qcur = build_norm(Qcur,
  8820. model.layers[il].attn_q_norm,
  8821. NULL,
  8822. LLM_NORM, il);
  8823. cb(Qcur, "Qcur", il);
  8824. }
  8825. Qcur = ggml_rope_ext(
  8826. ctx0, Qcur, inp_pos, nullptr,
  8827. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  8828. ext_factor, attn_factor, beta_fast, beta_slow
  8829. );
  8830. if (model.layers[il].attn_k_norm) {
  8831. Kcur = build_norm(Kcur,
  8832. model.layers[il].attn_k_norm,
  8833. NULL,
  8834. LLM_NORM, il);
  8835. cb(Kcur, "Kcur", il);
  8836. }
  8837. Kcur = ggml_rope_ext(
  8838. ctx0, Kcur, inp_pos, nullptr,
  8839. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  8840. ext_factor, attn_factor, beta_fast, beta_slow
  8841. );
  8842. cb(Qcur, "Qcur", il);
  8843. cb(Kcur, "Kcur", il);
  8844. cb(Vcur, "Vcur", il);
  8845. cur = build_attn(inp_attn,
  8846. model.layers[il].wo, model.layers[il].bo,
  8847. Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  8848. }
  8849. if (il == n_layer - 1 && inp_out_ids) {
  8850. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  8851. inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
  8852. ffn_inp = ggml_get_rows(ctx0, ffn_inp, inp_out_ids);
  8853. }
  8854. ggml_tensor * attn_out = cur;
  8855. // feed-forward network
  8856. {
  8857. cur = build_ffn(ffn_inp,
  8858. model.layers[il].ffn_up, NULL, NULL,
  8859. model.layers[il].ffn_gate, NULL, NULL,
  8860. model.layers[il].ffn_down, NULL, NULL,
  8861. NULL,
  8862. LLM_FFN_SILU, LLM_FFN_PAR, il);
  8863. cb(cur, "ffn_out", il);
  8864. }
  8865. // add together residual + FFN + self-attention
  8866. cur = ggml_add(ctx0, cur, inpL);
  8867. cur = ggml_add(ctx0, cur, attn_out);
  8868. cur = build_cvec(cur, il);
  8869. cb(cur, "l_out", il);
  8870. // input for next layer
  8871. inpL = cur;
  8872. }
  8873. cur = inpL;
  8874. cur = build_norm(cur,
  8875. model.output_norm, NULL,
  8876. LLM_NORM, -1);
  8877. cb(cur, "result_norm", -1);
  8878. res->t_embd = cur;
  8879. // lm_head
  8880. cur = build_lora_mm(model.output, cur);
  8881. if (f_logit_scale) {
  8882. cur = ggml_scale(ctx0, cur, f_logit_scale);
  8883. }
  8884. cb(cur, "result_output", -1);
  8885. res->t_logits = cur;
  8886. ggml_build_forward_expand(gf, cur);
  8887. }
  8888. };
  8889. struct llm_build_cohere2_iswa : public llm_graph_context {
  8890. llm_build_cohere2_iswa(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  8891. const int64_t n_embd_head = hparams.n_embd_head_v;
  8892. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  8893. const float f_logit_scale = hparams.f_logit_scale;
  8894. ggml_tensor * cur;
  8895. ggml_tensor * inpL;
  8896. inpL = build_inp_embd(model.tok_embd);
  8897. // inp_pos - contains the positions
  8898. ggml_tensor * inp_pos = build_inp_pos();
  8899. auto * inp_attn = build_attn_inp_kv_unified_iswa();
  8900. ggml_tensor * inp_out_ids = build_inp_out_ids();
  8901. for (int il = 0; il < n_layer; ++il) {
  8902. const bool is_swa = hparams.is_swa(il);
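// note: only the sliding-window (SWA) layers apply RoPE below;
// the global-attention layers skip positional encoding entirely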
  8903. // norm
  8904. cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM, il);
  8905. cb(cur, "attn_norm", il);
  8906. ggml_tensor * ffn_inp = cur;
  8907. // self-attention
  8908. {
  8909. // rope freq factors for 128k context
  8910. ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
  8911. // compute Q and K and RoPE them
  8912. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  8913. cb(Qcur, "Qcur", il);
  8914. if (model.layers[il].bq) {
  8915. Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
  8916. cb(Qcur, "Qcur", il);
  8917. }
  8918. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  8919. cb(Kcur, "Kcur", il);
  8920. if (model.layers[il].bk) {
  8921. Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
  8922. cb(Kcur, "Kcur", il);
  8923. }
  8924. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  8925. cb(Vcur, "Vcur", il);
  8926. if (model.layers[il].bv) {
  8927. Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
  8928. cb(Vcur, "Vcur", il);
  8929. }
  8930. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  8931. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  8932. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  8933. if (is_swa) {
  8934. Qcur = ggml_rope_ext(
  8935. ctx0, Qcur, inp_pos, rope_factors,
  8936. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  8937. ext_factor, attn_factor, beta_fast, beta_slow
  8938. );
  8939. Kcur = ggml_rope_ext(
  8940. ctx0, Kcur, inp_pos, rope_factors,
  8941. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  8942. ext_factor, attn_factor, beta_fast, beta_slow
  8943. );
  8944. }
  8945. cb(Qcur, "Qcur", il);
  8946. cb(Kcur, "Kcur", il);
  8947. cb(Vcur, "Vcur", il);
  8948. cur = build_attn(inp_attn,
  8949. model.layers[il].wo, model.layers[il].bo,
  8950. Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  8951. }
  8952. if (il == n_layer - 1 && inp_out_ids) {
  8953. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  8954. inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
  8955. ffn_inp = ggml_get_rows(ctx0, ffn_inp, inp_out_ids);
  8956. }
  8957. ggml_tensor * attn_out = cur;
  8958. // feed-forward network
  8959. {
  8960. cur = build_ffn(ffn_inp, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate,
  8961. NULL, NULL, model.layers[il].ffn_down, NULL, NULL, NULL, LLM_FFN_SILU, LLM_FFN_PAR,
  8962. il);
  8963. cb(cur, "ffn_out", il);
  8964. }
  8965. // add together residual + FFN + self-attention
  8966. cur = ggml_add(ctx0, cur, inpL);
  8967. cur = ggml_add(ctx0, cur, attn_out);
  8968. cur = build_cvec(cur, il);
  8969. cb(cur, "l_out", il);
  8970. // input for next layer
  8971. inpL = cur;
  8972. }
  8973. cur = inpL;
  8974. cur = build_norm(cur, model.output_norm, NULL, LLM_NORM, -1);
  8975. cb(cur, "result_norm", -1);
  8976. res->t_embd = cur;
  8977. // lm_head
  8978. cur = build_lora_mm(model.output, cur);
  8979. if (f_logit_scale) {
  8980. cur = ggml_scale(ctx0, cur, f_logit_scale);
  8981. }
  8982. cb(cur, "result_output", -1);
  8983. res->t_logits = cur;
  8984. ggml_build_forward_expand(gf, cur);
  8985. }
  8986. };
  8987. // ref: https://allenai.org/olmo
  8988. // based on the original build_llama() function, changes:
  8989. // * non-parametric layer norm
  8990. // * clamp qkv
  8991. // * removed bias
  8992. // * removed MoE
  8993. struct llm_build_olmo : public llm_graph_context {
  8994. llm_build_olmo(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  8995. const int64_t n_embd_head = hparams.n_embd_head_v;
  8996. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  8997. GGML_ASSERT(n_embd_head == hparams.n_rot);
  8998. ggml_tensor * cur;
  8999. ggml_tensor * inpL;
  9000. inpL = build_inp_embd(model.tok_embd);
  9001. // inp_pos - contains the positions
  9002. ggml_tensor * inp_pos = build_inp_pos();
  9003. auto * inp_attn = build_attn_inp_kv_unified();
  9004. ggml_tensor * inp_out_ids = build_inp_out_ids();
  9005. for (int il = 0; il < n_layer; ++il) {
  9006. ggml_tensor * inpSA = inpL;
  9007. // norm
  9008. cur = build_norm(inpL,
  9009. NULL, NULL,
  9010. LLM_NORM, il);
  9011. cb(cur, "attn_norm", il);
  9012. // self-attention
  9013. {
  9014. // compute Q and K and RoPE them
  9015. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  9016. cb(Qcur, "Qcur", il);
  9017. if (hparams.f_clamp_kqv > 0.0f) {
  9018. Qcur = ggml_clamp(ctx0, Qcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
  9019. cb(Qcur, "Qcur", il);
  9020. }
  9021. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  9022. cb(Kcur, "Kcur", il);
  9023. if (hparams.f_clamp_kqv > 0.0f) {
  9024. Kcur = ggml_clamp(ctx0, Kcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
  9025. cb(Kcur, "Kcur", il);
  9026. }
  9027. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  9028. cb(Vcur, "Vcur", il);
  9029. if (hparams.f_clamp_kqv > 0.0f) {
  9030. Vcur = ggml_clamp(ctx0, Vcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
  9031. cb(Vcur, "Vcur", il);
  9032. }
  9033. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  9034. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  9035. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  9036. Qcur = ggml_rope_ext(
  9037. ctx0, Qcur, inp_pos, nullptr,
  9038. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  9039. ext_factor, attn_factor, beta_fast, beta_slow
  9040. );
  9041. Kcur = ggml_rope_ext(
  9042. ctx0, Kcur, inp_pos, nullptr,
  9043. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  9044. ext_factor, attn_factor, beta_fast, beta_slow
  9045. );
  9046. cb(Qcur, "Qcur", il);
  9047. cb(Kcur, "Kcur", il);
  9048. cb(Vcur, "Vcur", il);
  9049. cur = build_attn(inp_attn,
  9050. model.layers[il].wo, nullptr,
  9051. Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  9052. }
  9053. if (il == n_layer - 1 && inp_out_ids) {
  9054. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  9055. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  9056. }
  9057. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  9058. cb(ffn_inp, "ffn_inp", il);
  9059. // feed-forward network
  9060. cur = build_norm(ffn_inp,
  9061. NULL, NULL,
  9062. LLM_NORM, il);
  9063. cb(cur, "ffn_norm", il);
  9064. cur = build_ffn(cur,
  9065. model.layers[il].ffn_up, NULL, NULL,
  9066. model.layers[il].ffn_gate, NULL, NULL,
  9067. model.layers[il].ffn_down, NULL, NULL,
  9068. NULL,
  9069. LLM_FFN_SILU, LLM_FFN_PAR, il);
  9070. cb(cur, "ffn_out", il);
  9071. cur = ggml_add(ctx0, cur, ffn_inp);
  9072. cb(cur, "ffn_out", il);
  9073. cur = build_cvec(cur, il);
  9074. cb(cur, "l_out", il);
  9075. // input for next layer
  9076. inpL = cur;
  9077. }
  9078. cur = inpL;
  9079. cur = build_norm(cur,
  9080. NULL, NULL,
  9081. LLM_NORM, -1);
  9082. cb(cur, "result_norm", -1);
  9083. res->t_embd = cur;
  9084. // lm_head
  9085. cur = build_lora_mm(model.output, cur);
  9086. cb(cur, "result_output", -1);
  9087. res->t_logits = cur;
  9088. ggml_build_forward_expand(gf, cur);
  9089. }
  9090. };
  9091. struct llm_build_olmo2 : public llm_graph_context {
  9092. llm_build_olmo2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  9093. const int64_t n_embd_head = hparams.n_embd_head_v;
  9094. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  9095. GGML_ASSERT(n_embd_head == hparams.n_rot);
  9096. ggml_tensor * cur;
  9097. ggml_tensor * inpL;
  9098. inpL = build_inp_embd(model.tok_embd);
  9099. // inp_pos - contains the positions
  9100. ggml_tensor * inp_pos = build_inp_pos();
  9101. auto * inp_attn = build_attn_inp_kv_unified();
  9102. ggml_tensor * inp_out_ids = build_inp_out_ids();
  9103. for (int il = 0; il < n_layer; ++il) {
  9104. ggml_tensor * inpSA = inpL;
  9105. cur = inpL;
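// note: OLMo 2 is post-normalized: there is no norm before the attention block here;
// RMS norms are instead applied to the attention and FFN outputs further down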
  9106. // self_attention
  9107. {
  9108. // compute Q and K and RoPE them
  9109. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  9110. cb(Qcur, "Qcur", il);
  9111. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  9112. cb(Kcur, "Kcur", il);
  9113. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  9114. cb(Vcur, "Vcur", il);
  9115. Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL,
  9116. LLM_NORM_RMS, il);
  9117. cb(Qcur, "Qcur_normed", il);
  9118. Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL,
  9119. LLM_NORM_RMS, il);
  9120. cb(Kcur, "Kcur_normed", il);
  9121. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  9122. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  9123. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  9124. Qcur = ggml_rope_ext(
  9125. ctx0, Qcur, inp_pos, nullptr,
  9126. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  9127. ext_factor, attn_factor, beta_fast, beta_slow
  9128. );
  9129. Kcur = ggml_rope_ext(
  9130. ctx0, Kcur, inp_pos, nullptr,
  9131. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  9132. ext_factor, attn_factor, beta_fast, beta_slow
  9133. );
  9134. cb(Qcur, "Qcur", il);
  9135. cb(Kcur, "Kcur", il);
  9136. cb(Vcur, "Vcur", il);
  9137. cur = build_attn(inp_attn,
  9138. model.layers[il].wo, NULL,
  9139. Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  9140. }
  9141. if (il == n_layer - 1 && inp_out_ids) {
  9142. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  9143. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  9144. }
  9145. cur = build_norm(cur,
  9146. model.layers[il].attn_post_norm, NULL,
  9147. LLM_NORM_RMS, il);
  9148. cb(cur, "attn_post_norm", il);
  9149. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  9150. cb(ffn_inp, "ffn_inp", il);
  9151. // feed-forward network
  9152. cur = build_ffn(ffn_inp,
  9153. model.layers[il].ffn_up, NULL, NULL,
  9154. model.layers[il].ffn_gate, NULL, NULL,
  9155. model.layers[il].ffn_down, NULL, NULL,
  9156. NULL,
  9157. LLM_FFN_SILU, LLM_FFN_PAR, il);
  9158. cb(cur, "ffn_out", il);
  9159. cur = build_norm(cur,
  9160. model.layers[il].ffn_post_norm, NULL,
  9161. LLM_NORM_RMS, -1);
  9162. cb(cur, "ffn_post_norm", -1);
  9163. cur = ggml_add(ctx0, cur, ffn_inp);
  9164. cb(cur, "ffn_out", il);
  9165. cur = build_cvec(cur, il);
  9166. cb(cur, "l_out", il);
  9167. // input for next layer
  9168. inpL = cur;
  9169. }
  9170. cur = inpL;
  9171. cur = build_norm(cur,
  9172. model.output_norm, NULL,
  9173. LLM_NORM_RMS, -1);
  9174. cb(cur, "result_norm", -1);
  9175. res->t_embd = cur;
  9176. // lm_head
  9177. cur = build_lora_mm(model.output, cur);
  9178. cb(cur, "result_output", -1);
  9179. res->t_logits = cur;
  9180. ggml_build_forward_expand(gf, cur);
  9181. }
  9182. };
  9183. // based on the build_qwen2moe() function, changes:
  9184. // * removed shared experts
  9185. // * removed bias
  9186. // * added q, k norm
  9187. struct llm_build_olmoe : public llm_graph_context {
  9188. llm_build_olmoe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  9189. const int64_t n_embd_head = hparams.n_embd_head_v;
  9190. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  9191. GGML_ASSERT(n_embd_head == hparams.n_rot);
  9192. ggml_tensor * cur;
  9193. ggml_tensor * inpL;
  9194. inpL = build_inp_embd(model.tok_embd);
  9195. // inp_pos - contains the positions
  9196. ggml_tensor * inp_pos = build_inp_pos();
  9197. auto * inp_attn = build_attn_inp_kv_unified();
  9198. ggml_tensor * inp_out_ids = build_inp_out_ids();
  9199. for (int il = 0; il < n_layer; ++il) {
  9200. ggml_tensor * inpSA = inpL;
  9201. // norm
  9202. cur = build_norm(inpL,
  9203. model.layers[il].attn_norm, NULL,
  9204. LLM_NORM_RMS, il);
  9205. cb(cur, "attn_norm", il);
  9206. // self_attention
  9207. {
  9208. // compute Q and K and RoPE them
  9209. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  9210. cb(Qcur, "Qcur", il);
  9211. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  9212. cb(Kcur, "Kcur", il);
  9213. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  9214. cb(Vcur, "Vcur", il);
  9215. Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL,
  9216. LLM_NORM_RMS, il);
  9217. cb(Qcur, "Qcur_normed", il);
  9218. Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL,
  9219. LLM_NORM_RMS, il);
  9220. cb(Kcur, "Kcur_normed", il);
  9221. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  9222. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  9223. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  9224. Qcur = ggml_rope_ext(
  9225. ctx0, Qcur, inp_pos, nullptr,
  9226. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  9227. ext_factor, attn_factor, beta_fast, beta_slow
  9228. );
  9229. Kcur = ggml_rope_ext(
  9230. ctx0, Kcur, inp_pos, nullptr,
  9231. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  9232. ext_factor, attn_factor, beta_fast, beta_slow
  9233. );
  9234. cb(Qcur, "Qcur", il);
  9235. cb(Kcur, "Kcur", il);
  9236. cb(Vcur, "Vcur", il);
  9237. cur = build_attn(inp_attn,
  9238. model.layers[il].wo, NULL,
  9239. Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  9240. }
  9241. if (il == n_layer - 1 && inp_out_ids) {
  9242. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  9243. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  9244. }
  9245. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  9246. cb(ffn_inp, "ffn_inp", il);
  9247. // MoE branch
  9248. cur = build_norm(ffn_inp,
  9249. model.layers[il].ffn_norm, NULL,
  9250. LLM_NORM_RMS, il);
  9251. cb(cur, "ffn_norm", il);
  9252. cur = build_moe_ffn(cur,
  9253. model.layers[il].ffn_gate_inp,
  9254. model.layers[il].ffn_up_exps,
  9255. model.layers[il].ffn_gate_exps,
  9256. model.layers[il].ffn_down_exps,
  9257. nullptr,
  9258. n_expert, n_expert_used,
  9259. LLM_FFN_SILU, false,
  9260. false, 0.0,
  9261. LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
  9262. il);
  9263. cb(cur, "ffn_moe_out", il);
  9264. cur = ggml_add(ctx0, cur, ffn_inp);
  9265. cur = build_cvec(cur, il);
  9266. cb(cur, "l_out", il);
  9267. // input for next layer
  9268. inpL = cur;
  9269. }
  9270. cur = inpL;
  9271. cur = build_norm(cur,
  9272. model.output_norm, NULL,
  9273. LLM_NORM_RMS, -1);
  9274. cb(cur, "result_norm", -1);
  9275. res->t_embd = cur;
  9276. // lm_head
  9277. cur = build_lora_mm(model.output, cur);
  9278. cb(cur, "result_output", -1);
  9279. res->t_logits = cur;
  9280. ggml_build_forward_expand(gf, cur);
  9281. }
  9282. };
  9283. struct llm_build_openelm : public llm_graph_context {
  9284. llm_build_openelm(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  9285. const int64_t n_embd_head = hparams.n_embd_head_v;
  9286. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  9287. ggml_tensor * cur;
  9288. ggml_tensor * inpL;
  9289. inpL = build_inp_embd(model.tok_embd);
  9290. // inp_pos - contains the positions
  9291. ggml_tensor * inp_pos = build_inp_pos();
  9292. auto * inp_attn = build_attn_inp_kv_unified();
  9293. ggml_tensor * inp_out_ids = build_inp_out_ids();
  9294. for (int il = 0; il < n_layer; ++il) {
  9295. const int64_t n_head = hparams.n_head(il);
  9296. const int64_t n_head_kv = hparams.n_head_kv(il);
  9297. const int64_t n_head_qkv = 2*n_head_kv + n_head;
  9298. cur = inpL;
  9299. ggml_tensor * residual = cur;
  9300. // norm
  9301. cur = build_norm(inpL,
  9302. model.layers[il].attn_norm, NULL,
  9303. LLM_NORM_RMS, il);
  9304. cb(cur, "attn_norm", il);
  9305. // self-attention
  9306. {
  9307. cur = build_lora_mm(model.layers[il].wqkv, cur);
  9308. cb(cur, "wqkv", il);
  9309. cur = ggml_reshape_3d(ctx0, cur, n_embd_head_k, n_head_qkv, n_tokens);
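// the fused QKV projection packs the heads as [Q heads | K heads | V heads] along
// ne[1]; the three views below slice it at n_head and n_head + n_head_kv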
  9310. ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, cur->nb[1], cur->nb[2], 0);
  9311. cb(Qcur, "Qcur", il);
  9312. ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, cur->nb[1], cur->nb[2], cur->nb[1]*n_head);
  9313. cb(Kcur, "Kcur", il);
  9314. ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, cur->nb[1], cur->nb[2], cur->nb[1]*(n_head+n_head_kv)));
  9315. cb(Vcur, "Vcur", il);
  9316. Qcur = build_norm(Qcur,
  9317. model.layers[il].attn_q_norm, NULL,
  9318. LLM_NORM_RMS, il);
  9319. cb(Qcur, "Qcur", il);
  9320. Kcur = build_norm(Kcur,
  9321. model.layers[il].attn_k_norm, NULL,
  9322. LLM_NORM_RMS, il);
  9323. cb(Kcur, "Kcur", il);
  9324. Qcur = ggml_rope_ext(
  9325. ctx0, Qcur, inp_pos, NULL,
  9326. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  9327. ext_factor, attn_factor, beta_fast, beta_slow
  9328. );
  9329. Kcur = ggml_rope_ext(
  9330. ctx0, Kcur, inp_pos, NULL,
  9331. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  9332. ext_factor, attn_factor, beta_fast, beta_slow
  9333. );
  9334. cb(Qcur, "Qcur", il);
  9335. cb(Kcur, "Kcur", il);
  9336. cb(Qcur, "Vcur", il);
  9337. cur = build_attn(inp_attn,
  9338. model.layers[il].wo, NULL,
  9339. Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  9340. }
  9341. if (il == n_layer - 1 && inp_out_ids) {
  9342. residual = ggml_get_rows(ctx0, residual, inp_out_ids);
  9343. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  9344. }
  9345. ggml_tensor * ffn_inp = ggml_add(ctx0, residual, cur);
  9346. cb(ffn_inp, "ffn_inp", il);
  9347. // feed-forward network
  9348. {
  9349. cur = build_norm(ffn_inp,
  9350. model.layers[il].ffn_norm, NULL,
  9351. LLM_NORM_RMS, il);
  9352. cb(cur, "ffn_norm", il);
  9353. cur = build_ffn(cur,
  9354. model.layers[il].ffn_up, NULL, NULL,
  9355. model.layers[il].ffn_gate, NULL, NULL,
  9356. model.layers[il].ffn_down, NULL, NULL,
  9357. NULL,
  9358. LLM_FFN_SILU, LLM_FFN_PAR, il);
  9359. cb(cur, "ffn_out", il);
  9360. }
  9361. cur = ggml_add(ctx0, cur, ffn_inp);
  9362. cur = build_cvec(cur, il);
  9363. cb(cur, "l_out", il);
  9364. inpL = cur;
  9365. }
  9366. cur = inpL;
  9367. // norm
  9368. cur = build_norm(cur,
  9369. model.output_norm, NULL,
  9370. LLM_NORM_RMS, -1);
  9371. cb(cur, "result_norm", -1);
  9372. res->t_embd = cur;
  9373. cur = build_lora_mm(model.output, cur);
  9374. cb(cur, "result_output", -1);
  9375. res->t_logits = cur;
  9376. ggml_build_forward_expand(gf, cur);
  9377. }
  9378. };
  9379. struct llm_build_gptneox : public llm_graph_context {
  9380. llm_build_gptneox(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  9381. const int64_t n_embd_head = hparams.n_embd_head_v;
  9382. const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
  9383. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  9384. ggml_tensor * cur;
  9385. ggml_tensor * inpL;
  9386. inpL = build_inp_embd(model.tok_embd);
  9387. // inp_pos - contains the positions
  9388. ggml_tensor * inp_pos = build_inp_pos();
  9389. auto * inp_attn = build_attn_inp_kv_unified();
  9390. ggml_tensor * inp_out_ids = build_inp_out_ids();
  9391. for (int il = 0; il < n_layer; ++il) {
  9392. cur = build_norm(inpL,
  9393. model.layers[il].attn_norm,
  9394. model.layers[il].attn_norm_b,
  9395. LLM_NORM, il);
  9396. cb(cur, "attn_norm", il);
  9397. // self-attention
  9398. {
  9399. cur = build_lora_mm(model.layers[il].wqkv, cur);
  9400. cb(cur, "wqkv", il);
  9401. cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
  9402. cb(cur, "bqkv", il);
  9403. ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
  9404. ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
  9405. ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
  9406. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
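// the fused QKV output is laid out as [Q (n_embd) | K (n_embd_gqa) | V (n_embd_gqa)]
// along ne[0]; the views above slice it at those byte offsets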
  9407. Qcur = ggml_rope_ext(
  9408. ctx0, Qcur, inp_pos, nullptr,
  9409. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  9410. ext_factor, attn_factor, beta_fast, beta_slow
  9411. );
  9412. Kcur = ggml_rope_ext(
  9413. ctx0, Kcur, inp_pos, nullptr,
  9414. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  9415. ext_factor, attn_factor, beta_fast, beta_slow
  9416. );
  9417. cb(Qcur, "Qcur", il);
  9418. cb(Kcur, "Kcur", il);
  9419. cb(Vcur, "Vcur", il);
  9420. cur = build_attn(inp_attn,
  9421. model.layers[il].wo, model.layers[il].bo,
  9422. Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  9423. }
  9424. if (il == n_layer - 1 && inp_out_ids) {
  9425. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  9426. inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
  9427. }
  9428. // ffn
  9429. if (hparams.use_par_res) {
  9430. // attention and ffn are computed in parallel
  9431. // x = x + attn(ln1(x)) + ffn(ln2(x))
  9432. ggml_tensor * attn_out = cur;
  9433. cur = build_norm(inpL,
  9434. model.layers[il].ffn_norm,
  9435. model.layers[il].ffn_norm_b,
  9436. LLM_NORM, il);
  9437. cb(cur, "ffn_norm", il);
  9438. cur = build_ffn(cur,
  9439. model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
  9440. NULL, NULL, NULL,
  9441. model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
  9442. NULL,
  9443. LLM_FFN_GELU, LLM_FFN_SEQ, il);
  9444. cb(cur, "ffn_out", il);
  9445. cur = ggml_add(ctx0, cur, inpL);
  9446. cb(cur, "ffn_out", il);
  9447. cur = ggml_add(ctx0, cur, attn_out);
  9448. cur = build_cvec(cur, il);
  9449. cb(cur, "l_out", il);
  9450. // input for next layer
  9451. inpL = cur;
  9452. } else {
  9453. // attention and ffn are computed sequentially
  9454. // x = x + attn(ln1(x))
  9455. // x = x + ffn(ln2(x))
  9456. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
  9457. cb(ffn_inp, "ffn_inp", il);
  9458. cur = build_norm(ffn_inp,
  9459. model.layers[il].ffn_norm,
  9460. model.layers[il].ffn_norm_b,
  9461. LLM_NORM, il);
  9462. cb(cur, "ffn_norm", il);
  9463. cur = build_ffn(cur,
  9464. model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
  9465. NULL, NULL, NULL,
  9466. model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
  9467. NULL,
  9468. LLM_FFN_GELU, LLM_FFN_SEQ, il);
  9469. cb(cur, "ffn_out", il);
  9470. cur = ggml_add(ctx0, cur, ffn_inp);
  9471. cur = build_cvec(cur, il);
  9472. cb(cur, "l_out", il);
  9473. // input for next layer
  9474. inpL = cur;
  9475. }
  9476. }
  9477. cur = build_norm(inpL,
  9478. model.output_norm,
  9479. model.output_norm_b,
  9480. LLM_NORM, -1);
  9481. cb(cur, "result_norm", -1);
  9482. res->t_embd = cur;
  9483. cur = build_lora_mm(model.output, cur);
  9484. cb(cur, "result_output", -1);
  9485. res->t_logits = cur;
  9486. ggml_build_forward_expand(gf, cur);
  9487. }
  9488. };
  9489. struct llm_build_arctic : public llm_graph_context {
  9490. llm_build_arctic(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  9491. const int64_t n_embd_head = hparams.n_embd_head_v;
  9492. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  9493. GGML_ASSERT(n_embd_head == hparams.n_rot);
  9494. ggml_tensor * cur;
  9495. ggml_tensor * inpL;
  9496. inpL = build_inp_embd(model.tok_embd);
  9497. // inp_pos - contains the positions
  9498. ggml_tensor * inp_pos = build_inp_pos();
  9499. auto * inp_attn = build_attn_inp_kv_unified();
  9500. ggml_tensor * inp_out_ids = build_inp_out_ids();
  9501. for (int il = 0; il < n_layer; ++il) {
  9502. ggml_tensor * inpSA = inpL;
  9503. // norm
  9504. cur = build_norm(inpL,
  9505. model.layers[il].attn_norm, NULL,
  9506. LLM_NORM_RMS, il);
  9507. cb(cur, "attn_norm", il);
  9508. // self-attention
  9509. {
  9510. // compute Q and K and RoPE them
  9511. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  9512. cb(Qcur, "Qcur", il);
  9513. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  9514. cb(Kcur, "Kcur", il);
  9515. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  9516. cb(Vcur, "Vcur", il);
  9517. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  9518. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  9519. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  9520. Qcur = ggml_rope_ext(
  9521. ctx0, Qcur, inp_pos, nullptr,
  9522. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  9523. ext_factor, attn_factor, beta_fast, beta_slow
  9524. );
  9525. Kcur = ggml_rope_ext(
  9526. ctx0, Kcur, inp_pos, nullptr,
  9527. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  9528. ext_factor, attn_factor, beta_fast, beta_slow
  9529. );
  9530. cb(Qcur, "Qcur", il);
  9531. cb(Kcur, "Kcur", il);
  9532. cb(Vcur, "Vcur", il);
  9533. cur = build_attn(inp_attn,
  9534. model.layers[il].wo, NULL,
  9535. Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  9536. }
  9537. if (il == n_layer - 1 && inp_out_ids) {
  9538. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  9539. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  9540. }
  9541. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  9542. cb(ffn_inp, "ffn_inp", il);
  9543. // feed-forward network
  9544. cur = build_norm(ffn_inp,
  9545. model.layers[il].ffn_norm, NULL,
  9546. LLM_NORM_RMS, il);
  9547. cb(cur, "ffn_norm", il);
  9548. cur = build_ffn(cur,
  9549. model.layers[il].ffn_up, NULL, NULL,
  9550. model.layers[il].ffn_gate, NULL, NULL,
  9551. model.layers[il].ffn_down, NULL, NULL,
  9552. NULL,
  9553. LLM_FFN_SILU, LLM_FFN_PAR, il);
  9554. cb(cur, "ffn_out", il);
  9555. ggml_tensor * ffn_out = ggml_add(ctx0, cur, ffn_inp);
  9556. cb(ffn_out, "ffn_out", il);
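// note: Arctic runs a dense FFN and a MoE FFN in parallel: the MoE branch below is
// computed from the normalized layer input (inpSA) and added to the dense result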
  9557. // MoE
  9558. cur = build_norm(inpSA,
  9559. model.layers[il].ffn_norm_exps, NULL,
  9560. LLM_NORM_RMS, il);
  9561. cb(cur, "ffn_norm_exps", il);
  9562. cur = build_moe_ffn(cur,
  9563. model.layers[il].ffn_gate_inp,
  9564. model.layers[il].ffn_up_exps,
  9565. model.layers[il].ffn_gate_exps,
  9566. model.layers[il].ffn_down_exps,
  9567. nullptr,
  9568. n_expert, n_expert_used,
  9569. LLM_FFN_SILU, true,
  9570. false, 0.0,
  9571. LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
  9572. il);
  9573. cb(cur, "ffn_moe_out", il);
  9574. cur = ggml_add(ctx0, cur, ffn_out);
  9575. cb(cur, "ffn_out", il);
  9576. cur = build_cvec(cur, il);
  9577. cb(cur, "l_out", il);
  9578. // input for next layer
  9579. inpL = cur;
  9580. }
  9581. cur = inpL;
  9582. cur = build_norm(cur,
  9583. model.output_norm, NULL,
  9584. LLM_NORM_RMS, -1);
  9585. cb(cur, "result_norm", -1);
  9586. res->t_embd = cur;
  9587. // lm_head
  9588. cur = build_lora_mm(model.output, cur);
  9589. cb(cur, "result_output", -1);
  9590. res->t_logits = cur;
  9591. ggml_build_forward_expand(gf, cur);
  9592. }
  9593. };
  9594. struct llm_build_deepseek : public llm_graph_context {
  9595. llm_build_deepseek(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  9596. const int64_t n_embd_head = hparams.n_embd_head_v;
  9597. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  9598. GGML_ASSERT(n_embd_head == hparams.n_rot);
  9599. ggml_tensor * cur;
  9600. ggml_tensor * inpL;
  9601. inpL = build_inp_embd(model.tok_embd);
  9602. // inp_pos - contains the positions
  9603. ggml_tensor * inp_pos = build_inp_pos();
  9604. auto * inp_attn = build_attn_inp_kv_unified();
  9605. const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
  9606. ggml_tensor * inp_out_ids = build_inp_out_ids();
  9607. for (int il = 0; il < n_layer; ++il) {
  9608. ggml_tensor * inpSA = inpL;
  9609. // norm
  9610. cur = build_norm(inpL,
  9611. model.layers[il].attn_norm, NULL,
  9612. LLM_NORM_RMS, il);
  9613. cb(cur, "attn_norm", il);
  9614. // self-attention
  9615. {
// rope frequency factors; get_rope_factors() may return nullptr for models without them
  9617. ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
  9618. // compute Q and K and RoPE them
  9619. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  9620. cb(Qcur, "Qcur", il);
  9621. if (model.layers[il].bq) {
  9622. Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
  9623. cb(Qcur, "Qcur", il);
  9624. }
  9625. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  9626. cb(Kcur, "Kcur", il);
  9627. if (model.layers[il].bk) {
  9628. Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
  9629. cb(Kcur, "Kcur", il);
  9630. }
  9631. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  9632. cb(Vcur, "Vcur", il);
  9633. if (model.layers[il].bv) {
  9634. Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
  9635. cb(Vcur, "Vcur", il);
  9636. }
  9637. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  9638. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  9639. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  9640. Qcur = ggml_rope_ext(
  9641. ctx0, Qcur, inp_pos, rope_factors,
  9642. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  9643. ext_factor, attn_factor, beta_fast, beta_slow
  9644. );
  9645. Kcur = ggml_rope_ext(
  9646. ctx0, Kcur, inp_pos, rope_factors,
  9647. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  9648. ext_factor, attn_factor, beta_fast, beta_slow
  9649. );
  9650. cb(Qcur, "Qcur", il);
  9651. cb(Kcur, "Kcur", il);
  9652. cb(Vcur, "Vcur", il);
  9653. cur = build_attn(inp_attn,
  9654. model.layers[il].wo, model.layers[il].bo,
  9655. Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
  9656. }
  9657. if (il == n_layer - 1 && inp_out_ids) {
  9658. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  9659. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  9660. }
  9661. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  9662. cb(ffn_inp, "ffn_inp", il);
  9663. cur = build_norm(ffn_inp,
  9664. model.layers[il].ffn_norm, NULL,
  9665. LLM_NORM_RMS, il);
  9666. cb(cur, "ffn_norm", il);
  9667. if ((uint32_t) il < hparams.n_layer_dense_lead) {
  9668. cur = build_ffn(cur,
  9669. model.layers[il].ffn_up, NULL, NULL,
  9670. model.layers[il].ffn_gate, NULL, NULL,
  9671. model.layers[il].ffn_down, NULL, NULL,
  9672. NULL,
  9673. LLM_FFN_SILU, LLM_FFN_PAR, il);
  9674. cb(cur, "ffn_out", il);
  9675. } else {
  9676. // MoE branch
  9677. ggml_tensor * moe_out =
  9678. build_moe_ffn(cur,
  9679. model.layers[il].ffn_gate_inp,
  9680. model.layers[il].ffn_up_exps,
  9681. model.layers[il].ffn_gate_exps,
  9682. model.layers[il].ffn_down_exps,
  9683. nullptr,
  9684. n_expert, n_expert_used,
  9685. LLM_FFN_SILU, false,
  9686. false, hparams.expert_weights_scale,
  9687. LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
  9688. il);
  9689. cb(moe_out, "ffn_moe_out", il);
  9690. // FFN shared expert
  9691. {
  9692. ggml_tensor * ffn_shexp = build_ffn(cur,
  9693. model.layers[il].ffn_up_shexp, NULL, NULL,
  9694. model.layers[il].ffn_gate_shexp, NULL, NULL,
  9695. model.layers[il].ffn_down_shexp, NULL, NULL,
  9696. NULL,
  9697. LLM_FFN_SILU, LLM_FFN_PAR, il);
  9698. cb(ffn_shexp, "ffn_shexp", il);
  9699. cur = ggml_add(ctx0, moe_out, ffn_shexp);
  9700. cb(cur, "ffn_out", il);
  9701. }
  9702. }
  9703. cur = ggml_add(ctx0, cur, ffn_inp);
  9704. cur = build_cvec(cur, il);
  9705. cb(cur, "l_out", il);
  9706. // input for next layer
  9707. inpL = cur;
  9708. }
  9709. cur = inpL;
  9710. cur = build_norm(cur,
  9711. model.output_norm, NULL,
  9712. LLM_NORM_RMS, -1);
  9713. cb(cur, "result_norm", -1);
  9714. res->t_embd = cur;
  9715. // lm_head
  9716. cur = build_lora_mm(model.output, cur);
  9717. cb(cur, "result_output", -1);
  9718. res->t_logits = cur;
  9719. ggml_build_forward_expand(gf, cur);
  9720. }
  9721. };
  9722. struct llm_build_deepseek2 : public llm_graph_context {
  9723. llm_build_deepseek2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  9724. bool is_lite = (hparams.n_layer == 27);
  9725. const bool is_mla = (hparams.n_embd_head_k_mla != 0 && hparams.n_embd_head_v_mla != 0);
  9726. // note: these are the actual head sizes you get when treating as MHA or after "decompression" using wv_b for MLA
  9727. const int64_t n_embd_head_k = is_mla ? hparams.n_embd_head_k_mla : hparams.n_embd_head_k;
  9728. const int64_t n_embd_head_v = is_mla ? hparams.n_embd_head_v_mla : hparams.n_embd_head_v;
  9729. const int64_t n_embd_head_qk_rope = hparams.n_rot;
  9730. const int64_t n_embd_head_qk_nope = n_embd_head_k - n_embd_head_qk_rope;
  9731. const uint32_t kv_lora_rank = hparams.n_lora_kv;
  9732. // We have to pre-scale kq_scale and attn_factor to make the YaRN RoPE work correctly.
// See https://github.com/ggerganov/llama.cpp/discussions/7416 for a detailed explanation.
  9734. const float mscale = attn_factor * (1.0f + hparams.rope_yarn_log_mul * logf(1.0f / freq_scale));
  9735. const float kq_scale = 1.0f*mscale*mscale/sqrtf(float(n_embd_head_k));
  9736. const float attn_factor = 1.0f / (1.0f + 0.1f * logf(1.0f / freq_scale));
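// Rough intuition (sketch): folding mscale^2 into kq_scale is equivalent to scaling
// both q and k by mscale before the usual 1/sqrt(d) attention scaling, while the
// reduced attn_factor compensates for the magnitude scaling that ggml_rope_ext()
// would otherwise apply on its own for YaRN (see the discussion linked above).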
  9737. ggml_tensor * cur;
  9738. ggml_tensor * inpL;
  9739. // {n_embd, n_tokens}
  9740. inpL = build_inp_embd(model.tok_embd);
  9741. // inp_pos - contains the positions
  9742. ggml_tensor * inp_pos = build_inp_pos();
  9743. auto * inp_attn = build_attn_inp_kv_unified();
  9744. ggml_tensor * inp_out_ids = build_inp_out_ids();
  9745. for (int il = 0; il < n_layer; ++il) {
  9746. ggml_tensor * inpSA = inpL;
  9747. // norm
  9748. cur = build_norm(inpL,
  9749. model.layers[il].attn_norm, NULL,
  9750. LLM_NORM_RMS, il);
  9751. cb(cur, "attn_norm", il);
  9752. // self_attention
  9753. {
  9754. ggml_tensor * q = NULL;
  9755. if (!is_lite) {
  9756. q = ggml_mul_mat(ctx0, model.layers[il].wq_a, cur);
  9757. cb(q, "q", il);
  9758. q = build_norm(q,
  9759. model.layers[il].attn_q_a_norm, nullptr,
  9760. LLM_NORM_RMS, il);
  9761. cb(q, "q", il);
  9762. q = ggml_mul_mat(ctx0, model.layers[il].wq_b, q);
  9763. cb(q, "q", il);
  9764. } else {
  9765. q = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
  9766. cb(q, "q", il);
  9767. }
  9768. // split into {n_embd_head_qk_nope, n_head, n_tokens}
  9769. ggml_tensor * q_nope = ggml_view_3d(ctx0, q,
  9770. n_embd_head_qk_nope, n_head, n_tokens,
  9771. ggml_row_size(q->type, n_embd_head_k),
  9772. ggml_row_size(q->type, n_embd_head_k) * n_head,
  9773. 0);
  9774. cb(q_nope, "q_nope", il);
  9775. // and {n_embd_head_qk_rope, n_head, n_tokens}
  9776. ggml_tensor * q_pe = ggml_view_3d(ctx0, q,
  9777. n_embd_head_qk_rope, n_head, n_tokens,
  9778. ggml_row_size(q->type, n_embd_head_k),
  9779. ggml_row_size(q->type, n_embd_head_k) * n_head,
  9780. ggml_row_size(q->type, n_embd_head_qk_nope));
  9781. cb(q_pe, "q_pe", il);
  9782. ggml_tensor * kv_cmpr_pe = ggml_mul_mat(ctx0, model.layers[il].wkv_a_mqa, cur);
  9783. cb(kv_cmpr_pe, "kv_cmpr_pe", il);
  9784. // split into {kv_lora_rank, n_tokens}
  9785. ggml_tensor * kv_cmpr = ggml_view_2d(ctx0, kv_cmpr_pe,
  9786. kv_lora_rank, n_tokens,
  9787. ggml_row_size(kv_cmpr_pe->type, kv_lora_rank + n_embd_head_qk_rope),
  9788. 0);
  9789. cb(kv_cmpr, "kv_cmpr", il);
  9790. // and {n_embd_head_qk_rope, 1, n_tokens}
  9791. ggml_tensor * k_pe = ggml_view_3d(ctx0, kv_cmpr_pe,
  9792. n_embd_head_qk_rope, 1, n_tokens,
  9793. ggml_row_size(kv_cmpr_pe->type, kv_lora_rank + n_embd_head_qk_rope),
  9794. ggml_row_size(kv_cmpr_pe->type, kv_lora_rank + n_embd_head_qk_rope),
  9795. ggml_row_size(kv_cmpr_pe->type, kv_lora_rank));
  9796. cb(k_pe, "k_pe", il);
  9797. q_pe = ggml_rope_ext(ctx0, q_pe, inp_pos, nullptr,
  9798. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  9799. ext_factor, attn_factor, beta_fast, beta_slow
  9800. );
  9801. cb(q_pe, "q_pe", il);
  9802. k_pe = ggml_rope_ext(ctx0, k_pe, inp_pos, nullptr,
  9803. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  9804. ext_factor, attn_factor, beta_fast, beta_slow
  9805. );
  9806. cb(k_pe, "k_pe", il);
  9807. kv_cmpr = build_norm(kv_cmpr,
  9808. model.layers[il].attn_kv_a_norm, nullptr,
  9809. LLM_NORM_RMS, il);
  9810. cb(kv_cmpr, "kv_cmpr", il);
  9811. if (is_mla) {
  9812. // {n_embd_head_qk_nope, n_tokens, n_head}
  9813. q_nope = ggml_permute(ctx0, q_nope, 0, 2, 1, 3);
  9814. cb(q_nope, "q_nope_perm", il);
  9815. // {n_embd_head_qk_nope, kv_lora_rank, n_head} x {n_embd_head_qk_nope, n_tokens, n_head}
  9816. ggml_tensor * q_nope_absorbed = ggml_mul_mat(ctx0, model.layers[il].wk_b, q_nope);
  9817. cb(q_nope_absorbed, "q_nope_absorbed", il);
  9818. // {kv_lora_rank, n_head, n_tokens}
  9819. q_nope_absorbed = ggml_permute(ctx0, q_nope_absorbed, 0, 2, 1, 3);
  9820. cb(q_nope_absorbed, "q_nope_absorbed_perm", il);
  9821. // {n_embd_head_qk_rope + kv_lora_rank, n_head, n_tokens}
  9822. // note: rope must go first for in-place context shifting in build_rope_shift()
  9823. ggml_tensor * Qcur = ggml_concat(ctx0, q_pe, q_nope_absorbed, 0);
  9824. cb(Qcur, "Qcur", il);
  9825. kv_cmpr = ggml_reshape_3d(ctx0, kv_cmpr, kv_lora_rank, 1, n_tokens);
  9826. cb(kv_cmpr, "kv_cmpr_reshape", il);
  9827. // {n_embd_head_qk_rope + kv_lora_rank, 1, n_tokens}
  9828. ggml_tensor * Kcur = ggml_concat(ctx0, k_pe, kv_cmpr, 0);
  9829. cb(Kcur, "Kcur", il);
  9830. // {kv_lora_rank, 1, n_tokens}
  9831. ggml_tensor * Vcur = kv_cmpr;
  9832. cb(Vcur, "Vcur", il);
9833. // note: MLA with the absorption optimization converts into MQA (ie: GQA with 1 group)
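// passing wv_b as the value up-projection lets build_attn() attend over the compressed
// kv_lora_rank values and expand them per head only after the attention weights are applied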
  9834. cur = build_attn(inp_attn,
  9835. model.layers[il].wo, NULL,
  9836. Qcur, Kcur, Vcur, nullptr, model.layers[il].wv_b, kq_scale, il);
  9837. } else {
  9838. ggml_tensor * kv = ggml_mul_mat(ctx0, model.layers[il].wkv_b, kv_cmpr);
  9839. cb(kv, "kv", il);
  9840. // split into {n_embd_head_qk_nope, n_head, n_tokens}
  9841. ggml_tensor * k_nope = ggml_view_3d(ctx0, kv,
  9842. n_embd_head_qk_nope, n_head, n_tokens,
  9843. ggml_row_size(kv->type, n_embd_head_qk_nope + n_embd_head_v),
  9844. ggml_row_size(kv->type, n_embd_head_qk_nope + n_embd_head_v) * n_head,
  9845. 0);
  9846. cb(k_nope, "k_nope_view", il);
  9847. // and {n_embd_head_v, n_head, n_tokens}
  9848. ggml_tensor * Vcur = ggml_view_3d(ctx0, kv,
  9849. n_embd_head_v, n_head, n_tokens,
  9850. ggml_row_size(kv->type, n_embd_head_qk_nope + n_embd_head_v),
  9851. ggml_row_size(kv->type, n_embd_head_qk_nope + n_embd_head_v) * n_head,
  9852. ggml_row_size(kv->type, n_embd_head_qk_nope));
  9853. cb(Vcur, "Vcur_view", il);
  9854. Vcur = ggml_cont(ctx0, Vcur);
  9855. cb(Vcur, "Vcur_cont", il);
  9856. // note: rope must go first for in-place context shifting in build_rope_shift()
  9857. ggml_tensor * Qcur = ggml_concat(ctx0, q_pe, q_nope, 0);
  9858. cb(Qcur, "Qcur", il);
  9859. ggml_tensor * Kcur = ggml_concat(ctx0, ggml_repeat(ctx0, k_pe, q_pe), k_nope, 0);
  9860. cb(Kcur, "Kcur", il);
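// the single shared k_pe head is broadcast with ggml_repeat() across all n_head query heads
// before being concatenated with the per-head k_nope part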
  9861. // note: MLA without the absorption optimization converts into MHA (ie: GQA with full n_head groups)
  9862. cur = build_attn(inp_attn,
  9863. model.layers[il].wo, NULL,
  9864. Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
  9865. }
  9866. }
  9867. if (il == n_layer - 1 && inp_out_ids) {
  9868. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  9869. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  9870. }
  9871. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  9872. cb(ffn_inp, "ffn_inp", il);
  9873. cur = build_norm(ffn_inp,
  9874. model.layers[il].ffn_norm, NULL,
  9875. LLM_NORM_RMS, il);
  9876. cb(cur, "ffn_norm", il);
  9877. if ((uint32_t) il < hparams.n_layer_dense_lead) {
  9878. cur = build_ffn(cur,
  9879. model.layers[il].ffn_up, NULL, NULL,
  9880. model.layers[il].ffn_gate, NULL, NULL,
  9881. model.layers[il].ffn_down, NULL, NULL,
  9882. NULL,
  9883. LLM_FFN_SILU, LLM_FFN_PAR, il);
  9884. cb(cur, "ffn_out", il);
  9885. } else {
  9886. // MoE branch
  9887. ggml_tensor * moe_out =
  9888. build_moe_ffn(cur,
  9889. model.layers[il].ffn_gate_inp,
  9890. model.layers[il].ffn_up_exps,
  9891. model.layers[il].ffn_gate_exps,
  9892. model.layers[il].ffn_down_exps,
  9893. model.layers[il].ffn_exp_probs_b,
  9894. n_expert, n_expert_used,
  9895. LLM_FFN_SILU, hparams.expert_weights_norm,
  9896. true, hparams.expert_weights_scale,
  9897. (llama_expert_gating_func_type) hparams.expert_gating_func,
  9898. il);
  9899. cb(moe_out, "ffn_moe_out", il);
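// build_moe_ffn() routes each token to n_expert_used of the n_expert SwiGLU experts using
// ffn_gate_inp (optionally biased by ffn_exp_probs_b), then weights, optionally renormalizes
// (expert_weights_norm) and scales (expert_weights_scale) the selected experts' outputs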
  9900. // FFN shared expert
  9901. {
  9902. ggml_tensor * ffn_shexp = build_ffn(cur,
  9903. model.layers[il].ffn_up_shexp, NULL, NULL,
  9904. model.layers[il].ffn_gate_shexp, NULL, NULL,
  9905. model.layers[il].ffn_down_shexp, NULL, NULL,
  9906. NULL,
  9907. LLM_FFN_SILU, LLM_FFN_PAR, il);
  9908. cb(ffn_shexp, "ffn_shexp", il);
  9909. cur = ggml_add(ctx0, moe_out, ffn_shexp);
  9910. cb(cur, "ffn_out", il);
  9911. }
  9912. }
  9913. cur = ggml_add(ctx0, cur, ffn_inp);
  9914. cur = build_cvec(cur, il);
  9915. cb(cur, "l_out", il);
  9916. // input for next layer
  9917. inpL = cur;
  9918. }
  9919. cur = inpL;
  9920. cur = build_norm(cur,
  9921. model.output_norm, NULL,
  9922. LLM_NORM_RMS, -1);
  9923. cb(cur, "result_norm", -1);
  9924. res->t_embd = cur;
  9925. // lm_head
  9926. cur = ggml_mul_mat(ctx0, model.output, cur);
  9927. cb(cur, "result_output", -1);
  9928. res->t_logits = cur;
  9929. ggml_build_forward_expand(gf, cur);
  9930. }
  9931. };
  9932. struct llm_build_bitnet : public llm_graph_context {
  9933. llm_build_bitnet(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  9934. const int64_t n_embd_head = hparams.n_embd_head_v;
  9935. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  9936. ggml_tensor * cur;
  9937. ggml_tensor * inpL;
  9938. inpL = build_inp_embd(model.tok_embd);
  9939. // inp_pos - contains the positions
  9940. ggml_tensor * inp_pos = build_inp_pos();
  9941. auto * inp_attn = build_attn_inp_kv_unified();
  9942. ggml_tensor * inp_out_ids = build_inp_out_ids();
  9943. for (int il = 0; il < n_layer; ++il) {
  9944. ggml_tensor * inpSA = inpL;
  9945. cur = build_norm(inpL,
  9946. model.layers[il].attn_norm, NULL,
  9947. LLM_NORM_RMS, il);
  9948. cb(cur, "attn_norm", il);
  9949. // self-attention
  9950. {
  9951. // compute Q and K and RoPE them
  9952. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  9953. if (model.layers[il].wq_scale) {
  9954. Qcur = ggml_mul(ctx0, Qcur, model.layers[il].wq_scale);
  9955. }
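// BitNet-style ternary weights: the optional w*_scale tensors are per-tensor scales applied
// after the matmul to restore the magnitude removed by the 1.58-bit quantization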
  9956. cb(Qcur, "Qcur", il);
  9957. if (model.layers[il].bq) {
  9958. Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
  9959. cb(Qcur, "Qcur", il);
  9960. }
  9961. // B1.K
  9962. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  9963. if (model.layers[il].wk_scale) {
  9964. Kcur = ggml_mul(ctx0, Kcur, model.layers[il].wk_scale);
  9965. }
  9966. cb(Kcur, "Kcur", il);
  9967. if (model.layers[il].bk) {
  9968. Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
  9969. cb(Kcur, "Kcur", il);
  9970. }
  9971. // B1.V
  9972. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  9973. if (model.layers[il].wv_scale) {
  9974. Vcur = ggml_mul(ctx0, Vcur, model.layers[il].wv_scale);
  9975. }
  9976. cb(Vcur, "Vcur", il);
  9977. if (model.layers[il].bv) {
  9978. Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
  9979. cb(Vcur, "Vcur", il);
  9980. }
  9981. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  9982. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  9983. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  9984. Qcur = ggml_rope_ext(
  9985. ctx0, Qcur, inp_pos, nullptr,
  9986. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  9987. ext_factor, attn_factor, beta_fast, beta_slow
  9988. );
  9989. Kcur = ggml_rope_ext(
  9990. ctx0, Kcur, inp_pos, nullptr,
  9991. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  9992. ext_factor, attn_factor, beta_fast, beta_slow
  9993. );
  9994. cb(Qcur, "Qcur", il);
  9995. cb(Kcur, "Kcur", il);
  9996. cb(Vcur, "Vcur", il);
  9997. cur = build_attn(inp_attn,
  9998. NULL, NULL,
  9999. Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  10000. cur = build_norm(cur,
  10001. model.layers[il].attn_sub_norm, NULL,
  10002. LLM_NORM_RMS, il);
  10003. cb(cur, "attn_sub_norm", il);
  10004. cur = build_lora_mm(model.layers[il].wo, cur);
  10005. if (model.layers[il].wo_scale) {
  10006. cur = ggml_mul(ctx0, cur, model.layers[il].wo_scale);
  10007. }
  10008. if (model.layers[il].bo) {
  10009. cur = ggml_add(ctx0, cur, model.layers[il].bo);
  10010. }
  10011. cb(cur, "attn_o_out", il);
  10012. }
  10013. if (il == n_layer - 1 && inp_out_ids) {
  10014. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  10015. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  10016. }
  10017. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  10018. cb(ffn_inp, "ffn_inp", il);
10019. // feed-forward network
  10020. cur = build_norm(ffn_inp,
  10021. model.layers[il].ffn_norm, NULL,
  10022. LLM_NORM_RMS, il);
  10023. cb(cur, "ffn_norm", il);
  10024. cur = build_ffn(cur,
  10025. model.layers[il].ffn_up, NULL, model.layers[il].ffn_up_scale,
  10026. model.layers[il].ffn_gate, NULL, model.layers[il].ffn_gate_scale,
  10027. NULL, NULL, NULL,
  10028. NULL,
  10029. LLM_FFN_SILU, LLM_FFN_PAR, il);
  10030. cb(cur, "ffn_sub_out", il);
  10031. cur = build_norm(cur,
  10032. model.layers[il].ffn_sub_norm, NULL,
  10033. LLM_NORM_RMS, il);
  10034. cb(cur, "ffn_sub_norm", il);
  10035. cur = build_lora_mm(model.layers[il].ffn_down, cur);
  10036. if (model.layers[il].ffn_down_scale) {
  10037. cur = ggml_mul(ctx0, cur, model.layers[il].ffn_down_scale);
  10038. }
  10039. cb(cur, "ffn_down", il);
  10040. cur = ggml_add(ctx0, cur, ffn_inp);
  10041. cb(cur, "l_out", il);
  10042. // input for next layer
  10043. inpL = cur;
  10044. }
  10045. cur = inpL;
  10046. cur = build_norm(cur,
  10047. model.output_norm, NULL,
  10048. LLM_NORM_RMS, -1);
  10049. cb(cur, "result_norm", -1);
  10050. res->t_embd = cur;
  10051. // lm_head
  10052. // FIXME: do not use model.tok_embd directly, duplicate as model.output
  10053. cur = build_lora_mm(model.tok_embd, cur);
  10054. cb(cur, "result_output", -1);
  10055. res->t_logits = cur;
  10056. ggml_build_forward_expand(gf, cur);
  10057. }
  10058. };
  10059. struct llm_build_t5_enc : public llm_graph_context {
  10060. llm_build_t5_enc(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  10061. const int64_t n_embd_head = hparams.n_embd_head_v;
  10062. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  10063. ggml_tensor * cur;
  10064. ggml_tensor * inpL;
  10065. inpL = build_inp_embd(model.tok_embd);
  10066. ggml_tensor * pos_bucket_enc = build_inp_pos_bucket_enc();
  10067. auto * inp_attn = build_attn_inp_no_cache();
  10068. ggml_tensor * inp_out_ids = build_inp_out_ids();
  10069. for (int il = 0; il < n_layer; ++il) {
  10070. ggml_tensor * inpSA = inpL;
  10071. // norm
  10072. cur = build_norm(inpL,
  10073. model.layers[il].attn_norm_enc, NULL,
  10074. LLM_NORM_RMS, il);
  10075. cb(cur, "attn_norm", il);
  10076. // self-attention
  10077. {
  10078. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq_enc, cur);
  10079. cb(Qcur, "Qcur", il);
  10080. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk_enc, cur);
  10081. cb(Kcur, "Kcur", il);
  10082. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv_enc, cur);
  10083. cb(Vcur, "Vcur", il);
  10084. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  10085. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  10086. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  10087. ggml_tensor * attn_rel_b = model.layers[il].attn_rel_b_enc ? model.layers[il].attn_rel_b_enc : model.layers[0].attn_rel_b_enc;
  10088. ggml_tensor * kq_b = build_pos_bias(pos_bucket_enc, attn_rel_b);
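// T5 adds a learned relative-position bias to the attention logits (falling back to the
// layer-0 bias when a layer has none of its own) instead of scaling them, hence the
// kq_scale of 1.0f passed to build_attn() below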
  10089. cur = build_attn(inp_attn,
  10090. model.layers[il].wo_enc, nullptr,
  10091. Qcur, Kcur, Vcur, kq_b, nullptr, 1.0f, il);
  10092. cb(cur, "kqv_out", il);
  10093. }
  10094. if (il == n_layer - 1 && inp_out_ids) {
  10095. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  10096. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  10097. }
  10098. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  10099. cb(ffn_inp, "ffn_inp", il);
  10100. // feed-forward network
  10101. {
  10102. cur = build_norm(ffn_inp,
  10103. model.layers[il].ffn_norm_enc, NULL,
  10104. LLM_NORM_RMS, il);
  10105. cb(cur, "ffn_norm", il);
  10106. // T5 uses relu, flan-T5 uses gelu-gated
  10107. cur = build_ffn(cur,
  10108. model.layers[il].ffn_up_enc, NULL, NULL,
  10109. model.layers[il].ffn_gate_enc, NULL, NULL,
  10110. model.layers[il].ffn_down_enc, NULL, NULL,
  10111. NULL,
  10112. model.layers[il].ffn_gate_enc ? LLM_FFN_GELU : LLM_FFN_RELU,
  10113. model.layers[il].ffn_gate_enc ? LLM_FFN_PAR : LLM_FFN_SEQ,
  10114. il);
  10115. cb(cur, "ffn_out", il);
  10116. }
  10117. cur = ggml_add(ctx0, cur, ffn_inp);
  10118. cb(cur, "ffn_out", il);
  10119. cur = build_cvec(cur, il);
  10120. cb(cur, "l_out", il);
  10121. // input for next layer
  10122. inpL = cur;
  10123. }
  10124. cur = inpL;
  10125. cb(cur, "result_embd", -1);
  10126. cur = build_norm(cur,
  10127. model.output_norm_enc, NULL,
  10128. LLM_NORM_RMS, -1);
  10129. cb(cur, "result_norm", -1);
  10130. res->t_embd = cur;
  10131. ggml_build_forward_expand(gf, cur);
  10132. }
  10133. };
  10134. struct llm_build_t5_dec : public llm_graph_context {
  10135. llm_build_t5_dec(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  10136. const int64_t n_embd_head = hparams.n_embd_head_v;
  10137. //const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
  10138. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  10139. ggml_tensor * cur;
  10140. ggml_tensor * inpL;
  10141. inpL = build_inp_embd(model.tok_embd);
  10142. ggml_tensor * embd_enc = build_inp_cross_embd();
  10143. ggml_tensor * pos_bucket_dec = build_inp_pos_bucket_dec();
  10144. const int64_t n_outputs_enc = embd_enc->ne[1];
  10145. auto * inp_attn_self = build_attn_inp_kv_unified();
  10146. auto * inp_attn_cross = build_attn_inp_cross();
  10147. ggml_tensor * inp_out_ids = build_inp_out_ids();
  10148. for (int il = 0; il < n_layer; ++il) {
  10149. ggml_tensor * inpSA = inpL;
  10150. // norm
  10151. cur = build_norm(inpL,
  10152. model.layers[il].attn_norm, NULL,
  10153. LLM_NORM_RMS, il);
  10154. cb(cur, "attn_norm", il);
  10155. // self-attention
  10156. {
  10157. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  10158. cb(Qcur, "Qcur", il);
  10159. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  10160. cb(Kcur, "Kcur", il);
  10161. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  10162. cb(Vcur, "Vcur", il);
  10163. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  10164. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  10165. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  10166. ggml_tensor * attn_rel_b = model.layers[il].attn_rel_b ? model.layers[il].attn_rel_b : model.layers[0].attn_rel_b;
  10167. ggml_tensor * kq_b = build_pos_bias(pos_bucket_dec, attn_rel_b);
  10168. cur = build_attn(inp_attn_self,
  10169. model.layers[il].wo, model.layers[il].bo,
  10170. Qcur, Kcur, Vcur, kq_b, nullptr, 1.0f, il);
  10171. cb(cur, "kqv_out", il);
  10172. }
  10173. cur = ggml_add(ctx0, cur, inpSA);
  10174. cb(cur, "cross_inp", il);
  10175. ggml_tensor * inpCA = cur;
  10176. // norm
  10177. cur = build_norm(cur,
  10178. model.layers[il].attn_norm_cross, NULL,
  10179. LLM_NORM_RMS, il);
  10180. cb(cur, "attn_norm_cross", il);
  10181. // cross-attention
  10182. {
  10183. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq_cross, cur);
  10184. cb(Qcur, "Qcur", il);
  10185. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk_cross, embd_enc);
  10186. cb(Kcur, "Kcur", il);
  10187. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv_cross, embd_enc);
  10188. cb(Vcur, "Vcur", il);
  10189. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  10190. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_outputs_enc);
  10191. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_outputs_enc);
  10192. cur = build_attn(inp_attn_cross,
  10193. model.layers[il].wo_cross, nullptr,
  10194. Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
  10195. cb(cur, "kqv_out", il);
  10196. //ggml_tensor * q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
  10197. //ggml_tensor * k = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 0, 2, 1, 3));
  10198. //ggml_tensor * kq = ggml_mul_mat(ctx0, k, q);
  10199. //cb(kq, "kq", il);
  10200. //kq = ggml_soft_max_ext(ctx0, kq, KQ_mask_cross, 1.0f, hparams.f_max_alibi_bias);
  10201. //cb(kq, "kq_soft_max_ext", il);
  10202. //ggml_tensor * v = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd_gqa, n_outputs_enc)));
  10203. //cb(v, "v", il);
  10204. //ggml_tensor * kqv = ggml_mul_mat(ctx0, ggml_reshape_3d(ctx0, v, n_outputs_enc, n_embd_head, n_head_kv), kq);
  10205. //cb(kqv, "kqv", il);
  10206. //ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3);
  10207. //cb(kqv_merged, "kqv_merged", il);
  10208. //cur = ggml_cont_2d(ctx0, kqv_merged, n_embd_gqa, n_tokens);
  10209. //cb(cur, "kqv_merged_cont", il);
  10210. //ggml_build_forward_expand(gf, cur);
  10211. //cur = build_lora_mm(model.layers[il].wo_cross, cur);
  10212. //cb(cur, "kqv_out", il);
  10213. }
  10214. if (il == n_layer - 1 && inp_out_ids) {
  10215. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  10216. inpCA = ggml_get_rows(ctx0, inpCA, inp_out_ids);
  10217. }
  10218. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpCA);
  10219. cb(ffn_inp, "ffn_inp", il);
  10220. // feed-forward network
  10221. {
  10222. cur = build_norm(ffn_inp,
  10223. model.layers[il].ffn_norm, NULL,
  10224. LLM_NORM_RMS, il);
  10225. cb(cur, "ffn_norm", il);
  10226. // T5 uses relu, flan-T5 uses gelu-gated
  10227. cur = build_ffn(cur,
  10228. model.layers[il].ffn_up, NULL, NULL,
  10229. model.layers[il].ffn_gate, NULL, NULL,
  10230. model.layers[il].ffn_down, NULL, NULL,
  10231. NULL,
  10232. model.layers[il].ffn_gate_enc ? LLM_FFN_GELU : LLM_FFN_RELU,
  10233. model.layers[il].ffn_gate_enc ? LLM_FFN_PAR : LLM_FFN_SEQ,
  10234. il);
  10235. cb(cur, "ffn_out", il);
  10236. }
  10237. cur = ggml_add(ctx0, cur, ffn_inp);
  10238. cb(cur, "ffn_out", il);
  10239. cur = build_cvec(cur, il);
  10240. cb(cur, "l_out", il);
  10241. // input for next layer
  10242. inpL = cur;
  10243. }
  10244. cur = inpL;
  10245. cb(cur, "result_embd", -1);
  10246. cur = build_norm(cur,
  10247. model.output_norm, NULL,
  10248. LLM_NORM_RMS, -1);
  10249. cb(cur, "result_norm", -1);
  10250. res->t_embd = cur;
  10251. // lm_head
  10252. cur = build_lora_mm(model.output, cur);
  10253. cb(cur, "result_output", -1);
  10254. res->t_logits = cur;
  10255. ggml_build_forward_expand(gf, cur);
  10256. }
  10257. };
  10258. struct llm_build_jais : public llm_graph_context {
  10259. llm_build_jais(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  10260. const int64_t n_embd_head = hparams.n_embd_head_v;
  10261. const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
  10262. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  10263. ggml_tensor * cur;
  10264. ggml_tensor * inpL;
  10265. inpL = build_inp_embd(model.tok_embd);
  10266. auto * inp_attn = build_attn_inp_kv_unified();
  10267. ggml_tensor * inp_out_ids = build_inp_out_ids();
  10268. for (int il = 0; il < n_layer; ++il) {
  10269. cur = build_norm(inpL,
  10270. model.layers[il].attn_norm,
  10271. model.layers[il].attn_norm_b,
  10272. LLM_NORM, il);
  10273. cb(cur, "attn_norm", il);
  10274. // self-attention
  10275. {
  10276. cur = build_lora_mm(model.layers[il].wqkv, cur);
  10277. cb(cur, "wqkv", il);
  10278. cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
  10279. cb(cur, "bqkv", il);
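// the fused wqkv output is packed per token as [n_embd (Q) | n_embd_gqa (K) | n_embd_gqa (V)],
// so the views below slice it at element offsets 0, n_embd and n_embd + n_embd_gqa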
  10280. ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*cur->nb[0]*(n_embd)));
  10281. ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*cur->nb[0]*(n_embd)));
  10282. ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*cur->nb[0]*(n_embd + n_embd_gqa)));
  10283. cb(Qcur, "Qcur", il);
  10284. cb(Kcur, "Kcur", il);
  10285. cb(Vcur, "Vcur", il);
  10286. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  10287. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  10288. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  10289. cur = build_attn(inp_attn,
  10290. model.layers[il].wo, model.layers[il].bo,
  10291. Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/float(n_embd_head), il);
  10292. }
  10293. if (il == n_layer - 1 && inp_out_ids) {
  10294. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  10295. inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
  10296. }
  10297. // add the input
  10298. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
  10299. cb(ffn_inp, "ffn_inp", il);
  10300. // FF
  10301. {
  10302. cur = build_norm(ffn_inp,
  10303. model.layers[il].ffn_norm,
  10304. model.layers[il].ffn_norm_b,
  10305. LLM_NORM, il);
  10306. cb(cur, "ffn_norm", il);
  10307. cur = build_ffn(cur,
  10308. model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
  10309. model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
  10310. model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
  10311. NULL,
  10312. LLM_FFN_SILU, LLM_FFN_PAR, il);
  10313. cb(cur, "ffn_out", il);
  10314. }
  10315. inpL = ggml_add(ctx0, cur, ffn_inp);
  10316. cb(inpL, "l_out", il);
  10317. }
  10318. cur = build_norm(inpL,
  10319. model.output_norm,
  10320. model.output_norm_b,
  10321. LLM_NORM, -1);
  10322. cb(cur, "result_norm", -1);
  10323. res->t_embd = cur;
  10324. cur = build_lora_mm(model.output, cur);
  10325. cb(cur, "result_output", -1);
  10326. res->t_logits = cur;
  10327. ggml_build_forward_expand(gf, cur);
  10328. }
  10329. };
  10330. struct llm_build_chatglm : public llm_graph_context {
  10331. llm_build_chatglm(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  10332. const int64_t n_embd_head = hparams.n_embd_head_v;
  10333. const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
  10334. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  10335. ggml_tensor * cur;
  10336. ggml_tensor * inpL;
  10337. inpL = build_inp_embd(model.tok_embd);
  10338. // inp_pos - contains the positions
  10339. ggml_tensor * inp_pos = build_inp_pos();
  10340. auto * inp_attn = build_attn_inp_kv_unified();
  10341. ggml_tensor * inp_out_ids = build_inp_out_ids();
  10342. for (int il = 0; il < n_layer; ++il) {
  10343. ggml_tensor * inpSA = inpL;
  10344. cur = build_norm(inpL,
  10345. model.layers[il].attn_norm,
  10346. NULL,
  10347. LLM_NORM_RMS, il);
  10348. cb(cur, "attn_norm", il);
  10349. // self-attention
  10350. {
  10351. ggml_tensor * Qcur = nullptr;
  10352. ggml_tensor * Kcur = nullptr;
  10353. ggml_tensor * Vcur = nullptr;
  10354. if (model.layers[il].wqkv == nullptr) {
  10355. Qcur = build_lora_mm(model.layers[il].wq, cur);
  10356. if (model.layers[il].bq) {
  10357. Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
  10358. }
  10359. Kcur = build_lora_mm(model.layers[il].wk, cur);
  10360. if (model.layers[il].bk) {
  10361. Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
  10362. }
  10363. Vcur = build_lora_mm(model.layers[il].wv, cur);
  10364. if (model.layers[il].bv) {
  10365. Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
  10366. }
  10367. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  10368. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  10369. } else {
  10370. cur = build_lora_mm(model.layers[il].wqkv, cur);
  10371. cb(cur, "wqkv", il);
  10372. if (model.layers[il].bqkv) {
  10373. cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
  10374. cb(cur, "bqkv", il);
  10375. }
  10376. Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
  10377. Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
  10378. Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
  10379. }
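// with a fused wqkv, Q and K are taken as strided 3D views of the packed output while V is
// sliced out and made contiguous; either branch leaves V to be reshaped below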
  10380. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  10381. //printf("freq_base: %f freq_scale: %f ext_factor: %f attn_factor: %f\n", freq_base, freq_scale, ext_factor, attn_factor);
  10382. Qcur = ggml_rope_ext(
  10383. ctx0, Qcur, inp_pos, nullptr,
  10384. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  10385. ext_factor, attn_factor, beta_fast, beta_slow
  10386. );
  10387. Kcur = ggml_rope_ext(
  10388. ctx0, Kcur, inp_pos, nullptr,
  10389. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  10390. ext_factor, attn_factor, beta_fast, beta_slow
  10391. );
  10392. cb(Qcur, "Qcur", il);
  10393. cb(Kcur, "Kcur", il);
  10394. cb(Vcur, "Vcur", il);
  10395. cur = build_attn(inp_attn,
  10396. model.layers[il].wo, NULL,
  10397. Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  10398. }
  10399. if (il == n_layer - 1 && inp_out_ids) {
  10400. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  10401. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  10402. }
  10403. // Add the input
  10404. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  10405. cb(ffn_inp, "ffn_inp", il);
  10406. // FF
  10407. {
  10408. cur = build_norm(ffn_inp,
  10409. model.layers[il].ffn_norm,
  10410. NULL,
  10411. LLM_NORM_RMS, il);
  10412. cb(cur, "ffn_norm", il);
  10413. cur = build_ffn(cur,
  10414. model.layers[il].ffn_up, NULL, NULL,
  10415. NULL, NULL, NULL,
  10416. model.layers[il].ffn_down, NULL, NULL,
  10417. NULL,
  10418. LLM_FFN_SWIGLU, LLM_FFN_SEQ, il);
  10419. cb(cur, "ffn_out", il);
  10420. }
  10421. inpL = ggml_add(ctx0, cur, ffn_inp);
  10422. cb(inpL, "l_out", il);
  10423. }
  10424. cur = build_norm(inpL,
  10425. model.output_norm,
  10426. NULL,
  10427. LLM_NORM_RMS, -1);
  10428. cb(cur, "result_norm", -1);
  10429. res->t_embd = cur;
  10430. cur = build_lora_mm(model.output, cur);
  10431. cb(cur, "result_output", -1);
  10432. res->t_logits = cur;
  10433. ggml_build_forward_expand(gf, cur);
  10434. }
  10435. };
  10436. struct llm_build_glm4 : public llm_graph_context {
  10437. llm_build_glm4(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  10438. const int64_t n_embd_head = hparams.n_embd_head_v;
  10439. const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
  10440. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  10441. ggml_tensor * cur;
  10442. ggml_tensor * inpL;
  10443. inpL = build_inp_embd(model.tok_embd);
  10444. // inp_pos - contains the positions
  10445. ggml_tensor * inp_pos = build_inp_pos();
  10446. auto * inp_attn = build_attn_inp_kv_unified();
  10447. ggml_tensor * inp_out_ids = build_inp_out_ids();
  10448. for (int il = 0; il < n_layer; ++il) {
  10449. ggml_tensor * inpSA = inpL;
  10450. // Pre-attention norm
  10451. cur = build_norm(inpL,
  10452. model.layers[il].attn_norm,
  10453. NULL,
  10454. LLM_NORM_RMS, il);
  10455. cb(cur, "attn_norm", il);
  10456. // self-attention
  10457. {
  10458. ggml_tensor * Qcur = nullptr;
  10459. ggml_tensor * Kcur = nullptr;
  10460. ggml_tensor * Vcur = nullptr;
  10461. if (model.layers[il].wqkv == nullptr) {
  10462. Qcur = build_lora_mm(model.layers[il].wq, cur);
  10463. if (model.layers[il].bq) {
  10464. Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
  10465. }
  10466. Kcur = build_lora_mm(model.layers[il].wk, cur);
  10467. if (model.layers[il].bk) {
  10468. Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
  10469. }
  10470. Vcur = build_lora_mm(model.layers[il].wv, cur);
  10471. if (model.layers[il].bv) {
  10472. Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
  10473. }
  10474. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  10475. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  10476. } else {
  10477. cur = build_lora_mm(model.layers[il].wqkv, cur);
  10478. cb(cur, "wqkv", il);
  10479. if (model.layers[il].bqkv) {
  10480. cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
  10481. cb(cur, "bqkv", il);
  10482. }
  10483. Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
  10484. Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
  10485. Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
  10486. }
  10487. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  10488. Qcur = ggml_rope_ext(
  10489. ctx0, Qcur, inp_pos, nullptr,
  10490. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  10491. ext_factor, attn_factor, beta_fast, beta_slow
  10492. );
  10493. Kcur = ggml_rope_ext(
  10494. ctx0, Kcur, inp_pos, nullptr,
  10495. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  10496. ext_factor, attn_factor, beta_fast, beta_slow
  10497. );
  10498. cb(Qcur, "Qcur", il);
  10499. cb(Kcur, "Kcur", il);
  10500. cb(Vcur, "Vcur", il);
  10501. cur = build_attn(inp_attn,
  10502. model.layers[il].wo, NULL,
  10503. Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  10504. }
  10505. if (il == n_layer - 1 && inp_out_ids) {
  10506. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  10507. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  10508. }
  10509. // Post-attention norm (new!)
  10510. cur = build_norm(cur,
  10511. model.layers[il].attn_post_norm,
  10512. NULL,
  10513. LLM_NORM_RMS, il);
  10514. cb(cur, "post_attn_norm", il);
  10515. // Add the input (residual connection after post-attention norm)
  10516. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  10517. cb(ffn_inp, "ffn_inp", il);
  10518. // FF
  10519. {
  10520. // Pre-MLP norm
  10521. cur = build_norm(ffn_inp,
  10522. model.layers[il].ffn_norm,
  10523. NULL,
  10524. LLM_NORM_RMS, il);
  10525. cb(cur, "ffn_norm", il);
  10526. // MLP
  10527. cur = build_ffn(cur,
  10528. model.layers[il].ffn_up, NULL, NULL,
  10529. NULL, NULL, NULL,
  10530. model.layers[il].ffn_down, NULL, NULL,
  10531. NULL,
  10532. LLM_FFN_SWIGLU, LLM_FFN_SEQ, il);
  10533. cb(cur, "ffn_out", il);
  10534. // Post-MLP norm
  10535. cur = build_norm(cur,
  10536. model.layers[il].ffn_post_norm,
  10537. NULL,
  10538. LLM_NORM_RMS, il);
  10539. cb(cur, "post_mlp_norm", il);
  10540. }
  10541. // Add residual connection after post-MLP norm
  10542. inpL = ggml_add(ctx0, cur, ffn_inp);
  10543. cb(inpL, "l_out", il);
  10544. }
  10545. // Final norm
  10546. cur = build_norm(inpL,
  10547. model.output_norm,
  10548. NULL,
  10549. LLM_NORM_RMS, -1);
  10550. cb(cur, "result_norm", -1);
  10551. res->t_embd = cur;
  10552. // Output projection
  10553. cur = build_lora_mm(model.output, cur);
  10554. cb(cur, "result_output", -1);
  10555. res->t_logits = cur;
  10556. ggml_build_forward_expand(gf, cur);
  10557. }
  10558. };
  10559. struct llm_build_nemotron : public llm_graph_context {
  10560. llm_build_nemotron(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  10561. const int64_t n_embd_head = hparams.n_embd_head_v;
  10562. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  10563. //GGML_ASSERT(n_embd_head == hparams.n_rot);
  10564. ggml_tensor * cur;
  10565. ggml_tensor * inpL;
  10566. inpL = build_inp_embd(model.tok_embd);
  10567. // inp_pos - contains the positions
  10568. ggml_tensor * inp_pos = build_inp_pos();
  10569. auto * inp_attn = build_attn_inp_kv_unified();
  10570. ggml_tensor * inp_out_ids = build_inp_out_ids();
  10571. for (int il = 0; il < n_layer; ++il) {
  10572. ggml_tensor * inpSA = inpL;
  10573. // norm
  10574. cur = build_norm(inpL,
  10575. model.layers[il].attn_norm,
  10576. model.layers[il].attn_norm_b,
  10577. LLM_NORM, il);
  10578. cb(cur, "attn_norm", il);
  10579. // self-attention
  10580. {
  10581. // compute Q and K and RoPE them
  10582. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  10583. cb(Qcur, "Qcur", il);
  10584. if (model.layers[il].bq) {
  10585. Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
  10586. cb(Qcur, "Qcur", il);
  10587. }
  10588. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  10589. cb(Kcur, "Kcur", il);
  10590. if (model.layers[il].bk) {
  10591. Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
  10592. cb(Kcur, "Kcur", il);
  10593. }
  10594. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  10595. cb(Vcur, "Vcur", il);
  10596. if (model.layers[il].bv) {
  10597. Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
  10598. cb(Vcur, "Vcur", il);
  10599. }
  10600. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  10601. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  10602. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  10603. Qcur = ggml_rope_ext(
  10604. ctx0, Qcur, inp_pos, nullptr,
  10605. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  10606. ext_factor, attn_factor, beta_fast, beta_slow
  10607. );
  10608. Kcur = ggml_rope_ext(
  10609. ctx0, Kcur, inp_pos, nullptr,
  10610. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  10611. ext_factor, attn_factor, beta_fast, beta_slow
  10612. );
  10613. cb(Qcur, "Qcur", il);
  10614. cb(Kcur, "Kcur", il);
  10615. cb(Vcur, "Vcur", il);
  10616. cur = build_attn(inp_attn,
  10617. model.layers[il].wo, model.layers[il].bo,
  10618. Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  10619. }
  10620. if (il == n_layer - 1 && inp_out_ids) {
  10621. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  10622. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  10623. }
  10624. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  10625. cb(ffn_inp, "ffn_inp", il);
  10626. // feed-forward network
  10627. cur = build_norm(ffn_inp,
  10628. model.layers[il].ffn_norm,
  10629. model.layers[il].ffn_norm_b,
  10630. LLM_NORM, il);
  10631. cb(cur, "ffn_norm", il);
  10632. cur = build_ffn(cur,
  10633. model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
  10634. NULL, NULL, NULL,
  10635. model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
  10636. NULL,
  10637. LLM_FFN_RELU_SQR, LLM_FFN_SEQ, il);
  10638. cur = ggml_add(ctx0, cur, ffn_inp);
  10639. cb(cur, "ffn_out", il);
  10640. cur = build_cvec(cur, il);
  10641. cb(cur, "l_out", il);
  10642. // input for next layer
  10643. inpL = cur;
  10644. }
  10645. cur = inpL;
  10646. cur = build_norm(cur,
  10647. model.output_norm, model.output_norm_b,
  10648. LLM_NORM, -1);
  10649. cb(cur, "result_norm", -1);
  10650. res->t_embd = cur;
  10651. // lm_head
  10652. cur = build_lora_mm(model.output, cur);
  10653. cb(cur, "result_output", -1);
  10654. res->t_logits = cur;
  10655. ggml_build_forward_expand(gf, cur);
  10656. }
  10657. };
  10658. struct llm_build_exaone : public llm_graph_context {
  10659. llm_build_exaone(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  10660. const int64_t n_embd_head = hparams.n_embd_head_v;
  10661. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  10662. GGML_ASSERT(n_embd_head == hparams.n_rot);
  10663. ggml_tensor * cur;
  10664. ggml_tensor * inpL;
  10665. inpL = build_inp_embd(model.tok_embd);
  10666. // inp_pos - contains the positions
  10667. ggml_tensor * inp_pos = build_inp_pos();
  10668. auto * inp_attn = build_attn_inp_kv_unified();
  10669. ggml_tensor * inp_out_ids = build_inp_out_ids();
  10670. for (int il = 0; il < n_layer; ++il) {
  10671. ggml_tensor * inpSA = inpL;
  10672. // norm
  10673. cur = build_norm(inpL,
  10674. model.layers[il].attn_norm, NULL,
  10675. LLM_NORM_RMS, il);
  10676. cb(cur, "attn_norm", il);
  10677. // self-attention
  10678. {
  10679. // rope freq factors for llama3; may return nullptr for llama2 and other models
  10680. ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
  10681. // compute Q and K and RoPE them
  10682. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  10683. cb(Qcur, "Qcur", il);
  10684. if (model.layers[il].bq) {
  10685. Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
  10686. cb(Qcur, "Qcur", il);
  10687. }
  10688. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  10689. cb(Kcur, "Kcur", il);
  10690. if (model.layers[il].bk) {
  10691. Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
  10692. cb(Kcur, "Kcur", il);
  10693. }
  10694. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  10695. cb(Vcur, "Vcur", il);
  10696. if (model.layers[il].bv) {
  10697. Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
  10698. cb(Vcur, "Vcur", il);
  10699. }
  10700. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  10701. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  10702. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  10703. Qcur = ggml_rope_ext(
  10704. ctx0, Qcur, inp_pos, rope_factors,
  10705. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  10706. ext_factor, attn_factor, beta_fast, beta_slow
  10707. );
  10708. Kcur = ggml_rope_ext(
  10709. ctx0, Kcur, inp_pos, rope_factors,
  10710. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  10711. ext_factor, attn_factor, beta_fast, beta_slow
  10712. );
  10713. cb(Qcur, "Qcur", il);
  10714. cb(Kcur, "Kcur", il);
  10715. cb(Vcur, "Vcur", il);
  10716. cur = build_attn(inp_attn,
  10717. model.layers[il].wo, model.layers[il].bo,
  10718. Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  10719. }
  10720. if (il == n_layer - 1 && inp_out_ids) {
  10721. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  10722. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  10723. }
  10724. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  10725. cb(ffn_inp, "ffn_inp", il);
  10726. // feed-forward network
  10727. cur = build_norm(ffn_inp,
  10728. model.layers[il].ffn_norm, NULL,
  10729. LLM_NORM_RMS, il);
  10730. cb(cur, "ffn_norm", il);
  10731. cur = build_ffn(cur,
  10732. model.layers[il].ffn_up, NULL, NULL,
  10733. model.layers[il].ffn_gate, NULL, NULL,
  10734. model.layers[il].ffn_down, NULL, NULL,
  10735. NULL,
  10736. LLM_FFN_SILU, LLM_FFN_PAR, il);
  10737. cb(cur, "ffn_out", il);
  10738. cur = ggml_add(ctx0, cur, ffn_inp);
  10739. cb(cur, "ffn_out", il);
  10740. cur = build_cvec(cur, il);
  10741. cb(cur, "l_out", il);
  10742. // input for next layer
  10743. inpL = cur;
  10744. }
  10745. cur = inpL;
  10746. cur = build_norm(cur,
  10747. model.output_norm, NULL,
  10748. LLM_NORM_RMS, -1);
  10749. cb(cur, "result_norm", -1);
  10750. res->t_embd = cur;
  10751. // lm_head
  10752. cur = build_lora_mm(model.output, cur);
  10753. cb(cur, "result_output", -1);
  10754. res->t_logits = cur;
  10755. ggml_build_forward_expand(gf, cur);
  10756. }
  10757. };
  10758. template <bool iswa>
  10759. struct llm_build_exaone4 : public llm_graph_context {
  10760. llm_build_exaone4(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  10761. const int64_t n_embd_head = hparams.n_embd_head_k;
  10762. GGML_ASSERT(n_embd_head == hparams.n_embd_head_v);
  10763. GGML_ASSERT(n_embd_head == hparams.n_rot);
  10764. ggml_tensor * cur;
  10765. ggml_tensor * inpL;
  10766. inpL = build_inp_embd(model.tok_embd);
  10767. // inp_pos - contains the positions
  10768. ggml_tensor * inp_pos = build_inp_pos();
  10769. using inp_attn_type = std::conditional_t<iswa, llm_graph_input_attn_kv_unified_iswa, llm_graph_input_attn_kv_unified>;
  10770. inp_attn_type * inp_attn = nullptr;
  10771. if constexpr (iswa) {
  10772. inp_attn = build_attn_inp_kv_unified_iswa();
  10773. } else {
  10774. inp_attn = build_attn_inp_kv_unified();
  10775. }
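// iswa is a compile-time flag: the same builder is instantiated for models with and without
// interleaved sliding-window attention, differing only in the KV-cache input type used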
  10776. ggml_tensor * inp_out_ids = build_inp_out_ids();
  10777. for (int il = 0; il < n_layer; ++il) {
  10778. ggml_tensor * inpSA = inpL;
  10779. // use RoPE for SWA layers or non-SWA models
  10780. const bool use_rope = hparams.is_swa(il) || hparams.swa_type == LLAMA_SWA_TYPE_NONE;
  10781. cur = inpL;
  10782. // self-attention
  10783. {
  10784. ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
  10785. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  10786. cb(Qcur, "Qcur", il);
  10787. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  10788. cb(Kcur, "Kcur", il);
  10789. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  10790. cb(Vcur, "Vcur", il);
  10791. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  10792. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  10793. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  10794. Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
  10795. Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
  10796. cb(Qcur, "Qcur_normed", il);
  10797. cb(Kcur, "Kcur_normed", il);
  10798. if (use_rope) {
  10799. Qcur = ggml_rope_ext(
  10800. ctx0, Qcur, inp_pos, rope_factors,
  10801. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  10802. ext_factor, attn_factor, beta_fast, beta_slow
  10803. );
  10804. Kcur = ggml_rope_ext(
  10805. ctx0, Kcur, inp_pos, rope_factors,
  10806. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  10807. ext_factor, attn_factor, beta_fast, beta_slow
  10808. );
  10809. }
  10810. cb(Qcur, "Qcur", il);
  10811. cb(Kcur, "Kcur", il);
  10812. cb(Vcur, "Vcur", il);
  10813. cur = build_attn(inp_attn,
  10814. model.layers[il].wo, NULL,
  10815. Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  10816. cb(cur, "attn_out", il);
  10817. }
  10818. if (il == n_layer - 1 && inp_out_ids) {
  10819. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  10820. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  10821. }
  10822. cur = build_norm(cur,
  10823. model.layers[il].attn_post_norm, NULL,
  10824. LLM_NORM_RMS, il);
  10825. cb(cur, "attn_post_norm", il);
  10826. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  10827. cb(ffn_inp, "ffn_inp", il);
  10828. // feed-forward network
  10829. cur = build_ffn(ffn_inp,
  10830. model.layers[il].ffn_up, NULL, NULL,
  10831. model.layers[il].ffn_gate, NULL, NULL,
  10832. model.layers[il].ffn_down, NULL, NULL,
  10833. NULL,
  10834. LLM_FFN_SILU, LLM_FFN_PAR, il);
  10835. cb(cur, "ffn_out", il);
  10836. cur = build_norm(cur,
  10837. model.layers[il].ffn_post_norm, NULL,
10838. LLM_NORM_RMS, il);
10839. cb(cur, "ffn_post_norm", il);
  10840. cur = ggml_add(ctx0, cur, ffn_inp);
  10841. cur = build_cvec(cur, il);
  10842. cb(cur, "l_out", il);
  10843. // input for next layer
  10844. inpL = cur;
  10845. }
  10846. cur = inpL;
  10847. cur = build_norm(cur,
  10848. model.output_norm, NULL,
  10849. LLM_NORM_RMS, -1);
  10850. cb(cur, "result_norm", -1);
  10851. res->t_embd = cur;
  10852. // lm_head
  10853. cur = build_lora_mm(model.output, cur);
  10854. cb(cur, "result_output", -1);
  10855. res->t_logits = cur;
  10856. ggml_build_forward_expand(gf, cur);
  10857. }
  10858. };
  10859. struct llm_build_rwkv6_base : public llm_graph_context {
  10860. const llama_model & model;
  10861. llm_build_rwkv6_base(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params), model(model) {
  10862. }
  10863. ggml_tensor * build_rwkv6_channel_mix(
  10864. const llama_layer * layer,
  10865. ggml_tensor * cur,
  10866. ggml_tensor * x_prev,
  10867. llm_arch arch) const {
  10868. ggml_tensor * sx = ggml_sub(ctx0, x_prev, cur);
  10869. switch (arch) {
  10870. case LLM_ARCH_RWKV6:
  10871. {
  10872. ggml_tensor * xk = ggml_add(ctx0, ggml_mul(ctx0, sx, layer->channel_mix_lerp_k), cur);
  10873. ggml_tensor * xr = ggml_add(ctx0, ggml_mul(ctx0, sx, layer->channel_mix_lerp_r), cur);
  10874. ggml_tensor * r = ggml_sigmoid(ctx0, build_lora_mm(layer->channel_mix_receptance, xr));
  10875. ggml_tensor * k = ggml_sqr(
  10876. ctx0,
  10877. ggml_relu(
  10878. ctx0,
  10879. build_lora_mm(layer->channel_mix_key, xk)
  10880. )
  10881. );
  10882. cur = ggml_mul(ctx0, r, build_lora_mm(layer->channel_mix_value, k));
  10883. } break;
  10884. default:
  10885. GGML_ABORT("fatal error");
  10886. }
  10887. return cur;
  10888. }
  10889. ggml_tensor * build_rwkv6_time_mix(
  10890. llm_graph_input_rs * inp,
  10891. ggml_tensor * cur,
  10892. ggml_tensor * x_prev,
  10893. const llama_ubatch & ubatch,
  10894. int il) const {
  10895. const auto * mctx_cur = static_cast<const llama_memory_recurrent_context *>(mctx);
  10896. const auto n_tokens = ubatch.n_tokens;
  10897. const auto n_seqs = ubatch.n_seqs;
  10898. const auto n_seq_tokens = ubatch.n_seq_tokens;
  10899. const auto n_embd = hparams.n_embd;
  10900. const auto head_size = hparams.wkv_head_size;
  10901. const auto n_head = n_embd / head_size;
  10902. const auto n_head_kv = hparams.n_head_kv(il);
  10903. const auto kv_head = mctx_cur->get_head();
  10904. const auto & layer = model.layers[il];
  10905. bool is_qrwkv = layer.time_mix_first == nullptr;
  10906. ggml_tensor * sx = ggml_sub(ctx0, x_prev, cur);
  10907. sx = ggml_reshape_2d(ctx0, sx, n_embd, n_tokens);
  10908. cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens);
  10909. ggml_tensor * xxx = ggml_add(ctx0, ggml_mul(ctx0, sx, layer.time_mix_lerp_x), cur);
  10910. xxx = ggml_reshape_4d(
  10911. ctx0,
  10912. ggml_tanh(
  10913. ctx0,
  10914. ggml_mul_mat(ctx0, layer.time_mix_w1, xxx)
  10915. ),
  10916. layer.time_mix_w1->ne[1] / 5, 1, 5, n_tokens
  10917. );
  10918. xxx = ggml_cont(ctx0, ggml_permute(ctx0, xxx, 0, 1, 3, 2));
  10919. xxx = ggml_mul_mat(
  10920. ctx0,
  10921. ggml_reshape_4d(
  10922. ctx0,
  10923. layer.time_mix_w2,
  10924. layer.time_mix_w2->ne[0], layer.time_mix_w2->ne[1], 1, 5
  10925. ),
  10926. xxx
  10927. );
  10928. ggml_tensor *xw, *xk, *xv, *xr, *xg;
  10929. if (layer.time_mix_lerp_fused) {
10930. // fusing these lerp weights into a single tensor gives a small performance improvement
  10931. sx = ggml_reshape_3d(ctx0, sx, n_embd, 1, n_tokens);
  10932. cur = ggml_reshape_3d(ctx0, cur, n_embd, 1, n_tokens);
  10933. xxx = ggml_add(ctx0, ggml_mul(ctx0, ggml_add(ctx0, xxx, layer.time_mix_lerp_fused), sx), cur);
  10934. xw = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], 0);
  10935. xk = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * sizeof(float));
  10936. xv = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 2 * sizeof(float));
  10937. xr = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 3 * sizeof(float));
  10938. xg = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 4 * sizeof(float));
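// the projected xxx tensor stores the five interpolation streams (w, k, v, r, g) back to
// back, n_embd * n_tokens floats each, so stream i is viewed at byte offset
// i * n_embd * n_tokens * sizeof(float)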
  10939. } else {
  10940. // for backward compatibility
  10941. xw = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], 0);
  10942. xk = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * sizeof(float));
  10943. xv = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 2 * sizeof(float));
  10944. xr = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 3 * sizeof(float));
  10945. xg = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 4 * sizeof(float));
  10946. xw = ggml_add(ctx0, ggml_mul(ctx0, ggml_add(ctx0, xw, layer.time_mix_lerp_w), sx), cur);
  10947. xk = ggml_add(ctx0, ggml_mul(ctx0, ggml_add(ctx0, xk, layer.time_mix_lerp_k), sx), cur);
  10948. xv = ggml_add(ctx0, ggml_mul(ctx0, ggml_add(ctx0, xv, layer.time_mix_lerp_v), sx), cur);
  10949. xr = ggml_add(ctx0, ggml_mul(ctx0, ggml_add(ctx0, xr, layer.time_mix_lerp_r), sx), cur);
  10950. xg = ggml_add(ctx0, ggml_mul(ctx0, ggml_add(ctx0, xg, layer.time_mix_lerp_g), sx), cur);
  10951. }
  10952. ggml_tensor * r = build_lora_mm(layer.time_mix_receptance, xr);
  10953. ggml_tensor * k = build_lora_mm(layer.time_mix_key, xk);
  10954. ggml_tensor * v = build_lora_mm(layer.time_mix_value, xv);
  10955. if (layer.time_mix_receptance_b) {
  10956. r = ggml_add(ctx0, r, layer.time_mix_receptance_b);
  10957. }
  10958. if (layer.time_mix_key_b) {
  10959. k = ggml_add(ctx0, k, layer.time_mix_key_b);
  10960. }
  10961. if (layer.time_mix_value_b) {
  10962. v = ggml_add(ctx0, v, layer.time_mix_value_b);
  10963. }
  10964. ggml_tensor * g = build_lora_mm(layer.time_mix_gate, xg);
  10965. if (is_qrwkv) {
  10966. g = ggml_sigmoid(ctx0, g);
  10967. } else {
  10968. g = ggml_silu(ctx0, g);
  10969. }
  10970. if (n_head_kv != 0 && n_head_kv != n_head) {
  10971. GGML_ASSERT(n_head % n_head_kv == 0);
  10972. k = ggml_reshape_4d(ctx0, k, head_size, 1, n_head_kv, n_tokens);
  10973. v = ggml_reshape_4d(ctx0, v, head_size, 1, n_head_kv, n_tokens);
  10974. ggml_tensor * tmp = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, head_size, n_head / n_head_kv, n_head_kv, n_tokens);
  10975. k = ggml_repeat(ctx0, k, tmp);
  10976. v = ggml_repeat(ctx0, v, tmp);
  10977. }
  10978. k = ggml_reshape_3d(ctx0, k, head_size, n_head, n_tokens);
  10979. v = ggml_reshape_3d(ctx0, v, head_size, n_head, n_tokens);
  10980. r = ggml_reshape_3d(ctx0, r, head_size, n_head, n_tokens);
  10981. ggml_tensor * w = ggml_mul_mat(
  10982. ctx0,
  10983. layer.time_mix_decay_w2,
  10984. ggml_tanh(
  10985. ctx0,
  10986. ggml_mul_mat(ctx0, layer.time_mix_decay_w1, xw)
  10987. )
  10988. );
  10989. w = ggml_add(ctx0, w, layer.time_mix_decay);
  10990. w = ggml_exp(ctx0, ggml_neg(ctx0, ggml_exp(ctx0, w)));
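// the decay logit is mapped into (0, 1) via exp(-exp(w)): w = 0 gives a decay of ~0.37,
// strongly negative w approaches 1 (slow forgetting), strongly positive w approaches 0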
  10991. w = ggml_reshape_3d(ctx0, w, head_size, n_head, n_tokens);
  10992. if (is_qrwkv) {
  10993. // k = k * (1 - w)
  10994. k = ggml_sub(ctx0, k, ggml_mul(ctx0, k, w));
  10995. }
  10996. ggml_tensor * wkv_state = build_rs(
  10997. inp, mctx_cur->get_s_l(il),
  10998. hparams.n_embd_s(), n_seqs);
  10999. ggml_tensor * wkv_output;
  11000. if (is_qrwkv) {
  11001. wkv_output = ggml_gated_linear_attn(ctx0, k, v, r, w, wkv_state, pow(head_size, -0.5f));
  11002. } else {
  11003. wkv_output = ggml_rwkv_wkv6(ctx0, k, v, r, layer.time_mix_first, w, wkv_state);
  11004. }
  11005. cur = ggml_view_1d(ctx0, wkv_output, n_embd * n_tokens, 0);
  11006. wkv_state = ggml_view_1d(ctx0, wkv_output, n_embd * head_size * n_seqs, n_embd * n_tokens * sizeof(float));
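// wkv_output packs the per-token outputs first (n_embd * n_tokens floats) followed by the
// updated recurrent state (n_embd * head_size * n_seqs floats), which is copied back into
// this layer's recurrent memory slot below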
  11007. ggml_build_forward_expand(
  11008. gf,
  11009. ggml_cpy(
  11010. ctx0,
  11011. wkv_state,
  11012. ggml_view_1d(
  11013. ctx0,
  11014. mctx_cur->get_s_l(il),
  11015. hparams.n_embd_s() * n_seqs,
  11016. hparams.n_embd_s() * kv_head * ggml_element_size(mctx_cur->get_s_l(il))
  11017. )
  11018. )
  11019. );
  11020. if (!is_qrwkv) {
  11021. // group norm with head_count groups
  11022. cur = ggml_reshape_3d(ctx0, cur, n_embd / n_head, n_head, n_tokens);
  11023. cur = ggml_norm(ctx0, cur, 64e-5f);
  11024. // Convert back to regular vectors.
  11025. cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens);
  11026. cur = ggml_add(ctx0, ggml_mul(ctx0, cur, layer.time_mix_ln), layer.time_mix_ln_b);
  11027. } else {
  11028. cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens);
  11029. }
  11030. cur = ggml_mul(ctx0, cur, g);
  11031. cur = build_lora_mm(layer.time_mix_output, cur);
  11032. return ggml_reshape_3d(ctx0, cur, n_embd, n_seq_tokens, n_seqs);
  11033. }
  11034. };
  11035. struct llm_build_rwkv6 : public llm_build_rwkv6_base {
  11036. llm_build_rwkv6(const llama_model & model, const llm_graph_params & params) : llm_build_rwkv6_base(model, params) {
  11037. GGML_ASSERT(hparams.token_shift_count == 2);
  11038. ggml_tensor * cur;
  11039. ggml_tensor * inpL;
  11040. inpL = build_inp_embd(model.tok_embd);
  11041. inpL = build_norm(inpL, model.tok_norm, model.tok_norm_b, LLM_NORM, -1);
  11042. auto * rs_inp = build_rs_inp();
  11043. const auto n_embd = hparams.n_embd;
  11044. const auto n_seq_tokens = ubatch.n_seq_tokens;
  11045. const auto n_seqs = ubatch.n_seqs;
  11046. ggml_tensor * inp_out_ids = build_inp_out_ids();
  11047. for (int il = 0; il < n_layer; ++il) {
  11048. const llama_layer * layer = &model.layers[il];
  11049. inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs);
  11050. ggml_tensor * token_shift = build_rwkv_token_shift_load(rs_inp, ubatch, il);
  11051. ggml_tensor * att_shift = ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], 0);
  11052. ggml_tensor * ffn_shift = ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], n_embd * ggml_element_size(token_shift));
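// token_shift caches the last token of the previous step for each sequence: the first n_embd
// values feed the time-mix (attention) branch, the next n_embd values feed the channel-mix (FFN) branch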
  11053. ggml_tensor * att_norm = build_norm(inpL, layer->attn_norm, layer->attn_norm_b, LLM_NORM, il);
  11054. cb(att_norm, "attn_norm", il);
  11055. ggml_tensor * x_prev = ggml_concat(
  11056. ctx0,
  11057. att_shift,
  11058. ggml_view_3d(ctx0, att_norm, n_embd, n_seq_tokens - 1, n_seqs, att_norm->nb[1], att_norm->nb[2], 0),
  11059. 1
  11060. );
  11061. cur = build_rwkv6_time_mix(rs_inp, att_norm, x_prev, ubatch, il);
  11062. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
  11063. cb(ffn_inp, "ffn_inp", il);
  11064. ggml_tensor * ffn_norm = build_norm(ffn_inp, layer->attn_norm_2, layer->attn_norm_2_b, LLM_NORM, il);
  11065. cb(ffn_norm, "ffn_norm", il);
  11066. x_prev = ggml_concat(
  11067. ctx0,
  11068. ffn_shift,
  11069. ggml_view_3d(ctx0, ffn_norm, n_embd, n_seq_tokens - 1, n_seqs, ffn_norm->nb[1], ffn_norm->nb[2], 0),
  11070. 1
  11071. );
  11072. token_shift = ggml_concat(ctx0,
  11073. ggml_view_3d(ctx0, att_norm, n_embd, 1, n_seqs, att_norm->nb[1], att_norm->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(att_norm)),
  11074. ggml_view_3d(ctx0, ffn_norm, n_embd, 1, n_seqs, ffn_norm->nb[1], ffn_norm->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(ffn_norm)),
  11075. 1
  11076. );
  11077. ggml_build_forward_expand(gf, build_rwkv_token_shift_store(token_shift, ubatch, il));
  11078. ffn_inp = ggml_reshape_2d(ctx0, ffn_inp, n_embd, n_tokens);
  11079. ffn_norm = ggml_reshape_2d(ctx0, ffn_norm, n_embd, n_tokens);
  11080. x_prev = ggml_reshape_2d(ctx0, x_prev, n_embd, n_tokens);
  11081. cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens);
  11082. if (il == n_layer - 1 && inp_out_ids) {
  11083. ffn_inp = ggml_get_rows(ctx0, ffn_inp, inp_out_ids);
  11084. ffn_norm = ggml_get_rows(ctx0, ffn_norm, inp_out_ids);
  11085. x_prev = ggml_get_rows(ctx0, x_prev, inp_out_ids);
  11086. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  11087. }
  11088. cur = build_rwkv6_channel_mix(layer, ffn_norm, x_prev, LLM_ARCH_RWKV6);
  11089. cur = ggml_add(ctx0, cur, ffn_inp);
  11090. if (hparams.rescale_every_n_layers != 0 && (il + 1) % hparams.rescale_every_n_layers == 0) {
  11091. cur = ggml_scale(ctx0, cur, 0.5F);
  11092. }
  11093. cur = build_cvec(cur, il);
  11094. cb(cur, "l_out", il);
  11095. // input for next layer
  11096. inpL = cur;
  11097. }
  11098. cur = inpL;
  11099. cur = build_norm(cur, model.output_norm, model.output_norm_b, LLM_NORM, -1);
  11100. cb(cur, "result_norm", -1);
  11101. res->t_embd = cur;
  11102. cur = build_lora_mm(model.output, cur);
  11103. cb(cur, "result_output", -1);
  11104. res->t_logits = cur;
  11105. ggml_build_forward_expand(gf, cur);
  11106. }
  11107. };
  11108. // ref: https://huggingface.co/recursal/QRWKV6-32B-Instruct-Preview-v0.1/blob/main/modeling_rwkv6qwen2.py
  11109. struct llm_build_rwkv6qwen2 : public llm_build_rwkv6_base {
  11110. llm_build_rwkv6qwen2(const llama_model & model, const llm_graph_params & params) : llm_build_rwkv6_base(model, params) {
  11111. GGML_ASSERT(n_embd == hparams.n_embd_r());
  11112. ggml_tensor * cur;
  11113. ggml_tensor * inpL;
  11114. inpL = build_inp_embd(model.tok_embd);
  11115. auto * rs_inp = build_rs_inp();
  11116. const auto n_embd = hparams.n_embd;
  11117. const auto n_seq_tokens = ubatch.n_seq_tokens;
  11118. const auto n_seqs = ubatch.n_seqs;
  11119. ggml_tensor * inp_out_ids = build_inp_out_ids();
  11120. for (int il = 0; il < n_layer; ++il) {
  11121. const llama_layer * layer = &model.layers[il];
  11122. inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs);
  11123. ggml_tensor * token_shift = build_rwkv_token_shift_load(rs_inp, ubatch, il);
  11124. ggml_tensor * att_norm = build_norm(inpL, layer->attn_norm, layer->attn_norm_b, LLM_NORM_RMS, il);
  11125. cb(att_norm, "attn_norm", il);
  11126. ggml_tensor * x_prev = ggml_concat(
  11127. ctx0,
  11128. token_shift,
  11129. ggml_view_3d(ctx0, att_norm, n_embd, n_seq_tokens - 1, n_seqs, att_norm->nb[1], att_norm->nb[2], 0),
  11130. 1
  11131. );
  11132. cur = build_rwkv6_time_mix(rs_inp, att_norm, x_prev, ubatch, il);
  11133. token_shift = ggml_view_3d(ctx0, att_norm, n_embd, 1, n_seqs, att_norm->nb[1], att_norm->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(att_norm));
  11134. ggml_build_forward_expand(gf, build_rwkv_token_shift_store(token_shift, ubatch, il));
  11135. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
  11136. cb(ffn_inp, "ffn_inp", il);
  11137. cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens);
  11138. ffn_inp = ggml_reshape_2d(ctx0, ffn_inp, n_embd, n_tokens);
  11139. if (il == n_layer - 1 && inp_out_ids) {
  11140. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  11141. ffn_inp = ggml_get_rows(ctx0, ffn_inp, inp_out_ids);
  11142. }
  11143. // feed-forward network
  11144. cur = build_norm(ffn_inp,
  11145. model.layers[il].ffn_norm, NULL,
  11146. LLM_NORM_RMS, il);
  11147. cb(cur, "ffn_norm", il);
  11148. cur = build_ffn(cur,
  11149. model.layers[il].ffn_up, NULL, NULL,
  11150. model.layers[il].ffn_gate, NULL, NULL,
  11151. model.layers[il].ffn_down, NULL, NULL,
  11152. NULL,
  11153. LLM_FFN_SILU, LLM_FFN_PAR, il);
  11154. cb(cur, "ffn_out", il);
  11155. cur = ggml_add(ctx0, cur, ffn_inp);
  11156. cur = build_cvec(cur, il);
  11157. cb(cur, "l_out", il);
  11158. // input for next layer
  11159. inpL = cur;
  11160. }
  11161. cur = inpL;
  11162. cur = build_norm(cur, model.output_norm, model.output_norm_b, LLM_NORM_RMS, -1);
  11163. cb(cur, "result_norm", -1);
  11164. res->t_embd = cur;
  11165. cur = build_lora_mm(model.output, cur);
  11166. cb(cur, "result_output", -1);
  11167. res->t_logits = cur;
  11168. ggml_build_forward_expand(gf, cur);
  11169. }
  11170. };
  11171. struct llm_build_rwkv7_base : public llm_graph_context {
  11172. const llama_model & model;
  11173. llm_build_rwkv7_base(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params), model(model) {
  11174. }
  11175. ggml_tensor * build_rwkv7_channel_mix(
  11176. const llama_layer * layer,
  11177. ggml_tensor * cur,
  11178. ggml_tensor * x_prev,
  11179. llm_arch arch) const {
  11180. ggml_tensor * sx = ggml_sub(ctx0, x_prev, cur);
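// RWKV7 channel mix: interpolate between the current and previous token (token shift),
// then apply a squared-ReLU feed-forward: out = W_v * relu(W_k * xk)^2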
  11181. switch (arch) {
  11182. case LLM_ARCH_RWKV7:
  11183. {
  11184. ggml_tensor * xk = ggml_add(ctx0, ggml_mul(ctx0, sx, layer->channel_mix_lerp_k), cur);
  11185. ggml_tensor * k = ggml_sqr(
  11186. ctx0,
  11187. ggml_relu(
  11188. ctx0,
  11189. build_lora_mm(layer->channel_mix_key, xk)
  11190. )
  11191. );
  11192. cur = build_lora_mm(layer->channel_mix_value, k);
  11193. } break;
  11194. default:
  11195. GGML_ABORT("fatal error");
  11196. }
  11197. return cur;
  11198. }
  11199. ggml_tensor * build_rwkv7_time_mix(
  11200. llm_graph_input_rs * inp,
  11201. ggml_tensor * cur,
  11202. ggml_tensor * x_prev,
  11203. ggml_tensor *& first_layer_value,
  11204. const llama_ubatch & ubatch,
  11205. int il) const {
  11206. const auto * mctx_cur = static_cast<const llama_memory_recurrent_context *>(mctx);
  11207. const auto n_tokens = ubatch.n_tokens;
  11208. const auto n_seqs = ubatch.n_seqs;
  11209. const auto n_embd = hparams.n_embd;
  11210. const auto head_size = hparams.wkv_head_size;
  11211. const auto head_count = n_embd / head_size;
  11212. const auto n_seq_tokens = ubatch.n_seq_tokens;
  11213. const auto kv_head = mctx_cur->get_head();
  11214. const auto & layer = model.layers[il];
  11215. bool has_gating = layer.time_mix_g1 && layer.time_mix_g2;
  11216. ggml_tensor * sx = ggml_sub(ctx0, x_prev, cur);
  11217. ggml_tensor * dummy = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_embd, n_seq_tokens, n_seqs, has_gating ? 6 : 5);
  11218. sx = ggml_repeat(ctx0, sx, dummy);
  11219. ggml_tensor * xxx = ggml_add(ctx0, ggml_mul(ctx0, sx, layer.time_mix_lerp_fused), cur);
  11220. ggml_tensor * xr = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], 0);
  11221. ggml_tensor * xw = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * sizeof(float));
  11222. ggml_tensor * xk = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 2 * sizeof(float));
  11223. ggml_tensor * xv = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 3 * sizeof(float));
  11224. ggml_tensor * xa = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 4 * sizeof(float));
  11225. ggml_tensor * xg = has_gating ? ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 5 * sizeof(float)) : nullptr;
  11226. ggml_tensor * r = build_lora_mm(layer.time_mix_receptance, xr);
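// per-channel decay: w = exp(-exp(-0.5) * sigmoid(w0 + W2 * tanh(W1 * xw)));
// the constant -0.606531 below is -exp(-0.5), which keeps each decay factor in roughly (0.545, 1)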
  11227. ggml_tensor * w = ggml_add(
  11228. ctx0,
  11229. ggml_mul_mat(ctx0, layer.time_mix_w2, ggml_tanh(ctx0, ggml_mul_mat(ctx0, layer.time_mix_w1, xw))),
  11230. layer.time_mix_w0
  11231. );
  11232. w = ggml_exp(ctx0, ggml_scale(ctx0, ggml_sigmoid(ctx0, w), -0.606531));
  11233. ggml_tensor * k = build_lora_mm(layer.time_mix_key, xk);
  11234. ggml_tensor * v = build_lora_mm(layer.time_mix_value, xv);
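// value residual: layers after the first blend their value vectors toward the first
// layer's values, gated by sigmoid(v0 + V2*V1*xv)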
  11235. if (first_layer_value == nullptr) {
  11236. first_layer_value = v;
  11237. } else {
  11238. // Add the first layer value as a residual connection.
  11239. v = ggml_add(ctx0, v,
  11240. ggml_mul(ctx0,
  11241. ggml_sub(ctx0, first_layer_value, v),
  11242. ggml_sigmoid(ctx0, ggml_add(ctx0,
  11243. ggml_mul_mat(ctx0, layer.time_mix_v2, ggml_mul_mat(ctx0, layer.time_mix_v1, xv)),
  11244. layer.time_mix_v0
  11245. )
  11246. )
  11247. )
  11248. );
  11249. }
  11250. ggml_tensor * g = nullptr;
  11251. if (layer.time_mix_g1 && layer.time_mix_g2) {
  11252. g = ggml_mul_mat(ctx0, layer.time_mix_g2, ggml_sigmoid(ctx0, ggml_mul_mat(ctx0, layer.time_mix_g1, xg)));
  11253. }
  11254. ggml_tensor * a = ggml_sigmoid(ctx0,
  11255. ggml_add(
  11256. ctx0,
  11257. ggml_mul_mat(ctx0, layer.time_mix_a2, ggml_mul_mat(ctx0, layer.time_mix_a1, xa)),
  11258. layer.time_mix_a0
  11259. )
  11260. );
  11261. ggml_tensor * kk = ggml_reshape_3d(ctx0, ggml_mul(ctx0, k, layer.time_mix_k_k), head_size, head_count, n_tokens);
  11262. kk = ggml_l2_norm(ctx0, kk, 1e-12);
  11263. ggml_tensor * ka = ggml_mul(ctx0, k, layer.time_mix_k_a);
  11264. k = ggml_add(ctx0, k, ggml_sub(ctx0, ggml_mul(ctx0, a, ka), ka));
  11265. r = ggml_reshape_3d(ctx0, r, head_size, head_count, n_tokens);
  11266. w = ggml_reshape_3d(ctx0, w, head_size, head_count, n_tokens);
  11267. k = ggml_reshape_3d(ctx0, k, head_size, head_count, n_tokens);
  11268. v = ggml_reshape_3d(ctx0, v, head_size, head_count, n_tokens);
  11269. a = ggml_reshape_3d(ctx0, a, head_size, head_count, n_tokens);
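// load the per-sequence recurrent state and run the fused WKV7 kernel; -kk and kk*a
// drive the state update (roughly: remove along the normalized key, then re-add scaled
// by the in-context learning rate a). The kernel output packs the token outputs first,
// followed by the updated state, which is copied back into the memory slot below.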
  11270. ggml_tensor * wkv_state = build_rs(
  11271. inp, mctx_cur->get_s_l(il),
  11272. hparams.n_embd_s(), n_seqs);
  11273. ggml_tensor * wkv_output = ggml_rwkv_wkv7(ctx0, r, w, k, v, ggml_neg(ctx0, kk), ggml_mul(ctx0, kk, a), wkv_state);
  11274. cur = ggml_view_1d(ctx0, wkv_output, n_embd * n_tokens, 0);
  11275. wkv_state = ggml_view_1d(ctx0, wkv_output, n_embd * head_size * n_seqs, n_embd * n_tokens * sizeof(float));
  11276. ggml_build_forward_expand(
  11277. gf,
  11278. ggml_cpy(
  11279. ctx0,
  11280. wkv_state,
  11281. ggml_view_1d(
  11282. ctx0,
  11283. mctx_cur->get_s_l(il),
  11284. hparams.n_embd_s() * n_seqs,
  11285. hparams.n_embd_s() * kv_head * ggml_element_size(mctx_cur->get_s_l(il))
  11286. )
  11287. )
  11288. );
  11289. if (layer.time_mix_ln && layer.time_mix_ln_b) {
  11290. // group norm with head_count groups
  11291. cur = ggml_reshape_3d(ctx0, cur, n_embd / head_count, head_count, n_tokens);
  11292. cur = ggml_norm(ctx0, cur, 64e-5f);
  11293. // Convert back to regular vectors.
  11294. cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens);
  11295. cur = ggml_add(ctx0, ggml_mul(ctx0, cur, layer.time_mix_ln), layer.time_mix_ln_b);
  11296. } else {
  11297. cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens);
  11298. }
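// bonus term: add v scaled by the per-head dot product of r and k, weighted by r_k,
// before the optional gating and the output projection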
  11299. ggml_tensor * rk = ggml_sum_rows(ctx0,
  11300. ggml_mul(ctx0, ggml_mul(ctx0, k, r), ggml_reshape_2d(ctx0, layer.time_mix_r_k, head_size, head_count)));
  11301. cur = ggml_add(ctx0, cur, ggml_reshape_2d(ctx0, ggml_mul(ctx0, v, rk), n_embd, n_tokens));
  11302. if (has_gating) {
  11303. cur = ggml_mul(ctx0, cur, g);
  11304. }
  11305. cur = build_lora_mm(layer.time_mix_output, cur);
  11306. return ggml_reshape_3d(ctx0, cur, n_embd, n_seq_tokens, n_seqs);
  11307. }
  11308. };
  11309. struct llm_build_rwkv7 : public llm_build_rwkv7_base {
  11310. llm_build_rwkv7(const llama_model & model, const llm_graph_params & params) : llm_build_rwkv7_base(model, params) {
  11311. GGML_ASSERT(hparams.token_shift_count == 2);
  11312. ggml_tensor * cur;
  11313. ggml_tensor * inpL;
  11314. ggml_tensor * v_first = nullptr;
  11315. inpL = build_inp_embd(model.tok_embd);
  11316. inpL = build_norm(inpL, model.tok_norm, model.tok_norm_b, LLM_NORM, -1);
  11317. auto * rs_inp = build_rs_inp();
  11318. const auto n_embd = hparams.n_embd;
  11319. const auto n_seq_tokens = ubatch.n_seq_tokens;
  11320. const auto n_seqs = ubatch.n_seqs;
  11321. ggml_tensor * inp_out_ids = build_inp_out_ids();
  11322. for (int il = 0; il < n_layer; ++il) {
  11323. const llama_layer * layer = &model.layers[il];
  11324. inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs);
  11325. ggml_tensor * token_shift = build_rwkv_token_shift_load(rs_inp, ubatch, il);
  11326. ggml_tensor * att_shift = ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], 0);
  11327. ggml_tensor * ffn_shift = ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], n_embd * ggml_element_size(token_shift));
  11328. ggml_tensor * att_norm = build_norm(inpL, layer->attn_norm, layer->attn_norm_b, LLM_NORM, il);
  11329. cb(att_norm, "attn_norm", il);
  11330. ggml_tensor * x_prev = ggml_concat(
  11331. ctx0,
  11332. att_shift,
  11333. ggml_view_3d(ctx0, att_norm, n_embd, n_seq_tokens - 1, n_seqs, att_norm->nb[1], att_norm->nb[2], 0),
  11334. 1
  11335. );
  11336. cur = build_rwkv7_time_mix(rs_inp, att_norm, x_prev, v_first, ubatch, il);
  11337. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
  11338. cb(ffn_inp, "ffn_inp", il);
  11339. ggml_tensor * ffn_norm = build_norm(ffn_inp, layer->attn_norm_2, layer->attn_norm_2_b, LLM_NORM, il);
  11340. cb(ffn_norm, "ffn_norm", il);
  11341. x_prev = ggml_concat(
  11342. ctx0,
  11343. ffn_shift,
  11344. ggml_view_3d(ctx0, ffn_norm, n_embd, n_seq_tokens - 1, n_seqs, ffn_norm->nb[1], ffn_norm->nb[2], 0),
  11345. 1
  11346. );
  11347. token_shift = ggml_concat(ctx0,
  11348. ggml_view_3d(ctx0, att_norm, n_embd, 1, n_seqs, att_norm->nb[1], att_norm->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(att_norm)),
  11349. ggml_view_3d(ctx0, ffn_norm, n_embd, 1, n_seqs, ffn_norm->nb[1], ffn_norm->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(ffn_norm)),
  11350. 1
  11351. );
  11352. ggml_build_forward_expand(gf, build_rwkv_token_shift_store(token_shift, ubatch, il));
  11353. ffn_inp = ggml_reshape_2d(ctx0, ffn_inp, n_embd, n_tokens);
  11354. ffn_norm = ggml_reshape_2d(ctx0, ffn_norm, n_embd, n_tokens);
  11355. x_prev = ggml_reshape_2d(ctx0, x_prev, n_embd, n_tokens);
  11356. if (il == n_layer - 1 && inp_out_ids) {
  11357. ffn_inp = ggml_get_rows(ctx0, ffn_inp, inp_out_ids);
  11358. ffn_norm = ggml_get_rows(ctx0, ffn_norm, inp_out_ids);
  11359. x_prev = ggml_get_rows(ctx0, x_prev, inp_out_ids);
  11360. }
  11361. cur = build_rwkv7_channel_mix(layer, ffn_norm, x_prev, LLM_ARCH_RWKV7);
  11362. cur = ggml_add(ctx0, cur, ffn_inp);
  11363. cur = build_cvec(cur, il);
  11364. cb(cur, "l_out", il);
  11365. // input for next layer
  11366. inpL = cur;
  11367. }
  11368. cur = inpL;
  11369. cur = build_norm(cur, model.output_norm, model.output_norm_b, LLM_NORM, -1);
  11370. cb(cur, "result_norm", -1);
  11371. res->t_embd = cur;
  11372. cur = build_lora_mm(model.output, cur);
  11373. cb(cur, "result_output", -1);
  11374. res->t_logits = cur;
  11375. ggml_build_forward_expand(gf, cur);
  11376. }
  11377. };
  11378. struct llm_build_arwkv7 : public llm_build_rwkv7_base {
  11379. llm_build_arwkv7(const llama_model & model, const llm_graph_params & params) : llm_build_rwkv7_base(model, params) {
  11380. GGML_ASSERT(n_embd == hparams.n_embd_r());
  11381. ggml_tensor * cur;
  11382. ggml_tensor * inpL;
  11383. ggml_tensor * v_first = nullptr;
  11384. inpL = build_inp_embd(model.tok_embd);
  11385. auto * rs_inp = build_rs_inp();
  11386. const auto n_embd = hparams.n_embd;
  11387. const auto n_seq_tokens = ubatch.n_seq_tokens;
  11388. const auto n_seqs = ubatch.n_seqs;
  11389. ggml_tensor * inp_out_ids = build_inp_out_ids();
  11390. for (int il = 0; il < n_layer; ++il) {
  11391. const llama_layer * layer = &model.layers[il];
  11392. inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs);
  11393. ggml_tensor * token_shift = build_rwkv_token_shift_load(rs_inp, ubatch, il);
  11394. ggml_tensor * att_norm = build_norm(inpL, layer->attn_norm, layer->attn_norm_b, LLM_NORM_RMS, il);
  11395. cb(att_norm, "attn_norm", il);
  11396. ggml_tensor * x_prev = ggml_concat(
  11397. ctx0,
  11398. token_shift,
  11399. ggml_view_3d(ctx0, att_norm, n_embd, n_seq_tokens - 1, n_seqs, att_norm->nb[1], att_norm->nb[2], 0),
  11400. 1
  11401. );
  11402. cur = build_rwkv7_time_mix(rs_inp, att_norm, x_prev, v_first, ubatch, il);
  11403. token_shift = ggml_view_3d(ctx0, att_norm, n_embd, 1, n_seqs, att_norm->nb[1], att_norm->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(att_norm));
  11404. ggml_build_forward_expand(gf, build_rwkv_token_shift_store(token_shift, ubatch, il));
  11405. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
  11406. cb(ffn_inp, "ffn_inp", il);
  11407. cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens);
  11408. ffn_inp = ggml_reshape_2d(ctx0, ffn_inp, n_embd, n_tokens);
  11409. if (il == n_layer - 1 && inp_out_ids) {
  11410. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  11411. ffn_inp = ggml_get_rows(ctx0, ffn_inp, inp_out_ids);
  11412. }
  11413. // feed-forward network
  11414. cur = build_norm(ffn_inp,
  11415. model.layers[il].ffn_norm, NULL,
  11416. LLM_NORM_RMS, il);
  11417. cb(cur, "ffn_norm", il);
  11418. cur = build_ffn(cur,
  11419. model.layers[il].ffn_up, NULL, NULL,
  11420. model.layers[il].ffn_gate, NULL, NULL,
  11421. model.layers[il].ffn_down, NULL, NULL,
  11422. NULL,
  11423. LLM_FFN_SILU, LLM_FFN_PAR, il);
  11424. cb(cur, "ffn_out", il);
  11425. cur = ggml_add(ctx0, cur, ffn_inp);
  11426. cur = build_cvec(cur, il);
  11427. cb(cur, "l_out", il);
  11428. // input for next layer
  11429. inpL = cur;
  11430. }
  11431. cur = inpL;
  11432. cur = build_norm(cur, model.output_norm, model.output_norm_b, LLM_NORM_RMS, -1);
  11433. cb(cur, "result_norm", -1);
  11434. res->t_embd = cur;
  11435. cur = build_lora_mm(model.output, cur);
  11436. cb(cur, "result_output", -1);
  11437. res->t_logits = cur;
  11438. ggml_build_forward_expand(gf, cur);
  11439. }
  11440. };
  11441. struct llm_build_granite : public llm_graph_context {
  11442. llm_build_granite(
  11443. const llama_model & model,
  11444. const llm_graph_params & params)
  11445. : llm_graph_context(params) {
  11446. const int64_t n_embd_head = hparams.n_embd_head_v;
  11447. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  11448. GGML_ASSERT(n_embd_head == hparams.n_rot);
  11449. ggml_tensor * cur;
  11450. ggml_tensor * inpL;
  11451. inpL = build_inp_embd(model.tok_embd);
  11452. // inp_pos - built only if rope enabled
  11453. ggml_tensor * inp_pos = nullptr;
  11454. if (hparams.rope_finetuned) {
  11455. inp_pos = build_inp_pos();
  11456. }
  11457. auto * inp_attn = build_attn_inp_kv_unified();
  11458. ggml_tensor * inp_out_ids = build_inp_out_ids();
  11459. for (int il = 0; il < n_layer; ++il) {
  11460. ggml_tensor * inpSA = inpL;
  11461. // norm
  11462. cur = build_norm(inpL,
  11463. model.layers[il].attn_norm, NULL,
  11464. LLM_NORM_RMS, il);
  11465. cb(cur, "attn_norm", il);
  11466. // self-attention
  11467. cur = build_attention_layer(
  11468. cur, inp_pos, inp_attn,
  11469. model, n_embd_head, il);
  11470. if (il == n_layer - 1 && inp_out_ids) {
  11471. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  11472. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  11473. }
  11474. // ffn
  11475. cur = build_layer_ffn(cur, inpSA, model, il);
  11476. // input for next layer
  11477. inpL = cur;
  11478. }
  11479. cur = inpL;
  11480. cur = build_norm(cur,
  11481. model.output_norm, NULL,
  11482. LLM_NORM_RMS, -1);
  11483. cb(cur, "result_norm", -1);
  11484. res->t_embd = cur;
  11485. // lm_head
  11486. cur = build_lora_mm(model.output, cur);
  11487. // For Granite architectures - scale logits
  11488. cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_logit_scale);
  11489. cb(cur, "result_output", -1);
  11490. res->t_logits = cur;
  11491. ggml_build_forward_expand(gf, cur);
  11492. }
  11493. ggml_tensor * build_attention_layer(
  11494. ggml_tensor * cur,
  11495. ggml_tensor * inp_pos,
  11496. llm_graph_input_attn_kv_unified * inp_attn,
  11497. const llama_model & model,
  11498. const int64_t n_embd_head,
  11499. const int il) {
  11500. // compute Q and K and (optionally) RoPE them
  11501. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  11502. cb(Qcur, "Qcur", il);
  11503. if (model.layers[il].bq) {
  11504. Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
  11505. cb(Qcur, "Qcur", il);
  11506. }
  11507. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  11508. cb(Kcur, "Kcur", il);
  11509. if (model.layers[il].bk) {
  11510. Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
  11511. cb(Kcur, "Kcur", il);
  11512. }
  11513. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  11514. cb(Vcur, "Vcur", il);
  11515. if (model.layers[il].bv) {
  11516. Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
  11517. cb(Vcur, "Vcur", il);
  11518. }
  11519. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, hparams.n_head(il), n_tokens);
  11520. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, hparams.n_head_kv(il), n_tokens);
  11521. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, hparams.n_head_kv(il), n_tokens);
  11522. const bool use_rope = hparams.rope_finetuned;
  11523. if (use_rope) {
  11524. ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
  11525. Qcur = ggml_rope_ext(
  11526. ctx0, Qcur, inp_pos, rope_factors,
  11527. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  11528. ext_factor, attn_factor, beta_fast, beta_slow
  11529. );
  11530. Kcur = ggml_rope_ext(
  11531. ctx0, Kcur, inp_pos, rope_factors,
  11532. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  11533. ext_factor, attn_factor, beta_fast, beta_slow
  11534. );
  11535. }
  11536. cb(Qcur, "Qcur", il);
  11537. cb(Kcur, "Kcur", il);
  11538. cb(Vcur, "Vcur", il);
  11539. const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
  11540. cur = build_attn(inp_attn,
  11541. model.layers[il].wo, model.layers[il].bo,
  11542. Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
  11543. cb(cur, "attn_out", il);
  11544. return cur;
  11545. }
  11546. ggml_tensor * build_layer_ffn(
  11547. ggml_tensor * cur,
  11548. ggml_tensor * inpSA,
  11549. const llama_model & model,
  11550. const int il) {
  11551. // For Granite architectures - scale residual
  11552. if (hparams.f_residual_scale) {
  11553. cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
  11554. }
  11555. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  11556. cb(ffn_inp, "ffn_inp", il);
  11557. // feed-forward network (non-MoE)
  11558. if (model.layers[il].ffn_gate_inp == nullptr) {
  11559. cur = build_norm(ffn_inp,
  11560. model.layers[il].ffn_norm, NULL,
  11561. LLM_NORM_RMS, il);
  11562. cb(cur, "ffn_norm", il);
  11563. cur = build_ffn(cur,
  11564. model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
  11565. model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
  11566. model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
  11567. NULL,
  11568. LLM_FFN_SILU, LLM_FFN_PAR, il);
  11569. cb(cur, "ffn_out", il);
  11570. } else {
  11571. // MoE branch
  11572. cur = build_norm(ffn_inp,
  11573. model.layers[il].ffn_norm, NULL,
  11574. LLM_NORM_RMS, il);
  11575. cb(cur, "ffn_norm", il);
  11576. ggml_tensor * moe_out = build_moe_ffn(cur,
  11577. model.layers[il].ffn_gate_inp,
  11578. model.layers[il].ffn_up_exps,
  11579. model.layers[il].ffn_gate_exps,
  11580. model.layers[il].ffn_down_exps,
  11581. nullptr,
  11582. n_expert, n_expert_used,
  11583. LLM_FFN_SILU, true,
  11584. false, 0.0,
  11585. LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
  11586. il);
  11587. cb(moe_out, "ffn_moe_out", il);
  11588. // For Granite MoE Shared
  11589. if (hparams.n_ff_shexp > 0) {
  11590. ggml_tensor * ffn_shexp = build_ffn(cur,
  11591. model.layers[il].ffn_up_shexp, NULL, NULL,
  11592. model.layers[il].ffn_gate_shexp, NULL, NULL,
  11593. model.layers[il].ffn_down_shexp, NULL, NULL,
  11594. NULL,
  11595. LLM_FFN_SILU, LLM_FFN_PAR, il);
  11596. cb(ffn_shexp, "ffn_shexp", il);
  11597. cur = ggml_add(ctx0, moe_out, ffn_shexp);
  11598. cb(cur, "ffn_out", il);
  11599. } else {
  11600. cur = moe_out;
  11601. }
  11602. }
  11603. // For Granite architectures - scale residual
  11604. if (hparams.f_residual_scale) {
  11605. cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
  11606. }
  11607. cur = ggml_add(ctx0, cur, ffn_inp);
  11608. cb(cur, "ffn_out", il);
  11609. cur = build_cvec(cur, il);
  11610. cb(cur, "l_out", il);
  11611. return cur;
  11612. }
  11613. };
  11614. struct llm_build_granite_hybrid : public llm_graph_context_mamba {
  11615. llm_build_granite_hybrid(
  11616. const llama_model & model,
  11617. const llm_graph_params & params) :
  11618. llm_graph_context_mamba(params) {
  11619. const int64_t n_embd_head = hparams.n_embd_head_v;
  11620. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  11621. ggml_tensor * cur;
  11622. ggml_tensor * inpL;
  11623. inpL = build_inp_embd(model.tok_embd);
  11624. auto * inp = build_inp_mem_hybrid();
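// hybrid memory input: provides both a recurrent state (used by the mamba2/SSM layers)
// and a unified KV cache (used by the attention layers); the choice is made per layer
// via hparams.is_recurrent(il)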
  11625. ggml_tensor * inp_out_ids = build_inp_out_ids();
  11626. // Positional embeddings populated if rope enabled
  11627. ggml_tensor * inp_pos = nullptr;
  11628. if (hparams.rope_finetuned) {
  11629. inp_pos = build_inp_pos();
  11630. }
  11631. for (int il = 0; il < n_layer; ++il) {
  11632. struct ggml_tensor * inpSA = inpL;
  11633. // norm
  11634. cur = build_norm(inpL,
  11635. model.layers[il].attn_norm, NULL,
  11636. LLM_NORM_RMS, il);
  11637. cb(cur, "attn_norm", il);
  11638. if (hparams.is_recurrent(il)) {
  11639. // ssm layer //
  11640. cur = build_mamba2_layer(inp->get_recr(), cur, model, ubatch, il);
  11641. } else {
  11642. // attention layer //
  11643. cur = build_attention_layer(
  11644. cur, inp_pos, inp->get_attn(), model,
  11645. n_embd_head, il);
  11646. }
  11647. if (il == n_layer - 1 && inp_out_ids) {
  11648. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  11649. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  11650. }
  11651. // ffn
  11652. cur = build_layer_ffn(cur, inpSA, model, il);
  11653. // input for next layer
  11654. inpL = cur;
  11655. }
  11656. cur = inpL;
  11657. cur = build_norm(cur,
  11658. model.output_norm, NULL,
  11659. LLM_NORM_RMS, -1);
  11660. cb(cur, "result_norm", -1);
  11661. res->t_embd = cur;
  11662. // lm_head
  11663. cur = build_lora_mm(model.output, cur);
  11664. // For Granite architectures - scale logits
  11665. if (hparams.f_logit_scale) {
  11666. cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_logit_scale);
  11667. }
  11668. cb(cur, "result_output", -1);
  11669. res->t_logits = cur;
  11670. ggml_build_forward_expand(gf, cur);
  11671. }
  11672. ggml_tensor * build_attention_layer(
  11673. ggml_tensor * cur,
  11674. ggml_tensor * inp_pos,
  11675. llm_graph_input_attn_kv_unified * inp_attn,
  11676. const llama_model & model,
  11677. const int64_t n_embd_head,
  11678. const int il) {
  11679. // compute Q and K and (optionally) RoPE them
  11680. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  11681. cb(Qcur, "Qcur", il);
  11682. if (model.layers[il].bq) {
  11683. Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
  11684. cb(Qcur, "Qcur", il);
  11685. }
  11686. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  11687. cb(Kcur, "Kcur", il);
  11688. if (model.layers[il].bk) {
  11689. Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
  11690. cb(Kcur, "Kcur", il);
  11691. }
  11692. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  11693. cb(Vcur, "Vcur", il);
  11694. if (model.layers[il].bv) {
  11695. Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
  11696. cb(Vcur, "Vcur", il);
  11697. }
  11698. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, hparams.n_head(il), n_tokens);
  11699. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, hparams.n_head_kv(il), n_tokens);
  11700. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, hparams.n_head_kv(il), n_tokens);
  11701. const bool use_rope = hparams.rope_finetuned;
  11702. if (use_rope) {
  11703. ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
  11704. Qcur = ggml_rope_ext(
  11705. ctx0, Qcur, inp_pos, rope_factors,
  11706. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  11707. ext_factor, attn_factor, beta_fast, beta_slow
  11708. );
  11709. Kcur = ggml_rope_ext(
  11710. ctx0, Kcur, inp_pos, rope_factors,
  11711. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  11712. ext_factor, attn_factor, beta_fast, beta_slow
  11713. );
  11714. }
  11715. cb(Qcur, "Qcur", il);
  11716. cb(Kcur, "Kcur", il);
  11717. cb(Vcur, "Vcur", il);
  11718. const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
  11719. cur = build_attn(inp_attn,
  11720. model.layers[il].wo, model.layers[il].bo,
  11721. Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
  11722. cb(cur, "attn_out", il);
  11723. return cur;
  11724. }
  11725. ggml_tensor * build_layer_ffn(
  11726. ggml_tensor * cur,
  11727. ggml_tensor * inpSA,
  11728. const llama_model & model,
  11729. const int il) {
  11730. // For Granite architectures - scale residual
  11731. if (hparams.f_residual_scale) {
  11732. cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
  11733. }
  11734. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  11735. cb(ffn_inp, "ffn_inp", il);
  11736. // feed-forward network (non-MoE)
  11737. if (model.layers[il].ffn_gate_inp == nullptr) {
  11738. cur = build_norm(ffn_inp,
  11739. model.layers[il].ffn_norm, NULL,
  11740. LLM_NORM_RMS, il);
  11741. cb(cur, "ffn_norm", il);
  11742. cur = build_ffn(cur,
  11743. model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
  11744. model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
  11745. model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
  11746. NULL,
  11747. LLM_FFN_SILU, LLM_FFN_PAR, il);
  11748. cb(cur, "ffn_out", il);
  11749. } else {
  11750. // MoE branch
  11751. cur = build_norm(ffn_inp,
  11752. model.layers[il].ffn_norm, NULL,
  11753. LLM_NORM_RMS, il);
  11754. cb(cur, "ffn_norm", il);
  11755. ggml_tensor * moe_out = build_moe_ffn(cur,
  11756. model.layers[il].ffn_gate_inp,
  11757. model.layers[il].ffn_up_exps,
  11758. model.layers[il].ffn_gate_exps,
  11759. model.layers[il].ffn_down_exps,
  11760. nullptr,
  11761. n_expert, n_expert_used,
  11762. LLM_FFN_SILU, true,
  11763. false, 0.0,
  11764. LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
  11765. il);
  11766. cb(moe_out, "ffn_moe_out", il);
  11767. // For Granite MoE Shared
  11768. if (hparams.n_ff_shexp > 0) {
  11769. ggml_tensor * ffn_shexp = build_ffn(cur,
  11770. model.layers[il].ffn_up_shexp, NULL, NULL,
  11771. model.layers[il].ffn_gate_shexp, NULL, NULL,
  11772. model.layers[il].ffn_down_shexp, NULL, NULL,
  11773. NULL,
  11774. LLM_FFN_SILU, LLM_FFN_PAR, il);
  11775. cb(ffn_shexp, "ffn_shexp", il);
  11776. cur = ggml_add(ctx0, moe_out, ffn_shexp);
  11777. cb(cur, "ffn_out", il);
  11778. } else {
  11779. cur = moe_out;
  11780. }
  11781. }
  11782. // For Granite architectures - scale residual
  11783. if (hparams.f_residual_scale) {
  11784. cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
  11785. }
  11786. cur = ggml_add(ctx0, cur, ffn_inp);
  11787. cb(cur, "ffn_out", il);
  11788. cur = build_cvec(cur, il);
  11789. cb(cur, "l_out", il);
  11790. return cur;
  11791. }
  11792. };
  11793. // ref: https://github.com/facebookresearch/chameleon
  11794. // based on the original build_llama() function, changes:
  11795. // * qk-norm
  11796. // * swin-norm
  11797. // * removed bias
  11798. // * removed MoE
  11799. struct llm_build_chameleon : public llm_graph_context {
  11800. llm_build_chameleon(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  11801. const int64_t n_embd_head = hparams.n_embd_head_v;
  11802. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  11803. GGML_ASSERT(n_embd_head == hparams.n_rot);
  11804. ggml_tensor * cur;
  11805. ggml_tensor * inpL;
  11806. inpL = build_inp_embd(model.tok_embd);
  11807. // inp_pos - contains the positions
  11808. ggml_tensor * inp_pos = build_inp_pos();
  11809. auto * inp_attn = build_attn_inp_kv_unified();
  11810. ggml_tensor * inp_out_ids = build_inp_out_ids();
  11811. for (int il = 0; il < n_layer; ++il) {
  11812. ggml_tensor * inpSA = inpL;
  11813. // norm
  11814. if (hparams.swin_norm) {
  11815. cur = inpL;
  11816. } else {
  11817. cur = build_norm(inpL,
  11818. model.layers[il].attn_norm, NULL,
  11819. LLM_NORM_RMS, il);
  11820. cb(cur, "attn_norm", il);
  11821. }
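// with swin_norm the normalization is applied after the attention and FFN blocks
// (post-norm) instead of before them, so the pre-norm above is skipped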
  11822. // self-attention
  11823. {
  11824. // compute Q and K and RoPE them
  11825. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  11826. cb(Qcur, "Qcur", il);
  11827. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  11828. cb(Kcur, "Kcur", il);
  11829. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  11830. cb(Vcur, "Vcur", il);
  11831. if (model.layers[il].attn_q_norm) {
  11832. Qcur = ggml_view_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens,
  11833. ggml_element_size(Qcur) * n_embd_head,
  11834. ggml_element_size(Qcur) * n_embd_head * n_head,
  11835. 0);
  11836. cb(Qcur, "Qcur", il);
  11837. Qcur = build_norm(Qcur,
  11838. model.layers[il].attn_q_norm,
  11839. model.layers[il].attn_q_norm_b,
  11840. LLM_NORM, il);
  11841. cb(Qcur, "Qcur", il);
  11842. }
  11843. if (model.layers[il].attn_k_norm) {
  11844. Kcur = ggml_view_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens,
  11845. ggml_element_size(Kcur) * n_embd_head,
  11846. ggml_element_size(Kcur) * n_embd_head * n_head_kv,
  11847. 0);
  11848. cb(Kcur, "Kcur", il);
  11849. Kcur = build_norm(Kcur,
  11850. model.layers[il].attn_k_norm,
  11851. model.layers[il].attn_k_norm_b,
  11852. LLM_NORM, il);
  11853. cb(Kcur, "Kcur", il);
  11854. }
  11855. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  11856. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  11857. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  11858. Qcur = ggml_rope_ext(
  11859. ctx0, Qcur, inp_pos, nullptr,
  11860. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  11861. ext_factor, attn_factor, beta_fast, beta_slow
  11862. );
  11863. Kcur = ggml_rope_ext(
  11864. ctx0, Kcur, inp_pos, nullptr,
  11865. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  11866. ext_factor, attn_factor, beta_fast, beta_slow
  11867. );
  11868. cb(Qcur, "Qcur", il);
  11869. cb(Kcur, "Kcur", il);
  11870. cb(Vcur, "Vcur", il);
  11871. cur = build_attn(inp_attn,
  11872. model.layers[il].wo, nullptr,
  11873. Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  11874. }
  11875. if (il == n_layer - 1 && inp_out_ids) {
  11876. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  11877. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  11878. }
  11879. if (hparams.swin_norm) {
  11880. cur = build_norm(cur,
  11881. model.layers[il].attn_norm, NULL,
  11882. LLM_NORM_RMS, il);
  11883. }
  11884. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  11885. cb(ffn_inp, "ffn_inp", il);
  11886. // feed-forward network
  11887. if (!hparams.swin_norm) {
  11888. cur = build_norm(ffn_inp,
  11889. model.layers[il].ffn_norm, NULL,
  11890. LLM_NORM_RMS, il);
  11891. cb(cur, "ffn_norm", il);
  11892. }
  11893. cur = build_ffn(cur,
  11894. model.layers[il].ffn_up, NULL, NULL,
  11895. model.layers[il].ffn_gate, NULL, NULL,
  11896. model.layers[il].ffn_down, NULL, NULL,
  11897. NULL,
  11898. LLM_FFN_SILU, LLM_FFN_PAR, il);
  11899. cb(cur, "ffn_out", il);
  11900. if (hparams.swin_norm) {
  11901. cur = build_norm(cur,
  11902. model.layers[il].ffn_norm, NULL,
  11903. LLM_NORM_RMS, il);
  11904. cb(cur, "ffn_norm", il);
  11905. }
  11906. cur = ggml_add(ctx0, cur, ffn_inp);
  11907. cb(cur, "ffn_out", il);
  11908. cur = build_cvec(cur, il);
  11909. cb(cur, "l_out", il);
  11910. // input for next layer
  11911. inpL = cur;
  11912. }
  11913. cur = inpL;
  11914. cur = build_norm(cur,
  11915. model.output_norm, NULL,
  11916. LLM_NORM_RMS, -1);
  11917. cb(cur, "result_norm", -1);
  11918. res->t_embd = cur;
  11919. // lm_head
  11920. cur = build_lora_mm(model.output, cur);
  11921. cb(cur, "result_output_with_img_logits", -1);
// TODO: this suppresses the output of image tokens, which is required to enable text-only outputs.
// Needs to be removed once image outputs are supported.
int img_token_end_idx = 8196;
int img_token_start_idx = 4;
int num_img_tokens = img_token_end_idx - img_token_start_idx;
// create a 1D tensor with num_img_tokens values, all set to -FLT_MAX,
// so that image-token logits can never exceed text-token logits
ggml_tensor * img_logits = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, num_img_tokens);
img_logits = ggml_clamp(ctx0, img_logits, -FLT_MAX, -FLT_MAX);
cb(img_logits, "img_logits", -1);
cur = ggml_set_1d(ctx0, cur, img_logits, ggml_element_size(cur) * img_token_start_idx);
  11933. cb(cur, "result_output", -1);
  11934. res->t_logits = cur;
  11935. ggml_build_forward_expand(gf, cur);
  11936. }
  11937. };
  11938. struct llm_build_wavtokenizer_dec : public llm_graph_context {
  11939. llm_build_wavtokenizer_dec(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  11940. ggml_tensor * cur;
  11941. ggml_tensor * inpL;
  11942. inpL = build_inp_embd(model.tok_embd);
  11943. cur = ggml_cont(ctx0, ggml_transpose(ctx0, inpL));
  11944. cur = ggml_conv_1d_ph(ctx0, model.conv1d, cur, 1, 1);
  11945. cur = ggml_add(ctx0, cur, model.conv1d_b);
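// decoder structure: 1D conv stem -> PosNet stack (residual conv blocks, one
// self-attention block, final group norm) -> token layer norm -> ConvNeXt blocks
// (depthwise conv + pointwise GELU MLP with a learned per-channel scale) -> output head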
  11946. // posnet
  11947. for (uint32_t il = 0; il < hparams.posnet.n_layer; ++il) {
  11948. const auto & layer = model.layers[il].posnet;
  11949. inpL = cur;
  11950. switch (il) {
  11951. case 0:
  11952. case 1:
  11953. case 3:
  11954. case 4:
  11955. {
  11956. cur = build_norm(cur,
  11957. layer.norm1,
  11958. layer.norm1_b,
  11959. LLM_NORM_GROUP, 0);
  11960. cur = ggml_mul(ctx0, ggml_sigmoid(ctx0, cur), cur);
  11961. cur = ggml_conv_1d_ph(ctx0, layer.conv1, cur, 1, 1);
  11962. cur = ggml_add(ctx0, cur, layer.conv1_b);
  11963. cur = build_norm(cur,
  11964. layer.norm2,
  11965. layer.norm2_b,
  11966. LLM_NORM_GROUP, 0);
  11967. cur = ggml_mul(ctx0, ggml_sigmoid(ctx0, cur), cur);
  11968. cur = ggml_conv_1d_ph(ctx0, layer.conv2, cur, 1, 1);
  11969. cur = ggml_add(ctx0, cur, layer.conv2_b);
  11970. cur = ggml_add(ctx0, cur, inpL);
  11971. } break;
  11972. case 2:
  11973. {
  11974. cur = build_norm(cur,
  11975. layer.attn_norm,
  11976. layer.attn_norm_b,
  11977. LLM_NORM_GROUP, 0);
  11978. ggml_tensor * q;
  11979. ggml_tensor * k;
  11980. ggml_tensor * v;
  11981. q = ggml_conv_1d_ph(ctx0, layer.attn_q, cur, 1, 1);
  11982. k = ggml_conv_1d_ph(ctx0, layer.attn_k, cur, 1, 1);
  11983. v = ggml_conv_1d_ph(ctx0, layer.attn_v, cur, 1, 1);
  11984. q = ggml_add(ctx0, q, layer.attn_q_b);
  11985. k = ggml_add(ctx0, k, layer.attn_k_b);
  11986. v = ggml_add(ctx0, v, layer.attn_v_b);
  11987. q = ggml_cont(ctx0, ggml_transpose(ctx0, q));
  11988. k = ggml_cont(ctx0, ggml_transpose(ctx0, k));
  11989. ggml_tensor * kq = ggml_mul_mat(ctx0, k, q);
  11990. kq = ggml_soft_max_ext(ctx0, kq, nullptr, 1.0f/sqrtf(float(hparams.posnet.n_embd)), 0.0f);
  11991. cur = ggml_mul_mat(ctx0, kq, v);
  11992. cur = ggml_conv_1d_ph(ctx0, layer.attn_o, cur, 1, 1);
  11993. cur = ggml_add(ctx0, cur, layer.attn_o_b);
  11994. cur = ggml_add(ctx0, cur, inpL);
  11995. } break;
  11996. case 5:
  11997. {
  11998. cur = build_norm(cur,
  11999. layer.norm,
  12000. layer.norm_b,
  12001. LLM_NORM_GROUP, 0);
  12002. } break;
  12003. default: GGML_ABORT("unknown posnet layer");
  12004. };
  12005. }
  12006. cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur));
  12007. cur = build_norm(cur,
  12008. model.tok_norm,
  12009. model.tok_norm_b,
  12010. LLM_NORM, -1);
  12011. cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur));
  12012. inpL = cur;
  12013. // convnext
  12014. for (uint32_t il = 0; il < hparams.convnext.n_layer; ++il) {
  12015. const auto & layer = model.layers[il].convnext;
  12016. cur = inpL;
  12017. cur = ggml_conv_1d_dw_ph(ctx0, layer.dw, cur, 1, 1);
  12018. cur = ggml_add(ctx0, cur, layer.dw_b);
  12019. cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur));
  12020. cur = build_norm(cur,
  12021. layer.norm,
  12022. layer.norm_b,
  12023. LLM_NORM, -1);
  12024. cur = build_ffn(cur,
  12025. layer.pw1, layer.pw1_b, NULL,
  12026. NULL, NULL, NULL,
  12027. layer.pw2, layer.pw2_b, NULL,
  12028. NULL,
  12029. LLM_FFN_GELU, LLM_FFN_SEQ, il);
  12030. cur = ggml_mul(ctx0, cur, layer.gamma);
  12031. cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur));
  12032. inpL = ggml_add(ctx0, cur, inpL);
  12033. }
  12034. cur = inpL;
  12035. cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur));
  12036. cur = build_norm(cur,
  12037. model.output_norm,
  12038. model.output_norm_b,
  12039. LLM_NORM, -1);
  12040. // lm_head
  12041. cur = build_lora_mm(model.output, cur);
  12042. cur = ggml_add(ctx0, cur, model.output_b);
  12043. cb(cur, "result_embd", -1);
  12044. res->t_embd = cur;
  12045. ggml_build_forward_expand(gf, cur);
  12046. }
  12047. };
  12048. struct llm_build_plm : public llm_graph_context {
  12049. llm_build_plm(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  12050. const float kq_scale = 1.0f/sqrtf(float(hparams.n_embd_head_k));
  12051. const uint32_t n_embd_head_qk_rope = hparams.n_rot;
  12052. const uint32_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
  12053. const uint32_t kv_lora_rank = hparams.n_lora_kv;
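// attention in the spirit of DeepSeek-V2's MLA: queries are split into a "nope" part and
// a RoPE part, keys/values are compressed to kv_lora_rank (plus one shared RoPE key per
// token) and expanded back through wkv_b; the shared RoPE key is broadcast across heads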
  12054. ggml_tensor * cur;
  12055. ggml_tensor * inpL;
  12056. // {n_embd, n_tokens}
  12057. inpL = build_inp_embd(model.tok_embd);
  12058. // inp_pos - contains the positions
  12059. ggml_tensor * inp_pos = build_inp_pos();
  12060. auto * inp_attn = build_attn_inp_kv_unified();
  12061. ggml_tensor * inp_out_ids = build_inp_out_ids();
  12062. for (int il = 0; il < n_layer; ++il) {
  12063. ggml_tensor * inpSA = inpL;
  12064. // norm
  12065. cur = build_norm(inpL,
  12066. model.layers[il].attn_norm, NULL,
  12067. LLM_NORM_RMS, il);
  12068. cb(cur, "attn_norm", il);
  12069. // self_attention
  12070. {
  12071. ggml_tensor * q = NULL;
  12072. q = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
  12073. cb(q, "q", il);
  12074. // split into {n_head * n_embd_head_qk_nope, n_tokens}
  12075. ggml_tensor * q_nope = ggml_view_3d(ctx0, q, n_embd_head_qk_nope, n_head, n_tokens,
  12076. ggml_row_size(q->type, hparams.n_embd_head_k),
  12077. ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
  12078. 0);
  12079. cb(q_nope, "q_nope", il);
  12080. // and {n_head * n_embd_head_qk_rope, n_tokens}
  12081. ggml_tensor * q_pe = ggml_view_3d(ctx0, q, n_embd_head_qk_rope, n_head, n_tokens,
  12082. ggml_row_size(q->type, hparams.n_embd_head_k),
  12083. ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
  12084. ggml_row_size(q->type, n_embd_head_qk_nope));
  12085. cb(q_pe, "q_pe", il);
  12086. // {n_embd, kv_lora_rank + n_embd_head_qk_rope} * {n_embd, n_tokens} -> {kv_lora_rank + n_embd_head_qk_rope, n_tokens}
ggml_tensor * kv_pe_compressed = ggml_mul_mat(ctx0, model.layers[il].wkv_a_mqa, cur);
cb(kv_pe_compressed, "kv_pe_compressed", il);
// split into {kv_lora_rank, n_tokens}
ggml_tensor * kv_compressed = ggml_view_2d(ctx0, kv_pe_compressed, kv_lora_rank, n_tokens,
kv_pe_compressed->nb[1],
0);
cb(kv_compressed, "kv_compressed", il);
// and {n_embd_head_qk_rope, n_tokens}
ggml_tensor * k_pe = ggml_view_3d(ctx0, kv_pe_compressed, n_embd_head_qk_rope, 1, n_tokens,
kv_pe_compressed->nb[1],
kv_pe_compressed->nb[1],
ggml_row_size(kv_pe_compressed->type, kv_lora_rank));
  12099. cb(k_pe, "k_pe", il);
  12100. kv_compressed = build_norm(kv_compressed,
  12101. model.layers[il].attn_kv_a_norm, NULL,
  12102. LLM_NORM_RMS, il);
  12103. cb(kv_compressed, "kv_compressed", il);
  12104. // {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)} * {kv_lora_rank, n_tokens} -> {n_head * (n_embd_head_qk_nope + n_embd_head_v), n_tokens}
  12105. ggml_tensor * kv = ggml_mul_mat(ctx0, model.layers[il].wkv_b, kv_compressed);
  12106. cb(kv, "kv", il);
  12107. // split into {n_head * n_embd_head_qk_nope, n_tokens}
  12108. ggml_tensor * k_nope = ggml_view_3d(ctx0, kv, n_embd_head_qk_nope, n_head, n_tokens,
  12109. ggml_row_size(kv->type, n_embd_head_qk_nope + hparams.n_embd_head_v),
  12110. ggml_row_size(kv->type, n_head * (n_embd_head_qk_nope + hparams.n_embd_head_v)),
  12111. 0);
  12112. cb(k_nope, "k_nope", il);
  12113. // and {n_head * n_embd_head_v, n_tokens}
  12114. ggml_tensor * v_states = ggml_view_3d(ctx0, kv, hparams.n_embd_head_v, n_head, n_tokens,
  12115. ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)),
  12116. ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)*n_head),
  12117. ggml_row_size(kv->type, (n_embd_head_qk_nope)));
  12118. cb(v_states, "v_states", il);
  12119. v_states = ggml_cont(ctx0, v_states);
  12120. cb(v_states, "v_states", il);
  12121. v_states = ggml_view_2d(ctx0, v_states, hparams.n_embd_head_v * n_head, n_tokens,
  12122. ggml_row_size(kv->type, hparams.n_embd_head_v * n_head),
  12123. 0);
  12124. cb(v_states, "v_states", il);
  12125. q_pe = ggml_rope_ext(
  12126. ctx0, q_pe, inp_pos, nullptr,
  12127. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  12128. ext_factor, attn_factor, beta_fast, beta_slow
  12129. );
  12130. cb(q_pe, "q_pe", il);
  12131. // shared RoPE key
  12132. k_pe = ggml_rope_ext(
  12133. ctx0, k_pe, inp_pos, nullptr,
  12134. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  12135. ext_factor, attn_factor, beta_fast, beta_slow
  12136. );
  12137. cb(k_pe, "k_pe", il);
  12138. ggml_tensor * q_states = ggml_concat(ctx0, q_nope, q_pe, 0);
  12139. cb(q_states, "q_states", il);
  12140. ggml_tensor * k_states = ggml_concat(ctx0, k_nope, ggml_repeat(ctx0, k_pe, q_pe), 0);
  12141. cb(k_states, "k_states", il);
  12142. cur = build_attn(inp_attn,
  12143. model.layers[il].wo, NULL,
  12144. q_states, k_states, v_states, nullptr, nullptr, kq_scale, il);
  12145. }
  12146. if (il == n_layer - 1 && inp_out_ids) {
  12147. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  12148. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  12149. }
  12150. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  12151. cb(ffn_inp, "ffn_inp", il);
  12152. cur = build_norm(ffn_inp,
  12153. model.layers[il].ffn_norm, NULL,
  12154. LLM_NORM_RMS, il);
  12155. cb(cur, "ffn_norm", il);
  12156. cur = build_ffn(cur,
  12157. model.layers[il].ffn_up, NULL, NULL,
  12158. NULL, NULL, NULL,
  12159. model.layers[il].ffn_down, NULL, NULL,
  12160. NULL,
  12161. LLM_FFN_RELU_SQR, LLM_FFN_SEQ, il);
  12162. cb(cur, "ffn_out", il);
  12163. cur = ggml_add(ctx0, cur, ffn_inp);
  12164. cur = build_cvec(cur, il);
  12165. cb(cur, "l_out", il);
  12166. // input for next layer
  12167. inpL = cur;
  12168. }
  12169. cur = inpL;
  12170. cur = build_norm(cur,
  12171. model.output_norm, NULL,
  12172. LLM_NORM_RMS, -1);
  12173. cb(cur, "result_norm", -1);
  12174. res->t_embd = cur;
  12175. cur = build_lora_mm(model.output, cur);
  12176. cb(cur, "result_output", -1);
  12177. res->t_logits = cur;
  12178. ggml_build_forward_expand(gf, cur);
  12179. }
  12180. };
  12181. struct llm_build_bailingmoe : public llm_graph_context {
  12182. llm_build_bailingmoe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  12183. ggml_tensor * cur;
  12184. ggml_tensor * inpL;
  12185. inpL = build_inp_embd(model.tok_embd);
  12186. // inp_pos - contains the positions
  12187. ggml_tensor * inp_pos = build_inp_pos();
  12188. auto * inp_attn = build_attn_inp_kv_unified();
  12189. ggml_tensor * inp_out_ids = build_inp_out_ids();
  12190. for (int il = 0; il < n_layer; ++il) {
  12191. ggml_tensor * inpSA = inpL;
  12192. // norm
  12193. cur = build_norm(inpL,
  12194. model.layers[il].attn_norm, NULL,
  12195. LLM_NORM_RMS, il);
  12196. cb(cur, "attn_norm", il);
  12197. // self-attention
  12198. {
  12199. // rope freq factors for llama3; may return nullptr for llama2 and other models
  12200. ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
  12201. // compute Q and K and RoPE them
  12202. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  12203. cb(Qcur, "Qcur", il);
  12204. if (model.layers[il].bq) {
  12205. Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
  12206. cb(Qcur, "Qcur", il);
  12207. }
  12208. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  12209. cb(Kcur, "Kcur", il);
  12210. if (model.layers[il].bk) {
  12211. Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
  12212. cb(Kcur, "Kcur", il);
  12213. }
  12214. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  12215. cb(Vcur, "Vcur", il);
  12216. if (model.layers[il].bv) {
  12217. Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
  12218. cb(Vcur, "Vcur", il);
  12219. }
  12220. Qcur = ggml_reshape_3d(ctx0, Qcur, n_rot, n_head, n_tokens);
  12221. Kcur = ggml_reshape_3d(ctx0, Kcur, n_rot, n_head_kv, n_tokens);
  12222. Vcur = ggml_reshape_3d(ctx0, Vcur, n_rot, n_head_kv, n_tokens);
  12223. Qcur = ggml_rope_ext(
  12224. ctx0, Qcur, inp_pos, rope_factors,
  12225. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  12226. ext_factor, attn_factor, beta_fast, beta_slow
  12227. );
  12228. Kcur = ggml_rope_ext(
  12229. ctx0, Kcur, inp_pos, rope_factors,
  12230. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  12231. ext_factor, attn_factor, beta_fast, beta_slow
  12232. );
  12233. cb(Qcur, "Qcur", il);
  12234. cb(Kcur, "Kcur", il);
  12235. cb(Vcur, "Vcur", il);
  12236. cur = build_attn(inp_attn,
  12237. model.layers[il].wo, model.layers[il].bo,
  12238. Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_rot)), il);
  12239. }
  12240. if (il == n_layer - 1 && inp_out_ids) {
  12241. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  12242. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  12243. }
  12244. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  12245. cb(ffn_inp, "ffn_inp", il);
  12246. cur = build_norm(ffn_inp,
  12247. model.layers[il].ffn_norm, NULL,
  12248. LLM_NORM_RMS, il);
  12249. cb(cur, "ffn_norm", il);
  12250. ggml_tensor * moe_out =
  12251. build_moe_ffn(cur,
  12252. model.layers[il].ffn_gate_inp,
  12253. model.layers[il].ffn_up_exps,
  12254. model.layers[il].ffn_gate_exps,
  12255. model.layers[il].ffn_down_exps,
  12256. nullptr,
  12257. n_expert, n_expert_used,
  12258. LLM_FFN_SILU, hparams.expert_weights_norm,
  12259. false, hparams.expert_weights_scale,
  12260. LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
  12261. il);
  12262. cb(moe_out, "ffn_moe_out", il);
  12263. // FFN shared expert
  12264. {
  12265. ggml_tensor * ffn_shexp = build_ffn(cur,
  12266. model.layers[il].ffn_up_shexp, NULL, NULL,
  12267. model.layers[il].ffn_gate_shexp, NULL, NULL,
  12268. model.layers[il].ffn_down_shexp, NULL, NULL,
  12269. NULL,
  12270. LLM_FFN_SILU, LLM_FFN_PAR, il);
  12271. cb(ffn_shexp, "ffn_shexp", il);
  12272. cur = ggml_add(ctx0, moe_out, ffn_shexp);
  12273. cb(cur, "ffn_out", il);
  12274. }
  12275. cur = ggml_add(ctx0, cur, ffn_inp);
  12276. cur = build_cvec(cur, il);
  12277. cb(cur, "l_out", il);
  12278. // input for next layer
  12279. inpL = cur;
  12280. }
  12281. cur = inpL;
  12282. cur = build_norm(cur,
  12283. model.output_norm, NULL,
  12284. LLM_NORM_RMS, -1);
  12285. cb(cur, "result_norm", -1);
  12286. res->t_embd = cur;
  12287. // lm_head
  12288. cur = build_lora_mm(model.output, cur);
  12289. cb(cur, "result_output", -1);
  12290. res->t_logits = cur;
  12291. ggml_build_forward_expand(gf, cur);
  12292. }
  12293. };
  12294. struct llm_build_dots1 : public llm_graph_context {
  12295. llm_build_dots1(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  12296. const int64_t n_embd_head = hparams.n_embd_head_v;
  12297. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  12298. GGML_ASSERT(n_embd_head == hparams.n_rot);
  12299. ggml_tensor * cur;
  12300. ggml_tensor * inpL;
  12301. inpL = build_inp_embd(model.tok_embd);
  12302. // inp_pos - contains the positions
  12303. ggml_tensor * inp_pos = build_inp_pos();
  12304. auto * inp_attn = build_attn_inp_kv_unified();
  12305. ggml_tensor * inp_out_ids = build_inp_out_ids();
  12306. for (int il = 0; il < n_layer; ++il) {
  12307. ggml_tensor * inpSA = inpL;
  12308. // norm
  12309. cur = build_norm(inpL,
  12310. model.layers[il].attn_norm, NULL,
  12311. LLM_NORM_RMS, il);
  12312. cb(cur, "attn_norm", il);
  12313. // self_attention
  12314. {
  12315. // compute Q and K and RoPE them
  12316. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  12317. cb(Qcur, "Qcur", il);
  12318. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  12319. cb(Kcur, "Kcur", il);
  12320. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  12321. cb(Vcur, "Vcur", il);
  12322. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  12323. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  12324. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  12325. Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
  12326. cb(Qcur, "Qcur_normed", il);
  12327. Qcur = ggml_rope_ext(
  12328. ctx0, Qcur, inp_pos, nullptr,
  12329. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  12330. ext_factor, attn_factor, beta_fast, beta_slow
  12331. );
  12332. Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
  12333. cb(Kcur, "Kcur_normed", il);
  12334. Kcur = ggml_rope_ext(
  12335. ctx0, Kcur, inp_pos, nullptr,
  12336. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  12337. ext_factor, attn_factor, beta_fast, beta_slow
  12338. );
  12339. cb(Qcur, "Qcur", il);
  12340. cb(Kcur, "Kcur", il);
  12341. cb(Vcur, "Vcur", il);
  12342. cur = build_attn(inp_attn,
  12343. model.layers[il].wo, model.layers[il].bo,
  12344. Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  12345. }
  12346. if (il == n_layer - 1 && inp_out_ids) {
  12347. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  12348. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  12349. }
  12350. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  12351. cb(ffn_inp, "ffn_inp", il);
  12352. // MoE branch
  12353. cur = build_norm(ffn_inp,
  12354. model.layers[il].ffn_norm, NULL,
  12355. LLM_NORM_RMS, il);
  12356. cb(cur, "ffn_norm", il);
  12357. if ((uint32_t) il < hparams.n_layer_dense_lead) {
  12358. cur = build_ffn(cur,
  12359. model.layers[il].ffn_up, NULL, NULL,
  12360. model.layers[il].ffn_gate, NULL, NULL,
  12361. model.layers[il].ffn_down, NULL, NULL,
  12362. NULL,
  12363. LLM_FFN_SILU, LLM_FFN_PAR, il);
  12364. cb(cur, "ffn_out", il);
  12365. } else {
  12366. ggml_tensor * moe_out =
  12367. build_moe_ffn(cur,
  12368. model.layers[il].ffn_gate_inp,
  12369. model.layers[il].ffn_up_exps,
  12370. model.layers[il].ffn_gate_exps,
  12371. model.layers[il].ffn_down_exps,
  12372. model.layers[il].ffn_exp_probs_b,
  12373. n_expert, n_expert_used,
  12374. LLM_FFN_SILU, hparams.expert_weights_norm,
  12375. true, hparams.expert_weights_scale,
  12376. (llama_expert_gating_func_type) hparams.expert_gating_func,
  12377. il);
  12378. cb(moe_out, "ffn_moe_out", il);
  12379. {
  12380. ggml_tensor * ffn_shexp = build_ffn(cur,
  12381. model.layers[il].ffn_up_shexp, NULL, NULL,
  12382. model.layers[il].ffn_gate_shexp, NULL, NULL,
  12383. model.layers[il].ffn_down_shexp, NULL, NULL,
  12384. NULL,
  12385. LLM_FFN_SILU, LLM_FFN_PAR, il);
  12386. cb(ffn_shexp, "ffn_shexp", il);
  12387. cur = ggml_add(ctx0, moe_out, ffn_shexp);
  12388. cb(cur, "ffn_out", il);
  12389. }
  12390. }
  12391. cur = ggml_add(ctx0, cur, ffn_inp);
  12392. cur = build_cvec(cur, il);
  12393. cb(cur, "l_out", il);
  12394. // input for next layer
  12395. inpL = cur;
  12396. }
  12397. cur = inpL;
  12398. cur = build_norm(cur,
  12399. model.output_norm, NULL,
  12400. LLM_NORM_RMS, -1);
  12401. cb(cur, "result_norm", -1);
  12402. res->t_embd = cur;
  12403. // lm_head
  12404. cur = build_lora_mm(model.output, cur);
  12405. cb(cur, "result_output", -1);
  12406. res->t_logits = cur;
  12407. ggml_build_forward_expand(gf, cur);
  12408. }
  12409. };
  12410. struct llm_build_ernie4_5 : public llm_graph_context {
  12411. llm_build_ernie4_5(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  12412. const int64_t n_embd_head = hparams.n_embd_head_v;
  12413. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  12414. GGML_ASSERT(n_embd_head == hparams.n_rot);
  12415. ggml_tensor * cur;
  12416. ggml_tensor * inpL;
  12417. inpL = build_inp_embd(model.tok_embd);
  12418. // inp_pos - contains the positions
  12419. ggml_tensor * inp_pos = build_inp_pos();
  12420. auto * inp_attn = build_attn_inp_kv_unified();
  12421. for (int il = 0; il < n_layer; ++il) {
  12422. ggml_tensor * inpSA = inpL;
  12423. // norm
  12424. {
  12425. cur = build_norm(inpL,
  12426. model.layers[il].attn_norm, NULL,
  12427. LLM_NORM_RMS, il);
  12428. cb(cur, "attn_norm", il);
  12429. }
  12430. // self-attention
  12431. {
  12432. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  12433. cb(Qcur, "Qcur", il);
  12434. if (model.layers[il].bq) {
  12435. Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
  12436. cb(Qcur, "Qcur", il);
  12437. }
  12438. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  12439. cb(Kcur, "Kcur", il);
  12440. if (model.layers[il].bk) {
  12441. Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
  12442. cb(Kcur, "Kcur", il);
  12443. }
  12444. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  12445. cb(Vcur, "Vcur", il);
  12446. if (model.layers[il].bv) {
  12447. Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
  12448. cb(Vcur, "Vcur", il);
  12449. }
  12450. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  12451. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  12452. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  12453. Qcur = ggml_rope_ext(
  12454. ctx0, Qcur, inp_pos, nullptr,
  12455. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  12456. ext_factor, attn_factor, beta_fast, beta_slow
  12457. );
  12458. Kcur = ggml_rope_ext(
  12459. ctx0, Kcur, inp_pos, nullptr,
  12460. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  12461. ext_factor, attn_factor, beta_fast, beta_slow
  12462. );
  12463. cb(Qcur, "Qcur", il);
  12464. cb(Kcur, "Kcur", il);
  12465. cb(Vcur, "Vcur", il);
  12466. cur = build_attn(inp_attn,
  12467. model.layers[il].wo, NULL,
  12468. Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  12469. }
  12470. if (il == n_layer - 1) {
  12471. // skip computing output for unused tokens
  12472. ggml_tensor * inp_out_ids = build_inp_out_ids();
  12473. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  12474. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  12475. }
  12476. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  12477. cb(ffn_inp, "ffn_inp", il);
  12478. // feed-forward network
  12479. {
  12480. cur = build_norm(ffn_inp,
  12481. model.layers[il].ffn_norm, NULL,
  12482. LLM_NORM_RMS, il);
  12483. cb(cur, "ffn_norm", il);
  12484. cur = build_ffn(cur,
  12485. model.layers[il].ffn_up, NULL, NULL,
  12486. model.layers[il].ffn_gate, NULL, NULL,
  12487. model.layers[il].ffn_down, NULL, NULL,
  12488. NULL,
  12489. LLM_FFN_SILU, LLM_FFN_PAR, il);
  12490. cb(cur, "ffn_out", il);
  12491. }
  12492. cur = ggml_add(ctx0, cur, ffn_inp);
  12493. cur = build_cvec(cur, il);
  12494. cb(cur, "l_out", il);
  12495. // input for next layer
  12496. inpL = cur;
  12497. }
  12498. cur = inpL;
  12499. cur = build_norm(cur,
  12500. model.output_norm, NULL,
  12501. LLM_NORM_RMS, -1);
  12502. cb(cur, "result_norm", -1);
  12503. res->t_embd = cur;
  12504. // lm_head
  12505. cur = build_lora_mm(model.output, cur);
  12506. cb(cur, "result_output", -1);
  12507. res->t_logits = cur;
  12508. ggml_build_forward_expand(gf, cur);
  12509. }
  12510. };
  12511. struct llm_build_ernie4_5_moe : public llm_graph_context {
  12512. llm_build_ernie4_5_moe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  12513. const int64_t n_embd_head = hparams.n_embd_head_v;
  12514. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  12515. GGML_ASSERT(n_embd_head == hparams.n_rot);
  12516. ggml_tensor * cur;
  12517. ggml_tensor * inpL;
  12518. inpL = build_inp_embd(model.tok_embd);
  12519. // inp_pos - contains the positions
  12520. ggml_tensor * inp_pos = build_inp_pos();
  12521. auto * inp_attn = build_attn_inp_kv_unified();
  12522. ggml_tensor * inp_out_ids = build_inp_out_ids();
  12523. GGML_ASSERT(hparams.n_moe_layer_step > 0 && "Ernie 4.5 MoE requires n_moe_layer_step > 0");
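// layer selection (see is_moe_layer below): a layer uses the MoE branch when
// il >= n_layer_dense_lead and (il + 1) is a multiple of n_moe_layer_step;
// e.g. with the hypothetical values n_layer_dense_lead = 1 and n_moe_layer_step = 2,
// layers 1, 3, 5, ... are MoE while layer 0 stays dense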
  12524. for (int il = 0; il < n_layer; ++il) {
  12525. ggml_tensor * inpSA = inpL;
  12526. // norm
  12527. {
  12528. cur = build_norm(inpL,
  12529. model.layers[il].attn_norm, NULL,
  12530. LLM_NORM_RMS, il);
  12531. cb(cur, "attn_norm", il);
  12532. }
  12533. // self-attention
  12534. {
  12535. // compute Q and K and RoPE them
  12536. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  12537. cb(Qcur, "Qcur", il);
  12538. if (model.layers[il].bq) {
  12539. Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
  12540. cb(Qcur, "Qcur", il);
  12541. }
  12542. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  12543. cb(Kcur, "Kcur", il);
  12544. if (model.layers[il].bk) {
  12545. Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
  12546. cb(Kcur, "Kcur", il);
  12547. }
  12548. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  12549. cb(Vcur, "Vcur", il);
  12550. if (model.layers[il].bv) {
  12551. Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
  12552. cb(Vcur, "Vcur", il);
  12553. }
  12554. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  12555. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  12556. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  12557. Qcur = ggml_rope_ext(
  12558. ctx0, Qcur, inp_pos, nullptr,
  12559. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  12560. ext_factor, attn_factor, beta_fast, beta_slow
  12561. );
  12562. Kcur = ggml_rope_ext(
  12563. ctx0, Kcur, inp_pos, nullptr,
  12564. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  12565. ext_factor, attn_factor, beta_fast, beta_slow
  12566. );
  12567. cb(Qcur, "Qcur", il);
  12568. cb(Kcur, "Kcur", il);
  12569. cb(Vcur, "Vcur", il);
  12570. cur = build_attn(inp_attn,
  12571. model.layers[il].wo, NULL,
  12572. Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  12573. cb(cur, "attn_out", il);
  12574. }
  12575. if (il == n_layer - 1 && inp_out_ids) {
  12576. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  12577. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  12578. }
  12579. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  12580. cb(ffn_inp, "ffn_inp", il);
  12581. // feed-forward network
  12582. bool is_moe_layer = static_cast<uint32_t>(il) >= hparams.n_layer_dense_lead && (il + 1) % hparams.n_moe_layer_step == 0;
  12583. if (!is_moe_layer) {
  12584. cur = build_norm(ffn_inp,
  12585. model.layers[il].ffn_norm, NULL,
  12586. LLM_NORM_RMS, il);
  12587. cb(cur, "ffn_norm", il);
  12588. cur = build_ffn(cur,
  12589. model.layers[il].ffn_up, NULL, NULL,
  12590. model.layers[il].ffn_gate, NULL, NULL,
  12591. model.layers[il].ffn_down, NULL, NULL,
  12592. NULL,
  12593. LLM_FFN_SILU, LLM_FFN_PAR, il);
  12594. cb(cur, "ffn_out", il);
  12595. } else {
  12596. // MoE branch
  12597. cur = build_norm(ffn_inp,
  12598. model.layers[il].ffn_norm, NULL,
  12599. LLM_NORM_RMS, il);
  12600. cb(cur, "ffn_norm", il);
  12601. ggml_tensor * moe_out = build_moe_ffn(cur,
  12602. model.layers[il].ffn_gate_inp,
  12603. model.layers[il].ffn_up_exps,
  12604. model.layers[il].ffn_gate_exps,
  12605. model.layers[il].ffn_down_exps,
  12606. model.layers[il].ffn_exp_probs_b,
  12607. n_expert, n_expert_used,
  12608. LLM_FFN_SILU, true,
  12609. false, 0.0,
  12610. LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
  12611. il);
  12612. cb(moe_out, "ffn_moe_out", il);
  12613. // Shared expert (if present)
  12614. if (hparams.n_ff_shexp > 0) {
  12615. ggml_tensor * ffn_shexp = build_ffn(cur,
  12616. model.layers[il].ffn_up_shexp, NULL, NULL,
  12617. model.layers[il].ffn_gate_shexp, NULL, NULL,
  12618. model.layers[il].ffn_down_shexp, NULL, NULL,
  12619. NULL,
  12620. LLM_FFN_SILU, LLM_FFN_PAR, il);
  12621. cb(ffn_shexp, "ffn_shexp", il);
  12622. cur = ggml_add(ctx0, moe_out, ffn_shexp);
  12623. } else {
  12624. cur = moe_out;
  12625. }
  12626. cb(cur, "ffn_out", il);
  12627. }
  12628. cur = ggml_add(ctx0, cur, ffn_inp);
  12629. cb(cur, "ffn_out", il);
  12630. cur = build_cvec(cur, il);
  12631. cb(cur, "l_out", il);
  12632. // input for next layer
  12633. inpL = cur;
  12634. }
  12635. cur = inpL;
  12636. cur = build_norm(cur,
  12637. model.output_norm, NULL,
  12638. LLM_NORM_RMS, -1);
  12639. cb(cur, "result_norm", -1);
  12640. res->t_embd = cur;
  12641. // lm_head
  12642. cur = build_lora_mm(model.output, cur);
  12643. cb(cur, "result_output", -1);
  12644. res->t_logits = cur;
  12645. ggml_build_forward_expand(gf, cur);
  12646. }
  12647. };
  12648. struct llm_build_falcon_h1 : public llm_graph_context_mamba {
  12649. llm_build_falcon_h1(const llama_model & model, const llm_graph_params & params) : llm_graph_context_mamba(params) {
  12650. const int64_t n_embd_head = hparams.n_embd_head_v;
  12651. ggml_tensor * cur;
  12652. ggml_tensor * inpL;
  12653. inpL = build_inp_embd(model.tok_embd);
  12654. // inp_pos - contains the positions
  12655. ggml_tensor * inp_pos = build_inp_pos();
12656. // build the inputs for both the recurrent state and the KV cache
  12657. auto * inp = build_inp_mem_hybrid();
  12658. const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
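// when f_attention_scale is unset (0.0f), fall back to the standard 1/sqrt(n_embd_head)
// attention scaling, otherwise use the model-provided scale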
  12659. ggml_tensor * inp_out_ids = build_inp_out_ids();
  12660. for (int il = 0; il < n_layer; ++il) {
  12661. ggml_tensor * inpSA = inpL;
  12662. cur = build_norm(inpL,
  12663. model.layers[il].attn_norm, NULL,
  12664. LLM_NORM_RMS, il);
  12665. cb(cur, "attn_norm", il);
  12666. // self-attention
  12667. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  12668. cb(Qcur, "Qcur", il);
  12669. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  12670. cb(Kcur, "Kcur", il);
  12671. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  12672. cb(Vcur, "Vcur", il);
  12673. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  12674. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  12675. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  12676. Qcur = ggml_rope_ext(
  12677. ctx0, Qcur, inp_pos, nullptr,
  12678. n_rot, hparams.rope_type, n_ctx_orig, freq_base, freq_scale,
  12679. ext_factor, attn_factor, beta_fast, beta_slow);
  12680. Kcur = ggml_rope_ext(
  12681. ctx0, Kcur, inp_pos, nullptr,
  12682. n_rot, hparams.rope_type, n_ctx_orig, freq_base, freq_scale,
  12683. ext_factor, attn_factor, beta_fast, beta_slow
  12684. );
  12685. cb(Qcur, "Qcur-post-rope", il);
  12686. cb(Kcur, "Kcur-post-rope", il);
  12687. cb(Vcur, "Vcur-post-rope", il);
  12688. ggml_tensor * attn_out = build_attn(inp->get_attn(),
  12689. model.layers[il].wo, NULL,
  12690. Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
  12691. cb(attn_out, "attn_out", il);
  12692. cur = build_norm(inpL,
  12693. model.layers[il].attn_norm, NULL,
  12694. LLM_NORM_RMS, il);
  12695. // Mamba2 layer
  12696. cb(cur, "ssm_in", il);
  12697. ggml_tensor * ssm_out = build_mamba2_layer(inp->get_recr(), cur, model, ubatch, il);
  12698. cb(ssm_out, "ssm_out", il);
12699. // aggregation of the attention and SSM branch outputs
  12700. cur = ggml_add(ctx0, attn_out, ssm_out);
  12701. inpSA = ggml_add(ctx0, cur, inpSA);
  12702. cb(cur, "layer_out", il);
  12703. if (il == n_layer - 1 && inp_out_ids) {
  12704. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  12705. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  12706. }
  12707. ggml_tensor * ffn_inp = inpSA;
  12708. cb(ffn_inp, "ffn_inp", il);
  12709. // feed-forward network
  12710. cur = build_norm(ffn_inp,
  12711. model.layers[il].ffn_norm, NULL,
  12712. LLM_NORM_RMS, il);
  12713. cb(cur, "ffn_norm", il);
  12714. cur = build_ffn(cur,
  12715. model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
  12716. model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
  12717. model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
  12718. NULL,
  12719. LLM_FFN_SILU, LLM_FFN_PAR, il);
  12720. cb(cur, "ffn_out", il);
  12721. cur = ggml_add(ctx0, cur, inpSA);
  12722. cur = build_cvec(cur, il);
  12723. cb(cur, "l_out", il);
  12724. // input for next layer
  12725. inpL = cur;
  12726. }
  12727. cur = inpL;
  12728. cur = build_norm(cur,
  12729. model.output_norm, NULL,
  12730. LLM_NORM_RMS, -1);
  12731. cb(cur, "result_norm", -1);
  12732. res->t_embd = cur;
  12733. // lm_head
  12734. cur = build_lora_mm(model.output, cur);
  12735. cb(cur, "result_output", -1);
  12736. res->t_logits = cur;
  12737. ggml_build_forward_expand(gf, cur);
  12738. }
  12739. };
  12740. struct llm_build_plamo2 : public llm_graph_context_mamba {
  12741. llm_build_plamo2(const llama_model & model, const llm_graph_params & params) : llm_graph_context_mamba(params) {
  12742. ggml_tensor * cur;
  12743. ggml_tensor * inpL;
  12744. // {n_embd, n_tokens}
  12745. inpL = build_inp_embd(model.tok_embd);
  12746. cb(inpL, "embedding_output", -1);
  12747. ggml_tensor * inp_pos = build_inp_pos();
  12748. auto * inp_hybrid = build_inp_mem_hybrid();
  12749. ggml_tensor * inp_out_ids = build_inp_out_ids();
  12750. for (int il = 0; il < n_layer; ++il) {
  12751. ggml_tensor * residual = inpL;
  12752. // ggml_graph_add_node(gf, model.layers[il].attn_norm);
  12753. // cb(model.layers[il].attn_norm, "attn_norm", il);
  12754. // pre_mixer_norm
  12755. cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
  12756. // check if this layer is Mamba or Attention
  12757. bool is_mamba_layer = hparams.is_recurrent(il);
  12758. if (is_mamba_layer) {
  12759. // PLaMo-2 Mamba layer
  12760. cur = build_plamo2_mamba_layer(inp_hybrid->get_recr(), cur, model, ubatch, il);
  12761. } else {
  12762. // PLaMo-2 Attention layer
  12763. cur = build_plamo2_attn_layer(inp_hybrid->get_attn(), inp_pos, cur, model, il);
  12764. }
  12765. // post_mixer_norm
  12766. cur = build_norm(cur, model.layers[il].attn_post_norm, NULL, LLM_NORM_RMS, il);
  12767. cb(cur, "attn_post_norm", il);
  12768. // residual connection
  12769. cur = ggml_add(ctx0, cur, residual);
  12770. cb(cur, "attn_residual", il);
  12771. residual = cur;
  12772. // pre-ffn norm
  12773. cur = build_norm(cur, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il);
  12774. cb(cur, "ffn_pre_norm", il);
  12775. // feed-forward network
  12776. cur = build_ffn(cur,
  12777. model.layers[il].ffn_up, NULL, NULL,
  12778. NULL, NULL, NULL,
  12779. model.layers[il].ffn_down, NULL, NULL,
  12780. NULL,
  12781. LLM_FFN_SWIGLU, LLM_FFN_SEQ, il);
  12782. cb(cur, "ffn_out", il);
  12783. // post ffn norm
  12784. cur = build_norm(cur, model.layers[il].ffn_post_norm, NULL, LLM_NORM_RMS, il);
  12785. cb(cur, "ffn_post_norm", il);
  12786. if (il == n_layer - 1 && inp_out_ids) {
  12787. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  12788. residual = ggml_get_rows(ctx0, residual, inp_out_ids);
  12789. }
  12790. // residual connection
  12791. cur = ggml_add(ctx0, cur, residual);
  12792. cb(cur, "ffn_residual", il);
  12793. inpL = cur;
  12794. }
  12795. cur = inpL;
  12796. // final norm
  12797. cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
  12798. cb(cur, "result_norm", -1);
  12799. // lm_head
  12800. cur = build_lora_mm(model.output, cur);
  12801. cb(cur, "result_output", -1);
  12802. // Explicitly mark as output tensor to ensure proper backend assignment
  12803. ggml_set_output(cur);
  12804. res->t_logits = cur;
  12805. ggml_build_forward_expand(gf, cur);
  12806. }
  12807. private:
  12808. ggml_tensor * build_plamo2_attn_layer(
  12809. llm_graph_input_attn_kv_unified * inp,
  12810. ggml_tensor * inp_pos,
  12811. ggml_tensor * cur,
  12812. const llama_model & model,
  12813. int il) {
  12814. // self-attention
  12815. {
12816. // PLaMo-2 uses a combined QKV projection tensor
  12817. ggml_tensor * qkv = build_lora_mm(model.layers[il].wqkv, cur);
  12818. cb(qkv, "qkv", il);
  12819. // split QKV tensor into Q, K, V
  12820. const int64_t n_embd_head_q = hparams.n_embd_head_k;
  12821. const int64_t n_embd_head_k = hparams.n_embd_head_k;
  12822. const int64_t n_embd_head_v = hparams.n_embd_head_v;
  12823. int32_t n_head_kv = hparams.n_head_kv(il);
  12824. const int64_t q_offset = 0;
  12825. const int64_t k_offset = n_embd_head_q * n_head;
  12826. const int64_t v_offset = k_offset + n_embd_head_k * n_head_kv;
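// the packed QKV projection is laid out as [Q | K | V] along dim 0:
// Q starts at element 0, K at n_embd_head_q * n_head, and V right after K;
// the views below slice the three regions out of the single qkv tensor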
  12827. ggml_tensor * Qcur = ggml_view_3d(ctx0, qkv, n_embd_head_q, n_head, n_tokens, n_embd_head_q * sizeof(float), qkv->nb[1], q_offset * ggml_element_size(qkv));
  12828. ggml_tensor * Kcur = ggml_view_3d(ctx0, qkv, n_embd_head_k, n_head_kv, n_tokens, n_embd_head_k * sizeof(float), qkv->nb[1], k_offset * ggml_element_size(qkv));
  12829. ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, qkv, n_embd_head_v * n_head_kv, n_tokens, qkv->nb[1], v_offset * ggml_element_size(qkv)));
  12830. cb(Qcur, "Qcur", il);
  12831. cb(Kcur, "Kcur", il);
  12832. cb(Vcur, "Vcur", il);
  12833. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head_v, n_head_kv, n_tokens);
  12834. Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
  12835. cb(Qcur, "Qcur_normed", il);
  12836. Qcur = ggml_rope_ext(
  12837. ctx0, Qcur, inp_pos, nullptr,
  12838. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  12839. ext_factor, attn_factor, beta_fast, beta_slow
  12840. );
  12841. Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
  12842. cb(Kcur, "Kcur_normed", il);
  12843. Kcur = ggml_rope_ext(
  12844. ctx0, Kcur, inp_pos, nullptr,
  12845. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  12846. ext_factor, attn_factor, beta_fast, beta_slow
  12847. );
  12848. cur = build_attn(inp, model.layers[il].wo, NULL, Qcur, Kcur, Vcur, NULL, NULL, 1.0f, il);
  12849. }
  12850. cb(cur, "attn_out", il);
  12851. return cur;
  12852. }
  12853. ggml_tensor * build_plamo2_mamba_layer(
  12854. llm_graph_input_rs * inp,
  12855. ggml_tensor * cur,
  12856. const llama_model & model,
  12857. const llama_ubatch & ubatch,
  12858. int il) {
  12859. const auto * mctx_cur = inp->mctx;
  12860. const auto kv_head = mctx_cur->get_head();
  12861. const int64_t d_conv = hparams.ssm_d_conv;
  12862. const int64_t d_inner = hparams.ssm_d_inner;
  12863. const int64_t d_state = hparams.ssm_d_state;
  12864. const int64_t n_heads = hparams.ssm_dt_rank;
  12865. const int64_t head_dim = d_inner / n_heads;
  12866. const int64_t n_group = hparams.ssm_n_group;
  12867. const int64_t n_seqs = ubatch.n_seqs;
  12868. const int64_t n_seq_tokens = ubatch.n_seq_tokens;
  12869. GGML_ASSERT(n_seqs != 0);
  12870. GGML_ASSERT(ubatch.equal_seqs());
  12871. GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs);
  12872. ggml_tensor * conv_states_all = mctx_cur->get_r_l(il);
  12873. ggml_tensor * ssm_states_all = mctx_cur->get_s_l(il);
  12874. ggml_tensor * conv = build_rs(inp, conv_states_all, hparams.n_embd_r(), n_seqs);
  12875. conv = ggml_reshape_3d(ctx0, conv, d_conv - 1, d_inner + 2*n_group*d_state, n_seqs);
  12876. // {n_embd, n_tokens} => {n_embd, n_seq_tokens, n_seqs}
  12877. cur = ggml_reshape_3d(ctx0, cur, cur->ne[0], n_seq_tokens, n_seqs);
  12878. // in_proj: {n_embd, 2*d_inner} @ {n_embd, n_seq_tokens, n_seqs} => {2*d_inner, n_seq_tokens, n_seqs}
  12879. ggml_tensor * zx = build_lora_mm(model.layers[il].ssm_in, cur);
  12880. cb(zx, "mamba_in_proj", il);
  12881. // {8192, 5, 1, 1} -> {8192, 1, 5, 1}
  12882. zx = ggml_permute(ctx0, zx, 0, 2, 1, 3);
  12883. zx = ggml_cont(ctx0, zx);
  12884. zx = ggml_reshape_4d(ctx0, zx, head_dim * 2, n_heads, n_seq_tokens, n_seqs);
  12885. cb(zx, "mamba_in_proj_out", il);
  12886. // split into z and x
  12887. // => {head_dim * n_heads, n_seq_tokens, n_seqs}
  12888. ggml_tensor * x = ggml_view_4d(ctx0, zx, head_dim, n_heads, n_seq_tokens, n_seqs, zx->nb[1], zx->nb[2], zx->nb[3], head_dim*ggml_element_size(zx));
  12889. x = ggml_cont(ctx0, x);
  12890. x = ggml_reshape_3d(ctx0, x, head_dim * n_heads, n_seq_tokens, n_seqs);
  12891. // x = ggml_permute(ctx0, x, 0, 2, 1, 3);
  12892. cb(x, "mamba_x_split", il);
  12893. ggml_tensor * z = ggml_view_4d(ctx0, zx, head_dim, n_heads, n_seq_tokens, n_seqs, zx->nb[1], zx->nb[2], zx->nb[3], 0);
  12894. cb(z, "mamba_z_split", il);
  12895. // conv1d
  12896. {
  12897. // => {d_conv - 1 + n_seq_tokens, d_inner, n_seqs}
  12898. ggml_tensor * conv_x = ggml_concat(ctx0, conv, ggml_transpose(ctx0, x), 0);
  12899. cb(conv_x, "mamba_conv1d_input", il);
  12900. // copy last (d_conv - 1) columns back into the state cache
  12901. ggml_tensor * last_conv = ggml_view_3d(ctx0, conv_x, d_conv - 1, d_inner, n_seqs,
  12902. conv_x->nb[1], conv_x->nb[2], n_seq_tokens*(conv_x->nb[0]));
  12903. ggml_build_forward_expand(gf,
  12904. ggml_cpy(ctx0, last_conv,
  12905. ggml_view_1d(ctx0, conv_states_all,
  12906. (d_conv - 1)*(d_inner)*(n_seqs),
  12907. kv_head*(d_conv - 1)*(d_inner)*ggml_element_size(conv_states_all))));
  12908. // 1D convolution
  12909. x = ggml_ssm_conv(ctx0, conv_x, model.layers[il].ssm_conv1d);
  12910. cb(x, "mamba_conv1d", il);
  12911. x = ggml_silu(ctx0, x);
  12912. cb(x, "mamba_conv1d_silu", il);
  12913. }
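// at this point the convolution state has been rolled forward: the last
// (d_conv - 1) columns of conv_x were copied back into conv_states_all above,
// so the next ubatch continues the causal convolution where this one left off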
  12914. // SSM
  12915. {
  12916. // bcdt_proj: {d_inner, dt_rank + 2*d_state} @ {d_inner, n_seq_tokens, n_seqs} => {dt_rank + 2*d_state, n_seq_tokens, n_seqs}
  12917. ggml_tensor * x_bcdt = build_lora_mm(model.layers[il].ssm_x, x);
  12918. cb(x_bcdt, "mamba_bcdt_proj", il);
  12919. // split into dt, B, C
  12920. const int64_t dt_dim = std::max(64, int(hparams.n_embd / 16));
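// dt_dim is clamped from below at 64; e.g. with a hypothetical n_embd = 2048
// this gives max(64, 2048/16) = 128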
  12921. ggml_tensor * B = ggml_view_3d(ctx0, x_bcdt, d_state, n_seq_tokens, n_seqs, x_bcdt->nb[1], x_bcdt->nb[2], 0);
  12922. ggml_tensor * C = ggml_view_3d(ctx0, x_bcdt, d_state, n_seq_tokens, n_seqs, x_bcdt->nb[1], x_bcdt->nb[2], ggml_element_size(x_bcdt)*d_state);
  12923. ggml_tensor * dt = ggml_view_3d(ctx0, x_bcdt, dt_dim, n_seq_tokens, n_seqs, x_bcdt->nb[1], x_bcdt->nb[2], ggml_element_size(x_bcdt)*(2*d_state));
  12924. cb(B, "mamba_B_raw", il);
  12925. cb(C, "mamba_C_raw", il);
  12926. cb(dt, "mamba_dt_raw", il);
  12927. // Apply RMS norm to dt, B, C (PLaMo-2 specific)
  12928. B = build_norm(B, model.layers[il].ssm_b_norm, NULL, LLM_NORM_RMS, il);
  12929. C = build_norm(C, model.layers[il].ssm_c_norm, NULL, LLM_NORM_RMS, il);
  12930. dt = build_norm(dt, model.layers[il].ssm_dt_norm, NULL, LLM_NORM_RMS, il);
  12931. cb(B, "mamba_B_normed", il);
  12932. cb(C, "mamba_C_normed", il);
  12933. cb(dt, "mamba_dt_normed", il);
  12934. // dt_proj: {dt_rank, d_inner} @ {dt_rank, n_seq_tokens, n_seqs} => {d_inner, n_seq_tokens, n_seqs}
  12935. dt = build_lora_mm(model.layers[il].ssm_dt, dt);
  12936. dt = ggml_add(ctx0, dt, model.layers[il].ssm_dt_b);
  12937. cb(dt, "mamba_dt_proj", il);
  12938. ggml_tensor * A = ggml_reshape_2d(ctx0, model.layers[il].ssm_a, 1, n_heads);
  12939. cb(A, "mamba_A", il);
  12940. x = ggml_view_4d(ctx0, x, head_dim, n_heads, n_seq_tokens, n_seqs, head_dim * ggml_element_size(x), head_dim * n_heads * ggml_element_size(x), head_dim * n_heads * n_seq_tokens * ggml_element_size(x), 0);
  12941. B = ggml_view_4d(ctx0, B, d_state, 1, n_seq_tokens, n_seqs, d_state * B->nb[0], B->nb[1], B->nb[2], 0);
  12942. C = ggml_view_4d(ctx0, C, d_state, 1, n_seq_tokens, n_seqs, d_state * C->nb[0], C->nb[1], C->nb[2], 0);
  12943. // use the states and the indices provided by build_recurrent_state
12944. // (this is necessary in order to use the states properly before they are overwritten,
12945. // while avoiding unnecessary copies of the states)
  12946. auto get_ssm_rows = [&](ggml_context * ctx, ggml_tensor * states, ggml_tensor * ids) {
  12947. ggml_tensor * ssm = ggml_reshape_4d(ctx, states, d_state, head_dim, n_heads, mctx_cur->get_size());
  12948. // Custom operator to optimize the parallel associative scan
12949. // as described in Annex D of the Mamba paper.
  12950. // => {d_inner, n_seq_tokens, n_seqs} and {d_state, d_inner, n_seqs}
  12951. return ggml_ssm_scan(ctx, ssm, x, dt, A, B, C, ids);
  12952. };
  12953. ggml_tensor * y_ssm = build_rs(inp, ssm_states_all, hparams.n_embd_s(), ubatch.n_seqs, get_ssm_rows);
  12954. cb(y_ssm, "mamba_ssm_scan", il);
  12955. // store last states
  12956. ggml_build_forward_expand(gf,
  12957. ggml_cpy(ctx0,
  12958. ggml_view_1d(ctx0, y_ssm, d_state*d_inner*n_seqs, x->nb[3]*x->ne[3]),
  12959. ggml_view_1d(ctx0, ssm_states_all, d_state*d_inner*n_seqs,
  12960. kv_head*d_state*d_inner*ggml_element_size(ssm_states_all))));
  12961. ggml_tensor * y = ggml_view_4d(ctx0, y_ssm, head_dim, n_heads, n_seq_tokens, n_seqs, head_dim * ggml_element_size(x), head_dim * n_heads * ggml_element_size(x), head_dim * n_heads * n_seq_tokens * ggml_element_size(x), 0);
  12962. cb(y, "mamba_y_view", il);
  12963. // Add D parameter and apply gating with z
  12964. // {d_inner, n_seq_tokens, n_seqs} * {d_inner} => {d_inner, n_seq_tokens, n_seqs}
  12965. ggml_tensor * D = ggml_reshape_2d(ctx0, model.layers[il].ssm_d, 1, n_heads);
  12966. y = ggml_add(ctx0, y, ggml_mul(ctx0, x, D));
  12967. cb(y, "mamba_y_add_d", il);
  12968. y = ggml_swiglu_split(ctx0, ggml_cont(ctx0, z), y);
  12969. cb(y, "mamba_y_swiglu_z", il);
  12970. // out_proj: {d_inner, n_embd} @ {d_inner, n_seq_tokens, n_seqs} => {n_embd, n_seq_tokens, n_seqs}
  12971. y = ggml_view_3d(ctx0, y, head_dim * n_heads, n_seq_tokens, n_seqs, y->nb[2], y->nb[3], 0);
  12972. cur = build_lora_mm(model.layers[il].ssm_out, y);
  12973. cb(cur, "mamba_out_proj", il);
  12974. }
  12975. // {n_embd, n_seq_tokens, n_seqs} => {n_embd, n_tokens}
  12976. cur = ggml_reshape_2d(ctx0, cur, cur->ne[0], n_seq_tokens * n_seqs);
  12977. cb(cur, "mamba_out", il);
  12978. return cur;
  12979. }
  12980. };
  12981. struct llm_build_arcee : public llm_graph_context {
  12982. llm_build_arcee(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  12983. const int64_t n_embd_head = hparams.n_embd_head_v;
  12984. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  12985. GGML_ASSERT(n_embd_head == hparams.n_rot);
  12986. ggml_tensor * cur;
  12987. ggml_tensor * inpL;
  12988. inpL = build_inp_embd(model.tok_embd);
  12989. // inp_pos - contains the positions
  12990. ggml_tensor * inp_pos = build_inp_pos();
  12991. auto * inp_attn = build_attn_inp_kv_unified();
  12992. const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
  12993. ggml_tensor * inp_out_ids = build_inp_out_ids();
  12994. for (int il = 0; il < n_layer; ++il) {
  12995. ggml_tensor * inpSA = inpL;
  12996. // norm
  12997. cur = build_norm(inpL,
  12998. model.layers[il].attn_norm, NULL,
  12999. LLM_NORM_RMS, il);
  13000. cb(cur, "attn_norm", il);
  13001. // self-attention
  13002. {
  13003. // rope freq factors for llama3; may return nullptr for llama2 and other models
  13004. ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
  13005. // compute Q and K and RoPE them
  13006. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  13007. cb(Qcur, "Qcur", il);
  13008. if (model.layers[il].bq) {
  13009. Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
  13010. cb(Qcur, "Qcur", il);
  13011. }
  13012. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  13013. cb(Kcur, "Kcur", il);
  13014. if (model.layers[il].bk) {
  13015. Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
  13016. cb(Kcur, "Kcur", il);
  13017. }
  13018. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  13019. cb(Vcur, "Vcur", il);
  13020. if (model.layers[il].bv) {
  13021. Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
  13022. cb(Vcur, "Vcur", il);
  13023. }
  13024. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  13025. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  13026. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  13027. Qcur = ggml_rope_ext(
  13028. ctx0, Qcur, inp_pos, rope_factors,
  13029. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  13030. ext_factor, attn_factor, beta_fast, beta_slow
  13031. );
  13032. Kcur = ggml_rope_ext(
  13033. ctx0, Kcur, inp_pos, rope_factors,
  13034. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  13035. ext_factor, attn_factor, beta_fast, beta_slow
  13036. );
  13037. cb(Qcur, "Qcur", il);
  13038. cb(Kcur, "Kcur", il);
  13039. cb(Vcur, "Vcur", il);
  13040. cur = build_attn(inp_attn,
  13041. model.layers[il].wo, model.layers[il].bo,
  13042. Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
  13043. cb(cur, "attn_out", il);
  13044. }
  13045. if (il == n_layer - 1 && inp_out_ids) {
  13046. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  13047. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  13048. }
  13049. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  13050. cb(ffn_inp, "ffn_inp", il);
  13051. // feed-forward network
  13052. // ARCEE uses relu^2 instead of silu
  13053. cur = build_norm(ffn_inp,
  13054. model.layers[il].ffn_norm, NULL,
  13055. LLM_NORM_RMS, il);
  13056. cb(cur, "ffn_norm", il);
  13057. cur = build_ffn(cur,
  13058. model.layers[il].ffn_up, NULL, NULL,
  13059. NULL, NULL, NULL,
  13060. model.layers[il].ffn_down, NULL, NULL,
  13061. NULL,
  13062. LLM_FFN_RELU_SQR, LLM_FFN_SEQ, il);
  13063. cb(cur, "ffn_out", il);
  13064. cur = ggml_add(ctx0, cur, ffn_inp);
  13065. cb(cur, "ffn_out", il);
  13066. cur = build_cvec(cur, il);
  13067. cb(cur, "l_out", il);
  13068. // input for next layer
  13069. inpL = cur;
  13070. }
  13071. cur = inpL;
  13072. cur = build_norm(cur,
  13073. model.output_norm, NULL,
  13074. LLM_NORM_RMS, -1);
  13075. cb(cur, "result_norm", -1);
  13076. res->t_embd = cur;
  13077. // lm_head
  13078. cur = build_lora_mm(model.output, cur);
  13079. cb(cur, "result_output", -1);
  13080. res->t_logits = cur;
  13081. ggml_build_forward_expand(gf, cur);
  13082. }
  13083. };
  13084. struct llm_build_hunyuan_moe : public llm_graph_context {
  13085. llm_build_hunyuan_moe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  13086. const int64_t n_embd_head = hparams.n_embd_head_v;
  13087. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  13088. GGML_ASSERT(n_embd_head == hparams.n_rot);
  13089. ggml_tensor * cur;
  13090. ggml_tensor * inpL;
  13091. inpL = build_inp_embd(model.tok_embd);
  13092. // inp_pos - contains the positions
  13093. ggml_tensor * inp_pos = build_inp_pos();
  13094. auto * inp_attn = build_attn_inp_kv_unified();
  13095. const float kq_scale = 1.0f / sqrtf(float(n_embd_head));
  13096. ggml_tensor * inp_out_ids = build_inp_out_ids();
  13097. for (int il = 0; il < n_layer; ++il) {
  13098. ggml_tensor * inpSA = inpL;
  13099. // norm
  13100. cur = build_norm(inpL,
  13101. model.layers[il].attn_norm, NULL,
  13102. LLM_NORM_RMS, il);
  13103. cb(cur, "attn_norm", il);
  13104. // self-attention
  13105. {
  13106. // rope freq factors for llama3; may return nullptr for llama2 and other models
  13107. ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
  13108. // compute Q and K and RoPE them
  13109. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  13110. cb(Qcur, "Qcur", il);
  13111. if (model.layers[il].bq) {
  13112. Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
  13113. cb(Qcur, "Qcur", il);
  13114. }
  13115. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  13116. cb(Kcur, "Kcur", il);
  13117. if (model.layers[il].bk) {
  13118. Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
  13119. cb(Kcur, "Kcur", il);
  13120. }
  13121. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  13122. cb(Vcur, "Vcur", il);
  13123. if (model.layers[il].bv) {
  13124. Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
  13125. cb(Vcur, "Vcur", il);
  13126. }
  13127. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  13128. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  13129. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  13130. Qcur = ggml_rope_ext(
  13131. ctx0, Qcur, inp_pos, rope_factors,
  13132. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  13133. ext_factor, attn_factor, beta_fast, beta_slow
  13134. );
  13135. cb(Qcur, "Qcur", il);
  13136. cb(Kcur, "Kcur", il);
  13137. cb(Vcur, "Vcur", il);
  13138. Kcur = ggml_rope_ext(
  13139. ctx0, Kcur, inp_pos, rope_factors,
  13140. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  13141. ext_factor, attn_factor, beta_fast, beta_slow
  13142. );
  13143. Kcur = build_norm(Kcur,
  13144. model.layers[il].attn_k_norm, nullptr,
  13145. LLM_NORM_RMS, il);
  13146. cb(Kcur, "Kcur_norm", il);
  13147. Qcur = build_norm(Qcur,
  13148. model.layers[il].attn_q_norm, nullptr,
  13149. LLM_NORM_RMS, il);
  13150. cb(Qcur, "Qcur_norm", il);
  13151. cur = build_attn(inp_attn,
  13152. model.layers[il].wo, model.layers[il].bo,
  13153. Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
  13154. cb(cur, "attn_out", il);
  13155. }
  13156. if (il == n_layer - 1 && inp_out_ids) {
  13157. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  13158. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  13159. }
  13160. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  13161. cb(ffn_inp, "ffn_inp", il);
  13162. cur = build_norm(ffn_inp,
  13163. model.layers[il].ffn_norm, NULL,
  13164. LLM_NORM_RMS, il);
  13165. cb(cur, "ffn_norm", il);
13166. // shared-expert feed-forward network (dense, always-active path)
  13167. ggml_tensor * cur_mlp = build_ffn(cur,
  13168. model.layers[il].ffn_up_shexp, NULL, NULL,
  13169. model.layers[il].ffn_gate_shexp, NULL, NULL,
  13170. model.layers[il].ffn_down_shexp, NULL, NULL,
  13171. NULL,
  13172. LLM_FFN_SILU, LLM_FFN_PAR, il);
  13173. cb(cur_mlp, "ffn_mlp", il);
  13174. // MoE branch
  13175. ggml_tensor * cur_moe = build_moe_ffn(cur,
  13176. model.layers[il].ffn_gate_inp,
  13177. model.layers[il].ffn_up_exps,
  13178. model.layers[il].ffn_gate_exps,
  13179. model.layers[il].ffn_down_exps,
  13180. nullptr,
  13181. n_expert, n_expert_used,
  13182. LLM_FFN_SILU,
  13183. true, // norm_topk_prob
  13184. false,
  13185. 0.0,
  13186. LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
  13187. il);
  13188. cb(cur_moe, "ffn_moe_out", il);
  13189. ggml_tensor * ffn_out = ggml_add(ctx0, cur_moe, cur_mlp);
  13190. cb(ffn_out, "ffn_out", il);
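// as in the other shared-expert MoE layers above, the routed expert output and
// the dense shared MLP output are summed before the residual connection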
  13191. cur = ggml_add(ctx0, ffn_out, ffn_inp);
  13192. cur = build_cvec(cur, il);
  13193. cb(cur, "l_out", il);
  13194. // input for next layer
  13195. inpL = cur;
  13196. }
  13197. cur = inpL;
  13198. cur = build_norm(cur,
  13199. model.output_norm, NULL,
  13200. LLM_NORM_RMS, -1);
  13201. cb(cur, "result_norm", -1);
  13202. res->t_embd = cur;
  13203. // lm_head
  13204. cur = build_lora_mm(model.output, cur);
  13205. cb(cur, "result_output", -1);
  13206. res->t_logits = cur;
  13207. ggml_build_forward_expand(gf, cur);
  13208. }
  13209. };
  13210. struct llm_build_smollm3 : public llm_graph_context {
  13211. llm_build_smollm3(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  13212. const int64_t n_embd_head = hparams.n_embd_head_v;
  13213. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  13214. GGML_ASSERT(n_embd_head == hparams.n_rot);
  13215. ggml_tensor * cur;
  13216. ggml_tensor * inpL;
  13217. inpL = build_inp_embd(model.tok_embd);
  13218. // inp_pos - contains the positions
  13219. ggml_tensor * inp_pos = build_inp_pos();
  13220. auto * inp_attn = build_attn_inp_kv_unified();
  13221. const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
  13222. ggml_tensor * inp_out_ids = build_inp_out_ids();
  13223. for (int il = 0; il < n_layer; ++il) {
  13224. ggml_tensor * inpSA = inpL;
  13225. const bool use_rope = (il + 1) % hparams.n_no_rope_layer_step != 0;
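// SmolLM3 interleaves NoPE layers: RoPE is skipped whenever (il + 1) is a multiple
// of n_no_rope_layer_step; e.g. with the hypothetical value n_no_rope_layer_step = 4,
// layers 3, 7, 11, ... apply no positional rotation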
  13226. // norm
  13227. cur = build_norm(inpL,
  13228. model.layers[il].attn_norm, NULL,
  13229. LLM_NORM_RMS, il);
  13230. cb(cur, "attn_norm", il);
  13231. // self-attention
  13232. {
  13233. // compute Q and K and RoPE them
  13234. ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  13235. cb(Qcur, "Qcur", il);
  13236. if (model.layers[il].bq) {
  13237. Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
  13238. cb(Qcur, "Qcur", il);
  13239. }
  13240. ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
  13241. cb(Kcur, "Kcur", il);
  13242. if (model.layers[il].bk) {
  13243. Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
  13244. cb(Kcur, "Kcur", il);
  13245. }
  13246. ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
  13247. cb(Vcur, "Vcur", il);
  13248. if (model.layers[il].bv) {
  13249. Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
  13250. cb(Vcur, "Vcur", il);
  13251. }
  13252. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  13253. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  13254. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  13255. if (use_rope) {
  13256. Qcur = ggml_rope_ext(
  13257. ctx0, Qcur, inp_pos, nullptr,
  13258. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  13259. ext_factor, attn_factor, beta_fast, beta_slow
  13260. );
  13261. Kcur = ggml_rope_ext(
  13262. ctx0, Kcur, inp_pos, nullptr,
  13263. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  13264. ext_factor, attn_factor, beta_fast, beta_slow
  13265. );
  13266. }
  13267. cb(Qcur, "Qcur", il);
  13268. cb(Kcur, "Kcur", il);
  13269. cb(Vcur, "Vcur", il);
  13270. cur = build_attn(inp_attn,
  13271. model.layers[il].wo, model.layers[il].bo,
  13272. Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
  13273. cb(cur, "attn_out", il);
  13274. }
  13275. if (il == n_layer - 1 && inp_out_ids) {
  13276. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  13277. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  13278. }
  13279. ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  13280. cb(ffn_inp, "ffn_inp", il);
  13281. // feed-forward network
  13282. {
  13283. cur = build_norm(ffn_inp,
  13284. model.layers[il].ffn_norm, NULL,
  13285. LLM_NORM_RMS, il);
  13286. cb(cur, "ffn_norm", il);
  13287. cur = build_ffn(cur,
  13288. model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
  13289. model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
  13290. model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
  13291. NULL,
  13292. LLM_FFN_SILU, LLM_FFN_PAR, il);
  13293. cb(cur, "ffn_out", il);
  13294. }
  13295. cur = ggml_add(ctx0, cur, ffn_inp);
  13296. cb(cur, "ffn_out", il);
  13297. cur = build_cvec(cur, il);
  13298. cb(cur, "l_out", il);
  13299. // input for next layer
  13300. inpL = cur;
  13301. }
  13302. cur = inpL;
  13303. cur = build_norm(cur,
  13304. model.output_norm, NULL,
  13305. LLM_NORM_RMS, -1);
  13306. cb(cur, "result_norm", -1);
  13307. res->t_embd = cur;
  13308. // lm_head
  13309. cur = build_lora_mm(model.output, cur);
  13310. cb(cur, "result_output", -1);
  13311. res->t_logits = cur;
  13312. ggml_build_forward_expand(gf, cur);
  13313. }
  13314. };
  13315. struct llm_build_lfm2 : public llm_graph_context {
  13316. const llama_model & model;
  13317. llm_build_lfm2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params), model(model) {
  13318. ggml_tensor * cur = build_inp_embd(model.tok_embd);
  13319. cb(cur, "model.embed_tokens", -1);
  13320. ggml_tensor * inp_pos = build_inp_pos();
  13321. auto * inp_hybrid = build_inp_mem_hybrid();
  13322. ggml_tensor * inp_out_ids = build_inp_out_ids();
  13323. for (int il = 0; il < n_layer; ++il) {
  13324. auto * prev_cur = cur;
  13325. cur = build_norm(cur, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
  13326. cb(cur, "model.layers.{}.operator_norm", il);
  13327. cur = hparams.is_recurrent(il) ?
  13328. build_shortconv_block(cur, inp_hybrid->get_recr(), il) :
  13329. build_attn_block(cur, inp_pos, inp_hybrid->get_attn(), il) ;
  13330. if (il == n_layer - 1 && inp_out_ids) {
  13331. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  13332. prev_cur = ggml_get_rows(ctx0, prev_cur, inp_out_ids);
  13333. }
  13334. cur = ggml_add(ctx0, prev_cur, cur);
  13335. cur = ggml_add(ctx0, cur, build_feed_forward(cur, il));
  13336. }
  13337. cur = build_norm(cur, model.tok_norm, NULL, LLM_NORM_RMS, -1);
  13338. cb(cur, "model.embedding_norm", -1);
  13339. res->t_embd = cur;
13340. // lm_head weights are tied to the token embeddings
  13341. cur = build_lora_mm(model.tok_embd, cur);
  13342. cb(cur, "lm_head", -1);
  13343. res->t_logits = cur;
  13344. ggml_build_forward_expand(gf, cur);
  13345. }
  13346. ggml_tensor * build_feed_forward(ggml_tensor * cur,
  13347. int il) const {
  13348. cur = build_norm(cur, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il);
  13349. cb(cur, "model.layers.{}.ffn_norm", il);
  13350. GGML_ASSERT(!model.layers[il].ffn_up_b);
  13351. GGML_ASSERT(!model.layers[il].ffn_gate_b);
  13352. GGML_ASSERT(!model.layers[il].ffn_down_b);
  13353. cur = build_ffn(cur,
  13354. model.layers[il].ffn_up, NULL, NULL,
  13355. model.layers[il].ffn_gate, NULL, NULL,
  13356. model.layers[il].ffn_down, NULL, NULL,
  13357. NULL,
  13358. LLM_FFN_SILU, LLM_FFN_PAR, il);
  13359. cb(cur, "model.layers.{}.feed_forward.w2", il);
  13360. return cur;
  13361. }
  13362. ggml_tensor * build_attn_block(ggml_tensor * cur,
  13363. ggml_tensor * inp_pos,
  13364. llm_graph_input_attn_kv_unified * inp_attn,
  13365. int il) const {
  13366. GGML_ASSERT(hparams.n_embd_v_gqa(il) == hparams.n_embd_k_gqa(il));
  13367. auto const n_embd_head = hparams.n_embd_head_v;
  13368. auto const n_head_kv = hparams.n_head_kv(il);
  13369. auto * q = build_lora_mm(model.layers[il].wq, cur);
  13370. cb(q, "model.layers.{}.self_attn.q_proj", il);
  13371. auto * k = build_lora_mm(model.layers[il].wk, cur);
  13372. cb(k, "model.layers.{}.self_attn.k_proj", il);
  13373. auto * v = build_lora_mm(model.layers[il].wv, cur);
  13374. cb(v, "model.layers.{}.self_attn.v_proj", il);
  13375. q = ggml_reshape_3d(ctx0, q, n_embd_head, n_head, n_tokens);
  13376. k = ggml_reshape_3d(ctx0, k, n_embd_head, n_head_kv, n_tokens);
  13377. v = ggml_reshape_3d(ctx0, v, n_embd_head, n_head_kv, n_tokens);
  13378. // qk norm
  13379. q = build_norm(q, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
  13380. cb(q, "model.layers.{}.self_attn.q_layernorm", il);
  13381. k = build_norm(k, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
  13382. cb(k, "model.layers.{}.self_attn.k_layernorm", il);
  13383. // RoPE
  13384. q = ggml_rope_ext(
  13385. ctx0, q, inp_pos, nullptr,
  13386. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  13387. ext_factor, attn_factor, beta_fast, beta_slow
  13388. );
  13389. k = ggml_rope_ext(
  13390. ctx0, k, inp_pos, nullptr,
  13391. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  13392. ext_factor, attn_factor, beta_fast, beta_slow
  13393. );
  13394. cur = build_attn(inp_attn, model.layers[il].wo, NULL,
  13395. q, k, v, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  13396. cb(cur, "model.layers.{}.self_attn.out_proj", il);
  13397. return cur;
  13398. }
  13399. ggml_tensor * build_shortconv_block(ggml_tensor * cur,
  13400. llm_graph_input_rs * inp_recr,
  13401. int il) {
  13402. const auto * mctx_cur = static_cast<const llama_memory_hybrid_context *>(mctx)->get_recr();
  13403. const uint32_t kv_head = mctx_cur->get_head();
  13404. const int64_t n_seq_tokens = ubatch.n_seq_tokens;
  13405. const int64_t n_seqs = ubatch.n_seqs;
  13406. GGML_ASSERT(n_seqs != 0);
  13407. GGML_ASSERT(ubatch.equal_seqs());
  13408. GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs);
  13409. GGML_ASSERT(hparams.n_shortconv_l_cache > 1);
  13410. const uint32_t d_conv = hparams.n_shortconv_l_cache - 1;
  13411. // {n_embd, n_tokens} => {n_embd, n_seq_tokens, n_seqs}
  13412. cur = ggml_reshape_3d(ctx0, cur, cur->ne[0], n_seq_tokens, n_seqs);
  13413. auto * bcx = build_lora_mm(model.layers[il].shortconv.in_proj, cur);
  13414. cb(bcx, "model.layers.{}.conv.in_proj", il);
  13415. constexpr auto n_chunks = 3;
  13416. GGML_ASSERT(bcx->ne[0] % n_chunks == 0);
  13417. auto const chunk_size = bcx->ne[0] / n_chunks;
  13418. auto * b = ggml_view_3d(ctx0, bcx, chunk_size, bcx->ne[1], bcx->ne[2], bcx->nb[1], bcx->nb[2], 0*chunk_size*ggml_element_size(bcx));
  13419. auto * c = ggml_view_3d(ctx0, bcx, chunk_size, bcx->ne[1], bcx->ne[2], bcx->nb[1], bcx->nb[2], 1*chunk_size*ggml_element_size(bcx));
  13420. auto * x = ggml_view_3d(ctx0, bcx, chunk_size, bcx->ne[1], bcx->ne[2], bcx->nb[1], bcx->nb[2], 2*chunk_size*ggml_element_size(bcx));
  13421. auto * bx = ggml_transpose(ctx0, ggml_mul(ctx0, b, x));
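// the in_proj output is split into three equal chunks along dim 0: b, c and x;
// b gates x before the convolution (bx = (b * x)^T) while c gates the convolution
// output further below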
  13422. // read conv state
  13423. auto * conv_state = mctx_cur->get_r_l(il);
  13424. auto * conv_rs = build_rs(inp_recr, conv_state, hparams.n_embd_r(), n_seqs);
  13425. auto * conv = ggml_reshape_3d(ctx0, conv_rs, d_conv, hparams.n_embd, n_seqs);
  13426. bx = ggml_concat(ctx0, conv, bx, 0);
  13427. GGML_ASSERT(bx->ne[0] > conv->ne[0]);
13428. // the last d_conv columns are the new conv state
  13429. auto * new_conv = ggml_view_3d(ctx0, bx, conv->ne[0], bx->ne[1], bx->ne[2], bx->nb[1], bx->nb[2], (bx->ne[0] - conv->ne[0])*ggml_element_size(bx));
  13430. GGML_ASSERT(ggml_are_same_shape(conv, new_conv));
13431. // write the new conv state back into the cache
  13432. ggml_build_forward_expand(
  13433. gf,
  13434. ggml_cpy(
  13435. ctx0,
  13436. new_conv,
  13437. ggml_view_1d(
  13438. ctx0,
  13439. conv_state,
  13440. ggml_nelements(new_conv),
  13441. kv_head*d_conv*n_embd*ggml_element_size(new_conv)
  13442. )
  13443. )
  13444. );
  13445. auto * conv_kernel = model.layers[il].shortconv.conv;
  13446. auto * conv_out = ggml_ssm_conv(ctx0, bx, conv_kernel);
  13447. cb(conv_out, "model.layers.{}.conv.conv", il);
  13448. auto * y = ggml_mul(ctx0, c, conv_out);
  13449. y = build_lora_mm(model.layers[il].shortconv.out_proj, y);
  13450. cb(y, "model.layers.{}.conv.out_proj", il);
  13451. // {n_embd, n_seq_tokens, n_seqs} => {n_embd, n_tokens}
  13452. y = ggml_reshape_2d(ctx0, y, y->ne[0], n_seq_tokens * n_seqs);
  13453. return y;
  13454. }
  13455. };
  13456. llama_memory_i * llama_model::create_memory(const llama_memory_params & params, llama_cparams & cparams) const {
  13457. llama_memory_i * res;
  13458. switch (arch) {
  13459. // Models that need specific instantiation should be handled in the
  13460. // switch statement
  13461. case LLM_ARCH_BERT:
  13462. case LLM_ARCH_JINA_BERT_V2:
  13463. case LLM_ARCH_NOMIC_BERT:
  13464. case LLM_ARCH_NOMIC_BERT_MOE:
  13465. case LLM_ARCH_NEO_BERT:
  13466. case LLM_ARCH_WAVTOKENIZER_DEC:
  13467. case LLM_ARCH_DREAM:
  13468. {
  13469. res = nullptr;
  13470. } break;
  13471. // Models that need standard caching should rely on recurrent/hybrid
  13472. // checks
  13473. default:
  13474. {
  13475. if (llm_arch_is_recurrent(arch)) {
  13476. res = new llama_memory_recurrent(
  13477. *this,
  13478. nullptr,
  13479. GGML_TYPE_F32,
  13480. GGML_TYPE_F32,
  13481. cparams.offload_kqv,
  13482. std::max((uint32_t) 1, cparams.n_seq_max),
  13483. cparams.n_seq_max);
  13484. } else if (llm_arch_is_hybrid(arch)) {
  13485. const auto padding = llama_kv_cache_unified::get_padding(cparams);
  13486. cparams.n_ctx = GGML_PAD(cparams.n_ctx, padding);
  13487. res = new llama_memory_hybrid(
  13488. /* model */ *this,
  13489. /* attn_type_k */ params.type_k,
  13490. /* attn_type_v */ params.type_v,
  13491. /* attn_v_trans */ !cparams.flash_attn,
  13492. /* attn_kv_size */ cparams.n_ctx,
  13493. /* attn_n_pad */ padding,
  13494. /* attn_n_swa */ hparams.n_swa,
  13495. /* attn_swa_type */ hparams.swa_type,
  13496. /* recurrent_type_k */ GGML_TYPE_F32,
  13497. /* recurrent_type_v */ GGML_TYPE_F32,
  13498. /* recurrent_kv_size */ std::max((uint32_t) 1, cparams.n_seq_max),
  13499. /* n_seq_max */ cparams.n_seq_max,
  13500. /* offload */ cparams.offload_kqv,
  13501. /* filter_attn */ (arch == LLM_ARCH_FALCON_H1) ? [&](int32_t) { return true; } : (llama_memory_hybrid::layer_filter_cb)nullptr,
  13502. /* filter_recr */ (arch == LLM_ARCH_FALCON_H1) ? [&](int32_t) { return true; } : (llama_memory_hybrid::layer_filter_cb)nullptr);
  13503. } else {
  13504. const auto padding = llama_kv_cache_unified::get_padding(cparams);
  13505. uint32_t n_ctx_per_stream = cparams.n_ctx;
  13506. if (!cparams.kv_unified) {
  13507. n_ctx_per_stream = (cparams.n_ctx + cparams.n_seq_max - 1)/cparams.n_seq_max;
  13508. n_ctx_per_stream = GGML_PAD(n_ctx_per_stream, padding);
  13509. cparams.n_ctx = n_ctx_per_stream*cparams.n_seq_max;
  13510. } else {
  13511. n_ctx_per_stream = GGML_PAD(n_ctx_per_stream, padding);
  13512. cparams.n_ctx = n_ctx_per_stream;
  13513. }
  13514. LLAMA_LOG_DEBUG("%s: n_ctx = %u (padded)\n", __func__, cparams.n_ctx);
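// example (hypothetical numbers): with n_ctx = 8192, n_seq_max = 4 and padding = 256,
// the non-unified path yields n_ctx_per_stream = GGML_PAD((8192 + 3)/4, 256) = 2048
// and the effective cparams.n_ctx becomes 2048 * 4 = 8192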
  13515. if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
  13516. GGML_ASSERT(hparams.is_swa_any());
  13517. res = new llama_kv_cache_unified_iswa(
  13518. *this,
  13519. params.type_k,
  13520. params.type_v,
  13521. !cparams.flash_attn,
  13522. cparams.offload_kqv,
  13523. params.swa_full,
  13524. cparams.kv_unified,
  13525. n_ctx_per_stream,
  13526. cparams.n_seq_max,
  13527. cparams.n_ubatch,
  13528. padding);
  13529. } else {
  13530. GGML_ASSERT(!hparams.is_swa_any());
  13531. res = new llama_kv_cache_unified(
  13532. *this,
  13533. nullptr,
  13534. params.type_k,
  13535. params.type_v,
  13536. !cparams.flash_attn,
  13537. cparams.offload_kqv,
  13538. cparams.kv_unified,
  13539. n_ctx_per_stream,
  13540. cparams.n_seq_max,
  13541. padding,
  13542. hparams.n_swa,
  13543. hparams.swa_type);
  13544. }
  13545. }
  13546. }
  13547. }
  13548. return res;
  13549. }
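// summary of the dispatch above: the architectures listed explicitly (the BERT
// variants, NEO_BERT, WAVTOKENIZER_DEC, DREAM) get no KV memory (res = nullptr);
// purely recurrent architectures get llama_memory_recurrent; hybrid architectures
// get llama_memory_hybrid; everything else gets a unified KV cache, using the
// iSWA variant when sliding-window attention is enabled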
  13550. ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
  13551. std::unique_ptr<llm_graph_context> llm;
  13552. switch (arch) {
  13553. case LLM_ARCH_LLAMA:
  13554. {
  13555. llm = std::make_unique<llm_build_llama>(*this, params);
  13556. } break;
  13557. case LLM_ARCH_LLAMA4:
  13558. {
  13559. llm = std::make_unique<llm_build_llama_iswa>(*this, params);
  13560. } break;
  13561. case LLM_ARCH_DECI:
  13562. {
  13563. llm = std::make_unique<llm_build_deci>(*this, params);
  13564. } break;
  13565. case LLM_ARCH_BAICHUAN:
  13566. {
  13567. llm = std::make_unique<llm_build_baichuan>(*this, params);
  13568. } break;
  13569. case LLM_ARCH_FALCON:
  13570. {
  13571. llm = std::make_unique<llm_build_falcon>(*this, params);
  13572. } break;
  13573. case LLM_ARCH_GROK:
  13574. {
  13575. llm = std::make_unique<llm_build_grok>(*this, params);
  13576. } break;
  13577. case LLM_ARCH_STARCODER:
  13578. {
  13579. llm = std::make_unique<llm_build_starcoder>(*this, params);
  13580. } break;
  13581. case LLM_ARCH_REFACT:
  13582. {
  13583. llm = std::make_unique<llm_build_refact>(*this, params);
  13584. } break;
  13585. case LLM_ARCH_BERT:
  13586. case LLM_ARCH_JINA_BERT_V2:
  13587. case LLM_ARCH_NOMIC_BERT:
  13588. case LLM_ARCH_NOMIC_BERT_MOE:
  13589. {
  13590. llm = std::make_unique<llm_build_bert>(*this, params);
  13591. } break;
  13592. case LLM_ARCH_NEO_BERT:
  13593. {
  13594. llm = std::make_unique<llm_build_neo_bert>(*this, params);
  13595. } break;
  13596. case LLM_ARCH_BLOOM:
  13597. {
  13598. llm = std::make_unique<llm_build_bloom>(*this, params);
  13599. } break;
  13600. case LLM_ARCH_MPT:
  13601. {
  13602. llm = std::make_unique<llm_build_mpt>(*this, params);
  13603. } break;
  13604. case LLM_ARCH_STABLELM:
  13605. {
  13606. llm = std::make_unique<llm_build_stablelm>(*this, params);
  13607. } break;
  13608. case LLM_ARCH_QWEN:
  13609. {
  13610. llm = std::make_unique<llm_build_qwen>(*this, params);
  13611. } break;
  13612. case LLM_ARCH_QWEN2:
  13613. {
  13614. llm = std::make_unique<llm_build_qwen2>(*this, params);
  13615. } break;
  13616. case LLM_ARCH_DREAM:
  13617. {
  13618. llm = std::make_unique<llm_build_dream>(*this, params);
  13619. }
  13620. break;
  13621. case LLM_ARCH_QWEN2VL:
  13622. {
  13623. llm = std::make_unique<llm_build_qwen2vl>(*this, params);
  13624. } break;
  13625. case LLM_ARCH_QWEN2MOE:
  13626. {
  13627. llm = std::make_unique<llm_build_qwen2moe>(*this, params);
  13628. } break;
  13629. case LLM_ARCH_QWEN3:
  13630. {
  13631. llm = std::make_unique<llm_build_qwen3>(*this, params);
  13632. } break;
  13633. case LLM_ARCH_QWEN3MOE:
  13634. {
  13635. llm = std::make_unique<llm_build_qwen3moe>(*this, params);
  13636. } break;
  13637. case LLM_ARCH_PHI2:
  13638. {
  13639. llm = std::make_unique<llm_build_phi2>(*this, params);
  13640. } break;
  13641. case LLM_ARCH_PHI3:
  13642. case LLM_ARCH_PHIMOE:
  13643. {
  13644. if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
  13645. llm = std::make_unique<llm_build_phi3<true>> (*this, params);
  13646. } else {
  13647. llm = std::make_unique<llm_build_phi3<false>>(*this, params);
  13648. }
  13649. } break;
  13650. case LLM_ARCH_PLAMO:
  13651. {
  13652. llm = std::make_unique<llm_build_plamo>(*this, params);
  13653. } break;
  13654. case LLM_ARCH_PLAMO2:
  13655. {
  13656. llm = std::make_unique<llm_build_plamo2>(*this, params);
  13657. } break;
  13658. case LLM_ARCH_GPT2:
  13659. {
  13660. llm = std::make_unique<llm_build_gpt2>(*this, params);
  13661. } break;
  13662. case LLM_ARCH_CODESHELL:
  13663. {
  13664. llm = std::make_unique<llm_build_codeshell>(*this, params);
  13665. } break;
  13666. case LLM_ARCH_ORION:
  13667. {
  13668. llm = std::make_unique<llm_build_orion>(*this, params);
  13669. } break;
  13670. case LLM_ARCH_INTERNLM2:
  13671. {
  13672. llm = std::make_unique<llm_build_internlm2>(*this, params);
  13673. } break;
  13674. case LLM_ARCH_MINICPM3:
  13675. {
  13676. llm = std::make_unique<llm_build_minicpm3>(*this, params);
  13677. } break;
  13678. case LLM_ARCH_GEMMA:
  13679. {
  13680. llm = std::make_unique<llm_build_gemma>(*this, params);
  13681. } break;
  13682. case LLM_ARCH_GEMMA2:
  13683. {
  13684. llm = std::make_unique<llm_build_gemma2_iswa>(*this, params);
  13685. } break;
  13686. case LLM_ARCH_GEMMA3:
  13687. {
  13688. llm = std::make_unique<llm_build_gemma3_iswa>(*this, params);
  13689. } break;
        case LLM_ARCH_GEMMA3N:
            {
                llm = std::make_unique<llm_build_gemma3n_iswa>(*this, params);
            } break;
        case LLM_ARCH_STARCODER2:
            {
                llm = std::make_unique<llm_build_starcoder2>(*this, params);
            } break;
        case LLM_ARCH_MAMBA:
        case LLM_ARCH_MAMBA2:
            {
                llm = std::make_unique<llm_build_mamba>(*this, params);
            } break;
        case LLM_ARCH_JAMBA:
            {
                llm = std::make_unique<llm_build_jamba>(*this, params);
            } break;
        case LLM_ARCH_XVERSE:
            {
                llm = std::make_unique<llm_build_xverse>(*this, params);
            } break;
        case LLM_ARCH_COMMAND_R:
            {
                llm = std::make_unique<llm_build_command_r>(*this, params);
            } break;
        case LLM_ARCH_COHERE2:
            {
                llm = std::make_unique<llm_build_cohere2_iswa>(*this, params);
            } break;
        case LLM_ARCH_DBRX:
            {
                llm = std::make_unique<llm_build_dbrx>(*this, params);
            } break;
        case LLM_ARCH_OLMO:
            {
                llm = std::make_unique<llm_build_olmo>(*this, params);
            } break;
        case LLM_ARCH_OLMO2:
            {
                llm = std::make_unique<llm_build_olmo2>(*this, params);
            } break;
        case LLM_ARCH_OLMOE:
            {
                llm = std::make_unique<llm_build_olmoe>(*this, params);
            } break;
        case LLM_ARCH_OPENELM:
            {
                llm = std::make_unique<llm_build_openelm>(*this, params);
            } break;
        case LLM_ARCH_GPTNEOX:
            {
                llm = std::make_unique<llm_build_gptneox>(*this, params);
            } break;
        case LLM_ARCH_ARCTIC:
            {
                llm = std::make_unique<llm_build_arctic>(*this, params);
            } break;
        case LLM_ARCH_DEEPSEEK:
            {
                llm = std::make_unique<llm_build_deepseek>(*this, params);
            } break;
        case LLM_ARCH_DEEPSEEK2:
            {
                llm = std::make_unique<llm_build_deepseek2>(*this, params);
            } break;
        case LLM_ARCH_CHATGLM:
            {
                llm = std::make_unique<llm_build_chatglm>(*this, params);
            } break;
        case LLM_ARCH_GLM4:
            {
                llm = std::make_unique<llm_build_glm4>(*this, params);
            } break;
        case LLM_ARCH_BITNET:
            {
                llm = std::make_unique<llm_build_bitnet>(*this, params);
            } break;
        case LLM_ARCH_T5:
            {
                switch (params.gtype) {
                    case LLM_GRAPH_TYPE_ENCODER:
                        llm = std::make_unique<llm_build_t5_enc>(*this, params);
                        break;
                    case LLM_GRAPH_TYPE_DEFAULT:
                    case LLM_GRAPH_TYPE_DECODER:
                        llm = std::make_unique<llm_build_t5_dec>(*this, params);
                        break;
                    default:
                        GGML_ABORT("invalid graph type");
                };
            } break;
        case LLM_ARCH_T5ENCODER:
            {
                llm = std::make_unique<llm_build_t5_enc>(*this, params);
            }
            break;
        case LLM_ARCH_JAIS:
            {
                llm = std::make_unique<llm_build_jais>(*this, params);
            } break;
        case LLM_ARCH_NEMOTRON:
            {
                llm = std::make_unique<llm_build_nemotron>(*this, params);
            } break;
        case LLM_ARCH_EXAONE:
            {
                llm = std::make_unique<llm_build_exaone>(*this, params);
            } break;
        case LLM_ARCH_EXAONE4:
            {
                if (hparams.swa_type == LLAMA_SWA_TYPE_STANDARD) {
                    llm = std::make_unique<llm_build_exaone4<true>>(*this, params);
                } else {
                    llm = std::make_unique<llm_build_exaone4<false>>(*this, params);
                }
            } break;
        case LLM_ARCH_RWKV6:
            {
                llm = std::make_unique<llm_build_rwkv6>(*this, params);
            } break;
        case LLM_ARCH_RWKV6QWEN2:
            {
                llm = std::make_unique<llm_build_rwkv6qwen2>(*this, params);
            } break;
        case LLM_ARCH_RWKV7:
            {
                llm = std::make_unique<llm_build_rwkv7>(*this, params);
            } break;
        case LLM_ARCH_ARWKV7:
            {
                llm = std::make_unique<llm_build_arwkv7>(*this, params);
            } break;
        case LLM_ARCH_GRANITE:
        case LLM_ARCH_GRANITE_MOE:
        case LLM_ARCH_MINICPM:
            {
                llm = std::make_unique<llm_build_granite>(*this, params);
            } break;
        case LLM_ARCH_GRANITE_HYBRID:
            {
                llm = std::make_unique<llm_build_granite_hybrid>(*this, params);
            } break;
        case LLM_ARCH_CHAMELEON:
            {
                llm = std::make_unique<llm_build_chameleon>(*this, params);
            } break;
        case LLM_ARCH_WAVTOKENIZER_DEC:
            {
                llm = std::make_unique<llm_build_wavtokenizer_dec>(*this, params);
            } break;
        case LLM_ARCH_PLM:
            {
                llm = std::make_unique<llm_build_plm>(*this, params);
            } break;
        case LLM_ARCH_BAILINGMOE:
            {
                llm = std::make_unique<llm_build_bailingmoe>(*this, params);
            } break;
        case LLM_ARCH_DOTS1:
            {
                llm = std::make_unique<llm_build_dots1>(*this, params);
            } break;
        case LLM_ARCH_ARCEE:
            {
                llm = std::make_unique<llm_build_arcee>(*this, params);
            } break;
        case LLM_ARCH_ERNIE4_5:
            {
                llm = std::make_unique<llm_build_ernie4_5>(*this, params);
            } break;
        case LLM_ARCH_ERNIE4_5_MOE:
            {
                llm = std::make_unique<llm_build_ernie4_5_moe>(*this, params);
            } break;
        case LLM_ARCH_HUNYUAN_MOE:
            {
                llm = std::make_unique<llm_build_hunyuan_moe>(*this, params);
            } break;
        case LLM_ARCH_SMOLLM3:
            {
                llm = std::make_unique<llm_build_smollm3>(*this, params);
            } break;
        case LLM_ARCH_FALCON_H1:
            {
                llm = std::make_unique<llm_build_falcon_h1>(*this, params);
            } break;
        case LLM_ARCH_LFM2:
            {
                llm = std::make_unique<llm_build_lfm2>(*this, params);
            } break;
        default:
            GGML_ABORT("fatal error");
    }

    // add on pooling layer
    llm->build_pooling(cls, cls_b, cls_out, cls_out_b);

    return llm->res->get_gf();
}

//
// interface implementation
//

llama_model_params llama_model_default_params() {
    llama_model_params result = {
        /*.devices                     =*/ nullptr,
        /*.tensor_buft_overrides       =*/ nullptr,
        /*.n_gpu_layers                =*/ 0,
        /*.split_mode                  =*/ LLAMA_SPLIT_MODE_LAYER,
        /*.main_gpu                    =*/ 0,
        /*.tensor_split                =*/ nullptr,
        /*.progress_callback           =*/ nullptr,
        /*.progress_callback_user_data =*/ nullptr,
        /*.kv_overrides                =*/ nullptr,
        /*.vocab_only                  =*/ false,
        /*.use_mmap                    =*/ true,
        /*.use_mlock                   =*/ false,
        /*.check_tensors               =*/ false,
    };

#ifdef GGML_USE_METAL
    // note: we usually have plenty of VRAM, so by default offload all layers to the GPU
    result.n_gpu_layers = 999;
#endif

    return result;
}
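
// example (usage sketch): a caller would typically start from these defaults and
// override selected fields before loading a model; this assumes the
// llama_model_load_from_file() loader declared in llama.h:
//
//     llama_model_params mparams = llama_model_default_params();
//     mparams.n_gpu_layers = 32; // offload 32 layers when a GPU backend is available
//     llama_model * model = llama_model_load_from_file("model.gguf", mparams);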

const llama_vocab * llama_model_get_vocab(const llama_model * model) {
    return &model->vocab;
}

void llama_free_model(llama_model * model) {
    llama_model_free(model);
}

void llama_model_free(llama_model * model) {
    delete model;
}

int32_t llama_model_n_ctx_train(const llama_model * model) {
    return model->hparams.n_ctx_train;
}

int32_t llama_model_n_embd(const llama_model * model) {
    return model->hparams.n_embd;
}

int32_t llama_model_n_layer(const llama_model * model) {
    return model->hparams.n_layer;
}

int32_t llama_model_n_head(const llama_model * model) {
    return model->hparams.n_head();
}

int32_t llama_model_n_head_kv(const llama_model * model) {
    return model->hparams.n_head_kv();
}

int32_t llama_model_n_swa(const llama_model * model) {
    return model->hparams.n_swa;
}

uint32_t llama_model_n_cls_out(const struct llama_model * model) {
    return model->hparams.n_cls_out;
}

const char * llama_model_cls_label(const struct llama_model * model, uint32_t i) {
    if (i < model->classifier_labels.size()) {
        return model->classifier_labels[i].c_str();
    }

    return nullptr;
}
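
// example (usage sketch): enumerate the classifier output labels of a
// classification model using only the accessors defined above:
//
//     for (uint32_t i = 0; i < llama_model_n_cls_out(model); ++i) {
//         const char * label = llama_model_cls_label(model, i); // nullptr when out of range
//     }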

// deprecated
int32_t llama_n_ctx_train(const llama_model * model) {
    return llama_model_n_ctx_train(model);
}

// deprecated
int32_t llama_n_embd(const llama_model * model) {
    return llama_model_n_embd(model);
}

// deprecated
int32_t llama_n_layer(const llama_model * model) {
    return llama_model_n_layer(model);
}

// deprecated
int32_t llama_n_head(const llama_model * model) {
    return llama_model_n_head(model);
}

llama_rope_type llama_model_rope_type(const llama_model * model) {
    switch (model->arch) {
        // these models do not use RoPE
        case LLM_ARCH_GPT2:
        case LLM_ARCH_GPTJ:
        case LLM_ARCH_MPT:
        case LLM_ARCH_REFACT:
        case LLM_ARCH_BLOOM:
        case LLM_ARCH_MAMBA:
        case LLM_ARCH_MAMBA2:
        case LLM_ARCH_JAMBA:
        case LLM_ARCH_JINA_BERT_V2:
        case LLM_ARCH_T5:
        case LLM_ARCH_T5ENCODER:
        case LLM_ARCH_JAIS:
        case LLM_ARCH_RWKV6:
        case LLM_ARCH_RWKV6QWEN2:
        case LLM_ARCH_RWKV7:
        case LLM_ARCH_ARWKV7:
        case LLM_ARCH_WAVTOKENIZER_DEC:
            return LLAMA_ROPE_TYPE_NONE;

        // use what we call a normal RoPE, operating on pairs of consecutive head values
        case LLM_ARCH_LLAMA:
        case LLM_ARCH_LLAMA4:
        case LLM_ARCH_DECI:
        case LLM_ARCH_BAICHUAN:
        case LLM_ARCH_STARCODER:
        case LLM_ARCH_INTERNLM2:
        case LLM_ARCH_MINICPM:
        case LLM_ARCH_XVERSE:
        case LLM_ARCH_COMMAND_R:
        case LLM_ARCH_COHERE2:
        case LLM_ARCH_OLMO:
        case LLM_ARCH_ARCTIC:
        case LLM_ARCH_DEEPSEEK:
        case LLM_ARCH_DEEPSEEK2:
        case LLM_ARCH_PLM:
        case LLM_ARCH_CHATGLM:
        case LLM_ARCH_GLM4:
        case LLM_ARCH_GRANITE:
        case LLM_ARCH_GRANITE_MOE:
        case LLM_ARCH_GRANITE_HYBRID:
        case LLM_ARCH_CHAMELEON:
        case LLM_ARCH_BAILINGMOE:
        case LLM_ARCH_NEO_BERT:
        case LLM_ARCH_SMOLLM3:
        case LLM_ARCH_ARCEE:
        case LLM_ARCH_ERNIE4_5:
        case LLM_ARCH_ERNIE4_5_MOE:
            return LLAMA_ROPE_TYPE_NORM;

        // the pairs of head values are offset by n_rot/2
        case LLM_ARCH_FALCON:
        case LLM_ARCH_FALCON_H1:
        case LLM_ARCH_GROK:
        case LLM_ARCH_DBRX:
        case LLM_ARCH_BERT:
        case LLM_ARCH_NOMIC_BERT:
        case LLM_ARCH_NOMIC_BERT_MOE:
        case LLM_ARCH_STABLELM:
        case LLM_ARCH_BITNET:
        case LLM_ARCH_QWEN:
        case LLM_ARCH_QWEN2:
        case LLM_ARCH_DREAM:
        case LLM_ARCH_QWEN2MOE:
        case LLM_ARCH_QWEN3:
        case LLM_ARCH_QWEN3MOE:
        case LLM_ARCH_OLMO2:
        case LLM_ARCH_OLMOE:
        case LLM_ARCH_PHI2:
        case LLM_ARCH_PHI3:
        case LLM_ARCH_PHIMOE:
        case LLM_ARCH_PLAMO:
        case LLM_ARCH_PLAMO2:
        case LLM_ARCH_GEMMA:
        case LLM_ARCH_GEMMA2:
        case LLM_ARCH_GEMMA3:
        case LLM_ARCH_GEMMA3N:
        case LLM_ARCH_STARCODER2:
        case LLM_ARCH_OPENELM:
        case LLM_ARCH_GPTNEOX:
        case LLM_ARCH_CODESHELL:
        case LLM_ARCH_ORION:
        case LLM_ARCH_NEMOTRON:
        case LLM_ARCH_EXAONE:
        case LLM_ARCH_EXAONE4:
        case LLM_ARCH_MINICPM3:
        case LLM_ARCH_DOTS1:
        case LLM_ARCH_HUNYUAN_MOE:
        case LLM_ARCH_LFM2:
            return LLAMA_ROPE_TYPE_NEOX;

        case LLM_ARCH_QWEN2VL:
            return LLAMA_ROPE_TYPE_MROPE;

        // all model arches should be listed explicitly here
        case LLM_ARCH_UNKNOWN:
            GGML_ABORT("unknown architecture");
    }

    return LLAMA_ROPE_TYPE_NONE;
}

float llama_model_rope_freq_scale_train(const llama_model * model) {
    return model->hparams.rope_freq_scale_train;
}

int32_t llama_model_meta_val_str(const llama_model * model, const char * key, char * buf, size_t buf_size) {
    const auto & it = model->gguf_kv.find(key);
    if (it == model->gguf_kv.end()) {
        if (buf_size > 0) {
            buf[0] = '\0';
        }
        return -1;
    }

    return snprintf(buf, buf_size, "%s", it->second.c_str());
}

int32_t llama_model_meta_count(const llama_model * model) {
    return (int)model->gguf_kv.size();
}

int32_t llama_model_meta_key_by_index(const llama_model * model, int i, char * buf, size_t buf_size) {
    if (i < 0 || i >= (int)model->gguf_kv.size()) {
        if (buf_size > 0) {
            buf[0] = '\0';
        }
        return -1;
    }

    auto it = model->gguf_kv.begin();
    std::advance(it, i);

    return snprintf(buf, buf_size, "%s", it->first.c_str());
}

int32_t llama_model_meta_val_str_by_index(const llama_model * model, int32_t i, char * buf, size_t buf_size) {
    if (i < 0 || i >= (int)model->gguf_kv.size()) {
        if (buf_size > 0) {
            buf[0] = '\0';
        }
        return -1;
    }

    auto it = model->gguf_kv.begin();
    std::advance(it, i);

    return snprintf(buf, buf_size, "%s", it->second.c_str());
}
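
// example (usage sketch): enumerate all GGUF metadata key/value pairs with the
// indexed accessors above; the functions return -1 and write an empty string
// when the index is out of range:
//
//     char key[256], val[256];
//     for (int32_t i = 0; i < llama_model_meta_count(model); ++i) {
//         llama_model_meta_key_by_index    (model, i, key, sizeof(key));
//         llama_model_meta_val_str_by_index(model, i, val, sizeof(val));
//     }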

int32_t llama_model_desc(const llama_model * model, char * buf, size_t buf_size) {
    return snprintf(buf, buf_size, "%s", model->desc().c_str());
}

uint64_t llama_model_size(const llama_model * model) {
    return model->size();
}

const char * llama_model_chat_template(const llama_model * model, const char * name) {
    const auto key = name ? LLM_KV(model->arch, name)(LLM_KV_TOKENIZER_CHAT_TEMPLATE)
        : LLM_KV(model->arch)(LLM_KV_TOKENIZER_CHAT_TEMPLATE);
    const auto & it = model->gguf_kv.find(key);
    if (it == model->gguf_kv.end()) {
        // one-off fix for very popular models (so we are not flooded with issues)
        // do not extend this list unless absolutely necessary
        // Mistral-Small-2503 does not have built-in chat template
        llama_vocab_pre_type pre_type = model->vocab.get_pre_type();
        if (!name && pre_type == LLAMA_VOCAB_PRE_TYPE_TEKKEN && model->layers.size() == 40) {
            return "mistral-v7-tekken";
        }

        return nullptr;
    }

    return it->second.c_str();
}
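
// example (usage sketch): fetch the model's default chat template; callers should
// handle the nullptr case, since not every GGUF file ships a template:
//
//     const char * tmpl = llama_model_chat_template(model, /*name =*/ nullptr);
//     if (tmpl == nullptr) {
//         // fall back to an application-provided template
//     }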

uint64_t llama_model_n_params(const llama_model * model) {
    return model->n_elements();
}

bool llama_model_has_encoder(const llama_model * model) {
    switch (model->arch) {
        case LLM_ARCH_T5:        return true;
        case LLM_ARCH_T5ENCODER: return true;
        default:                 return false;
    }
}

bool llama_model_has_decoder(const llama_model * model) {
    switch (model->arch) {
        case LLM_ARCH_T5ENCODER: return false;
        default:                 return true;
    }
}

llama_token llama_model_decoder_start_token(const llama_model * model) {
    return model->hparams.dec_start_token_id;
}
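
// example (usage sketch): for encoder-decoder architectures such as T5, callers
// can use these queries to decide whether an encoder pass is needed and which
// token should seed the decoder:
//
//     if (llama_model_has_encoder(model)) {
//         const llama_token dec_start = llama_model_decoder_start_token(model);
//         // run the encoder pass, then begin decoding from dec_start
//     }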

bool llama_model_is_recurrent(const llama_model * model) {
    return llm_arch_is_recurrent(model->arch);
}

const std::vector<std::pair<std::string, ggml_tensor *>> & llama_internal_get_tensor_map(const llama_model * model) {
    return model->tensors_by_name;
}