| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702
70370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232123312341235123612371238123912401241124212431244124512461247124812491250125112521253125412551256125712581259126012611262126312641265126612671268126912701271127212731274127512761
27712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501150215031504150515061507150815091510151115121513151415151516151715181519152015211522152315241525152615271528152915301531153215331534153515361537153815391540154115421543154415451546154715481549155015511552155315541555155615571558155915601561156215631564156515661567156815691570157115721573157415751576157715781579158015811582158315841585158615871588158915901591159215931594159515961597159815991600160116021603160416051606160716081609161016111612161316141615161616171618161916201621162216231624162516261627162816291630163116321633163416351636163716381639164016411642164316441645164616471648164916501651165216531654165516561657165816591660166116621663166416651666166716681669167016711672167316741675167616771678167916801681168216831684168516861687168816891690169116921693169416951696169716981699170017011702170317041705170617071708170917101711171217131714171517161717171817191720172117221723172417251726172717281729173017311732173317341735173617371738173917401741174217431744174517461747174817491750175117521753175417551756175717581759176017611762176317641765176617671768176917701771177217731774177517761
77717781779178017811782178317841785178617871788178917901791179217931794179517961797179817991800180118021803180418051806180718081809181018111812181318141815181618171818181918201821182218231824182518261827182818291830183118321833183418351836183718381839184018411842184318441845184618471848184918501851185218531854185518561857185818591860186118621863186418651866186718681869187018711872187318741875187618771878187918801881188218831884188518861887188818891890189118921893189418951896189718981899190019011902190319041905190619071908190919101911191219131914191519161917191819191920192119221923192419251926192719281929193019311932193319341935193619371938193919401941194219431944194519461947194819491950195119521953195419551956195719581959196019611962196319641965196619671968196919701971197219731974197519761977197819791980198119821983198419851986198719881989199019911992199319941995199619971998199920002001200220032004200520062007200820092010201120122013201420152016201720182019202020212022202320242025202620272028202920302031203220332034203520362037203820392040204120422043204420452046204720482049205020512052205320542055205620572058205920602061206220632064206520662067206820692070207120722073207420752076207720782079208020812082208320842085208620872088208920902091209220932094209520962097209820992100210121022103210421052106210721082109211021112112211321142115211621172118211921202121212221232124212521262127212821292130213121322133213421352136213721382139214021412142214321442145214621472148214921502151215221532154215521562157215821592160216121622163216421652166216721682169217021712172217321742175217621772178217921802181218221832184218521862187218821892190219121922193219421952196219721982199220022012202220322042205220622072208220922102211221222132214221522162217221822192220222122222223222422252226222722282229223022312232223322342235223622372238223922402241224222432244224522462247224822492250225122522253225422552256225722582259226022612262226322642265226622672268226922702271227222732274227522762
27722782279228022812282228322842285228622872288228922902291229222932294229522962297229822992300230123022303230423052306230723082309231023112312231323142315231623172318231923202321232223232324232523262327232823292330233123322333233423352336233723382339234023412342234323442345234623472348234923502351235223532354235523562357235823592360236123622363236423652366236723682369237023712372237323742375237623772378237923802381238223832384238523862387238823892390239123922393239423952396239723982399240024012402240324042405240624072408240924102411241224132414241524162417241824192420242124222423242424252426242724282429243024312432243324342435243624372438243924402441244224432444244524462447244824492450245124522453245424552456245724582459246024612462246324642465246624672468246924702471247224732474247524762477247824792480248124822483248424852486248724882489249024912492249324942495249624972498249925002501250225032504250525062507250825092510251125122513251425152516251725182519252025212522252325242525252625272528252925302531253225332534253525362537253825392540254125422543254425452546254725482549255025512552255325542555255625572558255925602561256225632564256525662567256825692570257125722573257425752576257725782579258025812582258325842585258625872588258925902591259225932594259525962597259825992600260126022603260426052606260726082609261026112612261326142615261626172618261926202621262226232624262526262627262826292630263126322633263426352636263726382639264026412642264326442645264626472648264926502651265226532654265526562657265826592660266126622663266426652666266726682669267026712672267326742675267626772678267926802681268226832684268526862687268826892690269126922693269426952696269726982699270027012702270327042705270627072708270927102711271227132714271527162717271827192720272127222723272427252726272727282729273027312732273327342735273627372738273927402741274227432744274527462747274827492750275127522753275427552756275727582759276027612762276327642765276627672768276927702771277227732774277527762
77727782779278027812782278327842785278627872788278927902791279227932794279527962797279827992800280128022803280428052806280728082809281028112812281328142815281628172818281928202821282228232824282528262827282828292830283128322833283428352836283728382839284028412842284328442845284628472848284928502851285228532854285528562857285828592860286128622863286428652866286728682869287028712872287328742875287628772878287928802881288228832884288528862887288828892890289128922893289428952896289728982899290029012902290329042905290629072908290929102911291229132914291529162917291829192920292129222923292429252926292729282929293029312932293329342935293629372938293929402941294229432944294529462947294829492950295129522953295429552956295729582959296029612962296329642965296629672968296929702971297229732974297529762977297829792980298129822983298429852986298729882989299029912992299329942995299629972998299930003001300230033004300530063007300830093010301130123013301430153016301730183019302030213022302330243025302630273028302930303031303230333034303530363037303830393040304130423043304430453046304730483049305030513052305330543055305630573058305930603061306230633064306530663067306830693070307130723073307430753076307730783079308030813082308330843085308630873088308930903091309230933094309530963097309830993100310131023103310431053106310731083109311031113112311331143115311631173118311931203121312231233124312531263127312831293130313131323133313431353136313731383139314031413142314331443145314631473148314931503151315231533154315531563157315831593160316131623163316431653166316731683169317031713172317331743175317631773178317931803181318231833184318531863187318831893190319131923193319431953196319731983199320032013202320332043205320632073208320932103211321232133214321532163217321832193220322132223223322432253226322732283229323032313232323332343235323632373238323932403241324232433244324532463247324832493250325132523253325432553256325732583259326032613262326332643265326632673268326932703271327232733274327532763
27732783279328032813282328332843285328632873288328932903291329232933294329532963297329832993300330133023303330433053306330733083309331033113312331333143315331633173318331933203321332233233324332533263327332833293330333133323333333433353336333733383339334033413342334333443345334633473348334933503351335233533354335533563357335833593360336133623363336433653366336733683369337033713372337333743375337633773378337933803381338233833384338533863387338833893390339133923393339433953396339733983399340034013402340334043405340634073408340934103411341234133414341534163417341834193420342134223423342434253426342734283429343034313432343334343435343634373438343934403441344234433444344534463447344834493450345134523453345434553456345734583459346034613462346334643465346634673468346934703471347234733474347534763477347834793480348134823483348434853486348734883489349034913492349334943495349634973498349935003501350235033504350535063507350835093510351135123513351435153516351735183519352035213522352335243525352635273528352935303531353235333534353535363537353835393540354135423543354435453546354735483549355035513552355335543555355635573558355935603561356235633564356535663567356835693570357135723573357435753576357735783579358035813582358335843585358635873588358935903591359235933594359535963597359835993600360136023603360436053606360736083609361036113612361336143615361636173618361936203621362236233624362536263627362836293630363136323633363436353636363736383639364036413642364336443645364636473648364936503651365236533654365536563657365836593660366136623663366436653666366736683669367036713672367336743675367636773678367936803681368236833684368536863687368836893690369136923693369436953696369736983699370037013702370337043705370637073708370937103711371237133714371537163717371837193720372137223723372437253726372737283729373037313732373337343735373637373738373937403741374237433744374537463747374837493750375137523753375437553756375737583759376037613762376337643765376637673768376937703771377237733774377537763
77737783779378037813782378337843785378637873788378937903791379237933794379537963797379837993800380138023803380438053806380738083809381038113812381338143815381638173818381938203821382238233824382538263827382838293830383138323833383438353836383738383839384038413842384338443845384638473848384938503851385238533854385538563857385838593860386138623863386438653866386738683869387038713872387338743875387638773878387938803881388238833884388538863887388838893890389138923893389438953896389738983899390039013902390339043905390639073908390939103911391239133914391539163917391839193920392139223923392439253926392739283929393039313932393339343935393639373938393939403941394239433944394539463947394839493950395139523953395439553956395739583959396039613962396339643965396639673968396939703971397239733974397539763977397839793980398139823983398439853986398739883989399039913992399339943995399639973998399940004001400240034004400540064007400840094010401140124013401440154016401740184019402040214022402340244025402640274028402940304031403240334034403540364037403840394040404140424043404440454046404740484049405040514052405340544055405640574058405940604061406240634064406540664067406840694070407140724073407440754076407740784079408040814082408340844085408640874088408940904091409240934094409540964097409840994100410141024103410441054106410741084109411041114112411341144115411641174118411941204121412241234124412541264127412841294130413141324133413441354136413741384139414041414142414341444145414641474148414941504151415241534154415541564157415841594160416141624163416441654166416741684169417041714172417341744175417641774178417941804181418241834184418541864187418841894190419141924193419441954196419741984199420042014202420342044205420642074208420942104211421242134214421542164217421842194220422142224223422442254226422742284229423042314232423342344235423642374238423942404241424242434244424542464247424842494250425142524253425442554256425742584259426042614262426342644265426642674268426942704271427242734274427542764
27742784279428042814282428342844285428642874288428942904291429242934294429542964297429842994300430143024303430443054306430743084309431043114312431343144315431643174318431943204321432243234324432543264327432843294330433143324333433443354336433743384339434043414342434343444345434643474348434943504351435243534354435543564357435843594360436143624363436443654366436743684369437043714372437343744375437643774378437943804381438243834384438543864387438843894390439143924393439443954396439743984399440044014402440344044405440644074408440944104411441244134414441544164417441844194420442144224423442444254426442744284429443044314432443344344435443644374438443944404441444244434444444544464447444844494450445144524453445444554456445744584459446044614462446344644465446644674468446944704471447244734474447544764477447844794480448144824483448444854486448744884489449044914492449344944495449644974498449945004501450245034504450545064507450845094510451145124513451445154516451745184519452045214522452345244525452645274528452945304531453245334534453545364537453845394540454145424543454445454546454745484549455045514552455345544555455645574558455945604561456245634564456545664567456845694570457145724573457445754576457745784579458045814582458345844585458645874588458945904591459245934594459545964597459845994600460146024603460446054606460746084609461046114612461346144615461646174618461946204621462246234624462546264627462846294630463146324633463446354636463746384639464046414642464346444645464646474648464946504651465246534654465546564657465846594660466146624663466446654666466746684669467046714672467346744675467646774678467946804681468246834684468546864687468846894690469146924693469446954696469746984699470047014702470347044705470647074708470947104711471247134714471547164717471847194720472147224723472447254726472747284729473047314732473347344735473647374738473947404741474247434744474547464747474847494750475147524753475447554756475747584759476047614762476347644765476647674768476947704771477247734774477547764
77747784779478047814782478347844785478647874788478947904791479247934794479547964797479847994800480148024803480448054806480748084809481048114812481348144815481648174818481948204821482248234824482548264827482848294830483148324833483448354836483748384839484048414842484348444845484648474848484948504851485248534854485548564857485848594860486148624863486448654866486748684869487048714872487348744875487648774878487948804881488248834884488548864887488848894890489148924893489448954896489748984899490049014902490349044905490649074908490949104911491249134914491549164917491849194920492149224923492449254926492749284929493049314932493349344935493649374938493949404941494249434944494549464947494849494950495149524953495449554956495749584959496049614962496349644965496649674968496949704971497249734974497549764977497849794980498149824983498449854986498749884989499049914992499349944995499649974998499950005001500250035004500550065007500850095010501150125013501450155016501750185019502050215022502350245025502650275028502950305031503250335034503550365037503850395040504150425043504450455046504750485049505050515052505350545055505650575058505950605061506250635064506550665067506850695070507150725073507450755076507750785079508050815082508350845085508650875088508950905091509250935094509550965097509850995100510151025103510451055106510751085109511051115112511351145115511651175118511951205121512251235124512551265127512851295130513151325133513451355136513751385139514051415142514351445145514651475148514951505151515251535154515551565157515851595160516151625163516451655166516751685169517051715172517351745175517651775178517951805181518251835184518551865187518851895190519151925193519451955196519751985199520052015202520352045205520652075208520952105211521252135214521552165217521852195220522152225223522452255226522752285229523052315232523352345235523652375238523952405241524252435244524552465247524852495250525152525253525452555256525752585259526052615262526352645265526652675268526952705271527252735274527552765
27752785279528052815282528352845285528652875288528952905291529252935294529552965297529852995300530153025303530453055306530753085309531053115312531353145315531653175318531953205321532253235324532553265327532853295330533153325333533453355336533753385339534053415342534353445345534653475348534953505351535253535354535553565357535853595360536153625363536453655366536753685369537053715372537353745375537653775378537953805381538253835384538553865387538853895390539153925393539453955396539753985399540054015402540354045405540654075408540954105411541254135414541554165417541854195420542154225423542454255426542754285429543054315432543354345435543654375438543954405441544254435444544554465447544854495450545154525453545454555456545754585459546054615462546354645465546654675468546954705471547254735474547554765477547854795480548154825483548454855486548754885489549054915492549354945495549654975498549955005501550255035504550555065507550855095510551155125513551455155516551755185519552055215522552355245525552655275528552955305531553255335534553555365537553855395540554155425543554455455546554755485549555055515552555355545555555655575558555955605561556255635564556555665567556855695570557155725573557455755576557755785579558055815582558355845585558655875588558955905591559255935594559555965597559855995600560156025603560456055606560756085609561056115612561356145615561656175618561956205621562256235624562556265627562856295630563156325633563456355636563756385639564056415642564356445645564656475648564956505651565256535654565556565657565856595660566156625663566456655666566756685669567056715672567356745675567656775678567956805681568256835684568556865687568856895690569156925693569456955696569756985699570057015702570357045705570657075708570957105711571257135714571557165717571857195720572157225723572457255726572757285729573057315732573357345735573657375738573957405741574257435744574557465747574857495750575157525753575457555756575757585759576057615762576357645765576657675768576957705771577257735774577557765
77757785779578057815782578357845785578657875788578957905791579257935794579557965797579857995800580158025803580458055806580758085809581058115812581358145815581658175818581958205821582258235824582558265827582858295830583158325833583458355836583758385839584058415842584358445845584658475848584958505851585258535854585558565857585858595860586158625863586458655866586758685869587058715872587358745875587658775878587958805881588258835884588558865887588858895890589158925893589458955896589758985899590059015902590359045905590659075908590959105911591259135914591559165917591859195920592159225923592459255926592759285929593059315932593359345935593659375938593959405941594259435944594559465947594859495950595159525953595459555956595759585959596059615962596359645965596659675968596959705971597259735974597559765977597859795980598159825983598459855986598759885989599059915992599359945995599659975998599960006001600260036004600560066007600860096010601160126013601460156016601760186019602060216022602360246025602660276028602960306031603260336034603560366037603860396040604160426043604460456046604760486049605060516052605360546055605660576058605960606061606260636064606560666067606860696070607160726073607460756076607760786079608060816082608360846085608660876088608960906091609260936094609560966097609860996100610161026103610461056106610761086109611061116112611361146115611661176118611961206121612261236124612561266127612861296130613161326133613461356136613761386139614061416142614361446145614661476148614961506151615261536154615561566157615861596160616161626163616461656166616761686169617061716172617361746175617661776178617961806181618261836184618561866187618861896190619161926193619461956196619761986199620062016202620362046205620662076208620962106211621262136214621562166217621862196220622162226223622462256226622762286229623062316232623362346235623662376238623962406241624262436244624562466247624862496250625162526253625462556256625762586259626062616262626362646265626662676268626962706271627262736274627562766
27762786279628062816282628362846285628662876288628962906291629262936294629562966297629862996300630163026303630463056306630763086309631063116312631363146315631663176318631963206321632263236324632563266327632863296330633163326333633463356336633763386339634063416342634363446345634663476348634963506351635263536354635563566357635863596360636163626363636463656366636763686369637063716372637363746375637663776378637963806381638263836384638563866387638863896390639163926393639463956396639763986399640064016402640364046405640664076408640964106411641264136414641564166417641864196420642164226423642464256426642764286429643064316432643364346435643664376438643964406441644264436444644564466447644864496450645164526453645464556456645764586459646064616462646364646465646664676468646964706471647264736474647564766477647864796480648164826483648464856486648764886489649064916492649364946495649664976498649965006501650265036504650565066507650865096510651165126513651465156516651765186519652065216522652365246525652665276528652965306531653265336534653565366537653865396540654165426543654465456546654765486549655065516552655365546555655665576558655965606561656265636564656565666567656865696570657165726573657465756576657765786579658065816582658365846585658665876588658965906591659265936594659565966597659865996600660166026603660466056606660766086609661066116612661366146615661666176618661966206621662266236624662566266627662866296630663166326633663466356636663766386639664066416642664366446645664666476648664966506651665266536654665566566657665866596660666166626663666466656666666766686669667066716672667366746675667666776678667966806681668266836684668566866687668866896690669166926693669466956696669766986699670067016702670367046705670667076708670967106711671267136714671567166717671867196720672167226723672467256726672767286729673067316732673367346735673667376738673967406741674267436744674567466747674867496750675167526753675467556756675767586759676067616762676367646765676667676768676967706771677267736774677567766
77767786779678067816782678367846785678667876788678967906791679267936794679567966797679867996800680168026803680468056806680768086809681068116812681368146815681668176818681968206821682268236824682568266827682868296830683168326833683468356836683768386839684068416842684368446845684668476848684968506851685268536854685568566857685868596860686168626863686468656866686768686869687068716872687368746875687668776878687968806881688268836884688568866887688868896890689168926893689468956896689768986899690069016902690369046905690669076908690969106911691269136914691569166917691869196920692169226923692469256926692769286929693069316932693369346935693669376938693969406941694269436944694569466947694869496950695169526953695469556956695769586959696069616962696369646965696669676968696969706971697269736974697569766977697869796980698169826983698469856986698769886989699069916992699369946995699669976998699970007001700270037004700570067007700870097010701170127013701470157016701770187019702070217022702370247025702670277028702970307031703270337034703570367037703870397040704170427043704470457046704770487049705070517052705370547055705670577058705970607061706270637064706570667067706870697070707170727073707470757076707770787079708070817082708370847085708670877088708970907091709270937094709570967097709870997100710171027103710471057106710771087109711071117112711371147115711671177118711971207121712271237124712571267127712871297130713171327133713471357136713771387139714071417142714371447145714671477148714971507151715271537154715571567157715871597160716171627163716471657166716771687169717071717172717371747175717671777178717971807181718271837184718571867187718871897190719171927193719471957196719771987199720072017202720372047205720672077208720972107211721272137214721572167217721872197220722172227223722472257226722772287229723072317232723372347235723672377238723972407241724272437244724572467247724872497250725172527253725472557256725772587259726072617262726372647265726672677268726972707271727272737274727572767
27772787279728072817282728372847285728672877288728972907291729272937294729572967297729872997300730173027303730473057306730773087309731073117312731373147315731673177318731973207321732273237324732573267327732873297330733173327333733473357336733773387339734073417342734373447345734673477348734973507351735273537354735573567357735873597360736173627363736473657366736773687369737073717372737373747375737673777378737973807381738273837384738573867387738873897390739173927393739473957396739773987399740074017402740374047405740674077408740974107411741274137414741574167417741874197420742174227423742474257426742774287429743074317432743374347435743674377438743974407441744274437444744574467447744874497450745174527453745474557456745774587459746074617462746374647465746674677468746974707471747274737474747574767477747874797480748174827483748474857486748774887489749074917492749374947495749674977498749975007501750275037504750575067507750875097510751175127513751475157516751775187519752075217522752375247525752675277528752975307531753275337534753575367537753875397540754175427543754475457546754775487549755075517552755375547555755675577558755975607561756275637564756575667567756875697570757175727573757475757576757775787579758075817582758375847585758675877588758975907591759275937594759575967597759875997600760176027603760476057606760776087609761076117612761376147615761676177618761976207621762276237624762576267627762876297630763176327633763476357636763776387639764076417642764376447645764676477648764976507651765276537654765576567657765876597660766176627663766476657666766776687669767076717672767376747675767676777678767976807681768276837684768576867687768876897690769176927693769476957696769776987699770077017702770377047705770677077708770977107711771277137714771577167717771877197720772177227723772477257726772777287729773077317732773377347735773677377738773977407741774277437744774577467747774877497750775177527753775477557756775777587759776077617762776377647765776677677768776977707771777277737774777577767
77777787779778077817782778377847785778677877788778977907791779277937794779577967797779877997800780178027803780478057806780778087809781078117812781378147815781678177818781978207821782278237824782578267827782878297830783178327833783478357836783778387839784078417842784378447845784678477848784978507851785278537854785578567857785878597860786178627863786478657866786778687869787078717872787378747875787678777878787978807881788278837884788578867887788878897890789178927893789478957896789778987899790079017902790379047905790679077908790979107911791279137914791579167917791879197920792179227923792479257926792779287929793079317932793379347935793679377938793979407941794279437944794579467947794879497950795179527953795479557956795779587959796079617962796379647965796679677968796979707971797279737974797579767977797879797980798179827983798479857986798779887989799079917992799379947995799679977998799980008001800280038004800580068007800880098010801180128013801480158016801780188019802080218022802380248025802680278028802980308031803280338034803580368037803880398040804180428043804480458046804780488049805080518052805380548055805680578058805980608061806280638064806580668067806880698070807180728073807480758076807780788079808080818082808380848085808680878088808980908091809280938094809580968097809880998100810181028103810481058106810781088109811081118112811381148115811681178118811981208121812281238124812581268127812881298130813181328133813481358136813781388139814081418142814381448145814681478148814981508151815281538154815581568157815881598160816181628163816481658166816781688169817081718172817381748175817681778178817981808181818281838184818581868187818881898190819181928193819481958196819781988199820082018202820382048205820682078208820982108211821282138214821582168217821882198220822182228223822482258226822782288229823082318232823382348235823682378238823982408241824282438244824582468247824882498250825182528253825482558256825782588259826082618262826382648265826682678268826982708271827282738274827582768
27782788279828082818282828382848285828682878288828982908291829282938294829582968297829882998300830183028303830483058306830783088309831083118312831383148315831683178318831983208321832283238324832583268327832883298330833183328333833483358336833783388339834083418342834383448345834683478348834983508351835283538354835583568357835883598360836183628363836483658366836783688369837083718372837383748375837683778378837983808381838283838384838583868387838883898390839183928393839483958396839783988399840084018402840384048405840684078408840984108411841284138414841584168417841884198420842184228423842484258426842784288429843084318432843384348435843684378438843984408441844284438444844584468447844884498450845184528453845484558456845784588459846084618462846384648465846684678468846984708471847284738474847584768477847884798480848184828483848484858486848784888489849084918492849384948495849684978498849985008501850285038504850585068507850885098510851185128513851485158516851785188519852085218522852385248525852685278528852985308531853285338534853585368537853885398540854185428543854485458546854785488549855085518552855385548555855685578558855985608561856285638564856585668567856885698570857185728573857485758576857785788579858085818582858385848585858685878588858985908591859285938594859585968597859885998600860186028603860486058606860786088609861086118612861386148615861686178618861986208621862286238624862586268627862886298630863186328633863486358636863786388639864086418642864386448645864686478648864986508651865286538654865586568657865886598660866186628663866486658666866786688669867086718672867386748675867686778678867986808681868286838684868586868687868886898690869186928693869486958696869786988699870087018702870387048705870687078708870987108711871287138714871587168717871887198720872187228723872487258726872787288729873087318732873387348735873687378738873987408741874287438744874587468747874887498750875187528753875487558756875787588759876087618762876387648765876687678768876987708771877287738774877587768
77787788779878087818782878387848785878687878788878987908791879287938794879587968797879887998800880188028803880488058806880788088809881088118812881388148815881688178818881988208821882288238824882588268827882888298830883188328833883488358836883788388839884088418842884388448845884688478848884988508851885288538854885588568857885888598860886188628863886488658866886788688869887088718872887388748875887688778878887988808881888288838884888588868887888888898890889188928893889488958896889788988899890089018902890389048905890689078908890989108911891289138914891589168917891889198920892189228923892489258926892789288929893089318932893389348935893689378938893989408941894289438944894589468947894889498950895189528953895489558956895789588959896089618962896389648965896689678968896989708971897289738974897589768977897889798980898189828983898489858986898789888989899089918992899389948995899689978998899990009001900290039004900590069007900890099010901190129013901490159016901790189019902090219022902390249025902690279028902990309031903290339034903590369037903890399040904190429043904490459046904790489049905090519052905390549055905690579058905990609061906290639064906590669067906890699070907190729073907490759076907790789079908090819082908390849085908690879088908990909091909290939094909590969097909890999100910191029103910491059106910791089109911091119112911391149115911691179118911991209121912291239124912591269127912891299130913191329133913491359136913791389139914091419142914391449145914691479148914991509151915291539154915591569157915891599160916191629163916491659166916791689169917091719172917391749175917691779178917991809181918291839184918591869187918891899190919191929193919491959196919791989199920092019202920392049205920692079208920992109211921292139214921592169217921892199220922192229223922492259226922792289229923092319232923392349235923692379238923992409241924292439244924592469247924892499250925192529253925492559256925792589259926092619262926392649265926692679268926992709271927292739274927592769
27792789279928092819282928392849285928692879288928992909291929292939294929592969297929892999300930193029303930493059306930793089309931093119312931393149315931693179318931993209321932293239324932593269327932893299330933193329333933493359336933793389339934093419342934393449345934693479348934993509351935293539354935593569357935893599360936193629363936493659366936793689369937093719372937393749375937693779378937993809381938293839384938593869387938893899390939193929393939493959396939793989399940094019402940394049405940694079408940994109411941294139414941594169417941894199420942194229423942494259426942794289429943094319432943394349435943694379438943994409441944294439444944594469447944894499450945194529453945494559456945794589459946094619462946394649465946694679468946994709471947294739474947594769477947894799480948194829483948494859486948794889489949094919492949394949495949694979498949995009501950295039504950595069507950895099510951195129513951495159516951795189519952095219522952395249525952695279528952995309531953295339534953595369537953895399540954195429543954495459546954795489549955095519552955395549555955695579558955995609561956295639564956595669567956895699570957195729573957495759576957795789579958095819582958395849585958695879588958995909591959295939594959595969597959895999600960196029603960496059606960796089609961096119612961396149615961696179618961996209621962296239624962596269627962896299630963196329633963496359636963796389639964096419642964396449645964696479648964996509651965296539654965596569657965896599660966196629663966496659666966796689669967096719672967396749675967696779678967996809681968296839684968596869687968896899690969196929693969496959696969796989699970097019702970397049705970697079708970997109711971297139714971597169717971897199720972197229723972497259726972797289729973097319732973397349735973697379738973997409741974297439744974597469747974897499750975197529753975497559756975797589759976097619762976397649765976697679768976997709771977297739774977597769
77797789779978097819782978397849785978697879788978997909791979297939794979597969797979897999800980198029803980498059806980798089809981098119812981398149815981698179818981998209821982298239824982598269827982898299830983198329833983498359836983798389839984098419842984398449845984698479848984998509851985298539854985598569857985898599860986198629863986498659866986798689869987098719872987398749875987698779878987998809881988298839884988598869887988898899890989198929893989498959896989798989899990099019902990399049905990699079908990999109911991299139914991599169917991899199920992199229923992499259926992799289929993099319932993399349935993699379938993999409941994299439944994599469947994899499950995199529953995499559956995799589959996099619962996399649965996699679968996999709971997299739974997599769977997899799980998199829983998499859986998799889989999099919992999399949995999699979998999910000100011000210003100041000510006100071000810009100101001110012100131001410015100161001710018100191002010021100221002310024100251002610027100281002910030100311003210033100341003510036100371003810039100401004110042100431004410045100461004710048100491005010051100521005310054100551005610057100581005910060100611006210063100641006510066100671006810069100701007110072100731007410075100761007710078100791008010081100821008310084100851008610087100881008910090100911009210093100941009510096100971009810099101001010110102101031010410105101061010710108101091011010111101121011310114101151011610117101181011910120101211012210123101241012510126101271012810129101301013110132101331013410135101361013710138101391014010141101421014310144101451014610147101481014910150101511015210153101541015510156101571015810159101601016110162101631016410165101661016710168101691017010171101721017310174101751017610177101781017910180101811018210183101841018510186101871018810189101901019110192101931019410195101961019710198101991020010201102021020310204102051020610207102081020910210102111021210213102141021510216102171021810219102201022
11022210223102241022510226102271022810229102301023110232102331023410235102361023710238102391024010241102421024310244102451024610247102481024910250102511025210253102541025510256102571025810259102601026110262102631026410265102661026710268102691027010271102721027310274102751027610277102781027910280102811028210283102841028510286102871028810289102901029110292102931029410295102961029710298102991030010301103021030310304103051030610307103081030910310103111031210313103141031510316103171031810319103201032110322103231032410325103261032710328103291033010331103321033310334103351033610337103381033910340103411034210343103441034510346103471034810349103501035110352103531035410355103561035710358103591036010361103621036310364103651036610367103681036910370103711037210373103741037510376103771037810379103801038110382103831038410385103861038710388103891039010391103921039310394103951039610397103981039910400104011040210403104041040510406104071040810409104101041110412104131041410415104161041710418104191042010421104221042310424104251042610427104281042910430104311043210433104341043510436104371043810439104401044110442104431044410445104461044710448104491045010451104521045310454104551045610457104581045910460104611046210463104641046510466104671046810469104701047110472104731047410475104761047710478104791048010481104821048310484104851048610487104881048910490104911049210493104941049510496104971049810499105001050110502105031050410505105061050710508105091051010511105121051310514105151051610517105181051910520105211052210523105241052510526105271052810529105301053110532105331053410535105361053710538105391054010541105421054310544105451054610547105481054910550105511055210553105541055510556105571055810559105601056110562105631056410565105661056710568105691057010571105721057310574105751057610577105781057910580105811058210583105841058510586105871058810589105901059110592105931059410595105961059710598105991060010601106021060310604106051060610607106081060910610106111061210613106141061510616106171061810619106201062
11062210623106241062510626106271062810629106301063110632106331063410635106361063710638106391064010641106421064310644106451064610647106481064910650106511065210653106541065510656106571065810659106601066110662106631066410665106661066710668106691067010671106721067310674106751067610677106781067910680106811068210683106841068510686106871068810689106901069110692106931069410695106961069710698106991070010701107021070310704107051070610707107081070910710107111071210713107141071510716107171071810719107201072110722107231072410725107261072710728107291073010731107321073310734107351073610737107381073910740107411074210743107441074510746107471074810749107501075110752107531075410755107561075710758107591076010761107621076310764107651076610767107681076910770107711077210773107741077510776107771077810779107801078110782107831078410785107861078710788107891079010791107921079310794107951079610797107981079910800108011080210803108041080510806108071080810809108101081110812108131081410815108161081710818108191082010821108221082310824108251082610827108281082910830108311083210833108341083510836108371083810839108401084110842108431084410845108461084710848108491085010851108521085310854108551085610857108581085910860108611086210863108641086510866108671086810869108701087110872108731087410875108761087710878108791088010881108821088310884108851088610887108881088910890108911089210893108941089510896108971089810899109001090110902109031090410905109061090710908109091091010911109121091310914109151091610917109181091910920109211092210923109241092510926109271092810929109301093110932109331093410935109361093710938109391094010941109421094310944109451094610947109481094910950109511095210953109541095510956109571095810959109601096110962109631096410965109661096710968109691097010971109721097310974109751097610977109781097910980109811098210983109841098510986109871098810989109901099110992109931099410995109961099710998109991100011001110021100311004110051100611007110081100911010110111101211013110141101511016110171101811019110201102
11102211023110241102511026110271102811029110301103111032110331103411035110361103711038110391104011041110421104311044110451104611047110481104911050110511105211053110541105511056110571105811059110601106111062110631106411065110661106711068110691107011071110721107311074110751107611077110781107911080110811108211083110841108511086110871108811089110901109111092110931109411095110961109711098110991110011101111021110311104111051110611107111081110911110111111111211113111141111511116111171111811119111201112111122111231112411125111261112711128111291113011131111321113311134111351113611137111381113911140111411114211143111441114511146111471114811149111501115111152111531115411155111561115711158111591116011161111621116311164111651116611167111681116911170111711117211173111741117511176111771117811179111801118111182111831118411185111861118711188111891119011191111921119311194111951119611197111981119911200112011120211203112041120511206112071120811209112101121111212112131121411215112161121711218112191122011221112221122311224112251122611227112281122911230112311123211233112341123511236112371123811239112401124111242112431124411245112461124711248112491125011251112521125311254112551125611257112581125911260112611126211263112641126511266112671126811269112701127111272112731127411275112761127711278112791128011281112821128311284112851128611287112881128911290112911129211293112941129511296112971129811299113001130111302113031130411305113061130711308113091131011311113121131311314113151131611317113181131911320113211132211323113241132511326113271132811329113301133111332113331133411335113361133711338113391134011341113421134311344113451134611347113481134911350113511135211353113541135511356113571135811359113601136111362113631136411365113661136711368113691137011371113721137311374113751137611377113781137911380113811138211383113841138511386113871138811389113901139111392113931139411395113961139711398113991140011401114021140311404114051140611407114081140911410114111141211413114141141511416114171141811419114201142
11142211423114241142511426114271142811429114301143111432114331143411435114361143711438114391144011441114421144311444114451144611447114481144911450114511145211453114541145511456114571145811459114601146111462114631146411465114661146711468114691147011471114721147311474114751147611477114781147911480114811148211483114841148511486114871148811489114901149111492114931149411495114961149711498114991150011501115021150311504115051150611507115081150911510115111151211513115141151511516115171151811519115201152111522115231152411525115261152711528115291153011531115321153311534115351153611537115381153911540115411154211543115441154511546115471154811549115501155111552115531155411555115561155711558115591156011561115621156311564115651156611567115681156911570115711157211573115741157511576115771157811579115801158111582115831158411585115861158711588115891159011591115921159311594115951159611597115981159911600116011160211603116041160511606116071160811609116101161111612116131161411615116161161711618116191162011621116221162311624116251162611627116281162911630116311163211633116341163511636116371163811639116401164111642116431164411645116461164711648116491165011651116521165311654116551165611657116581165911660116611166211663116641166511666116671166811669116701167111672116731167411675116761167711678116791168011681116821168311684116851168611687116881168911690116911169211693116941169511696116971169811699117001170111702117031170411705117061170711708117091171011711117121171311714117151171611717117181171911720117211172211723117241172511726117271172811729117301173111732117331173411735117361173711738117391174011741117421174311744117451174611747117481174911750117511175211753117541175511756117571175811759117601176111762117631176411765117661176711768117691177011771117721177311774117751177611777117781177911780117811178211783117841178511786117871178811789117901179111792117931179411795117961179711798117991180011801118021180311804118051180611807118081180911810118111181211813118141181511816118171181811819118201182
11182211823118241182511826118271182811829118301183111832118331183411835118361183711838118391184011841118421184311844118451184611847118481184911850118511185211853118541185511856118571185811859118601186111862118631186411865118661186711868118691187011871118721187311874118751187611877118781187911880118811188211883118841188511886118871188811889118901189111892118931189411895118961189711898118991190011901119021190311904119051190611907119081190911910119111191211913119141191511916119171191811919119201192111922119231192411925119261192711928119291193011931119321193311934119351193611937119381193911940119411194211943119441194511946119471194811949119501195111952119531195411955119561195711958119591196011961119621196311964119651196611967119681196911970119711197211973119741197511976119771197811979119801198111982119831198411985119861198711988119891199011991119921199311994119951199611997119981199912000120011200212003120041200512006120071200812009120101201112012120131201412015120161201712018120191202012021120221202312024120251202612027120281202912030120311203212033120341203512036120371203812039120401204112042120431204412045120461204712048120491205012051120521205312054120551205612057120581205912060120611206212063120641206512066120671206812069120701207112072120731207412075120761207712078120791208012081120821208312084120851208612087120881208912090120911209212093120941209512096120971209812099121001210112102121031210412105121061210712108121091211012111121121211312114121151211612117121181211912120121211212212123121241212512126121271212812129121301213112132121331213412135121361213712138121391214012141121421214312144121451214612147121481214912150121511215212153121541215512156121571215812159121601216112162121631216412165121661216712168121691217012171121721217312174121751217612177121781217912180121811218212183121841218512186121871218812189121901219112192121931219412195121961219712198121991220012201122021220312204122051220612207122081220912210122111221212213122141221512216122171221812219122201222
11222212223122241222512226122271222812229122301223112232122331223412235122361223712238122391224012241122421224312244122451224612247122481224912250122511225212253122541225512256122571225812259122601226112262122631226412265122661226712268122691227012271122721227312274122751227612277122781227912280122811228212283122841228512286122871228812289122901229112292122931229412295122961229712298122991230012301123021230312304123051230612307123081230912310123111231212313123141231512316123171231812319123201232112322123231232412325123261232712328123291233012331123321233312334123351233612337123381233912340123411234212343123441234512346123471234812349123501235112352123531235412355123561235712358123591236012361123621236312364123651236612367123681236912370123711237212373123741237512376123771237812379123801238112382123831238412385123861238712388123891239012391123921239312394123951239612397123981239912400124011240212403124041240512406124071240812409124101241112412124131241412415124161241712418124191242012421124221242312424124251242612427124281242912430124311243212433124341243512436124371243812439124401244112442124431244412445124461244712448124491245012451124521245312454124551245612457124581245912460124611246212463124641246512466124671246812469124701247112472124731247412475124761247712478124791248012481124821248312484124851248612487124881248912490124911249212493124941249512496124971249812499125001250112502125031250412505125061250712508125091251012511125121251312514125151251612517125181251912520125211252212523125241252512526125271252812529125301253112532125331253412535125361253712538125391254012541125421254312544125451254612547125481254912550125511255212553125541255512556125571255812559125601256112562125631256412565125661256712568125691257012571125721257312574125751257612577125781257912580125811258212583125841258512586125871258812589125901259112592125931259412595125961259712598125991260012601126021260312604126051260612607126081260912610126111261212613126141261512616126171261812619126201262
11262212623126241262512626126271262812629126301263112632126331263412635126361263712638126391264012641126421264312644126451264612647126481264912650126511265212653126541265512656126571265812659126601266112662126631266412665126661266712668126691267012671126721267312674126751267612677126781267912680126811268212683126841268512686126871268812689126901269112692126931269412695126961269712698126991270012701127021270312704127051270612707127081270912710127111271212713127141271512716127171271812719127201272112722127231272412725127261272712728127291273012731127321273312734127351273612737127381273912740127411274212743127441274512746127471274812749127501275112752127531275412755127561275712758127591276012761127621276312764127651276612767127681276912770127711277212773127741277512776127771277812779127801278112782127831278412785127861278712788127891279012791127921279312794127951279612797127981279912800128011280212803128041280512806128071280812809128101281112812128131281412815128161281712818128191282012821128221282312824128251282612827128281282912830128311283212833128341283512836128371283812839128401284112842128431284412845128461284712848128491285012851128521285312854128551285612857128581285912860128611286212863128641286512866128671286812869128701287112872128731287412875128761287712878128791288012881128821288312884128851288612887128881288912890128911289212893128941289512896128971289812899129001290112902129031290412905129061290712908129091291012911129121291312914129151291612917129181291912920129211292212923129241292512926129271292812929129301293112932129331293412935129361293712938129391294012941129421294312944129451294612947129481294912950129511295212953129541295512956129571295812959129601296112962129631296412965129661296712968129691297012971129721297312974129751297612977129781297912980129811298212983129841298512986129871298812989129901299112992129931299412995129961299712998129991300013001130021300313004130051300613007130081300913010130111301213013130141301513016130171301813019130201302
113022130231302413025130261302713028130291303013031130321303313034130351303613037130381303913040130411304213043130441304513046130471304813049130501305113052130531305413055130561305713058130591306013061130621306313064130651306613067130681306913070130711307213073130741307513076130771307813079130801308113082130831308413085130861308713088130891309013091130921309313094130951309613097130981309913100131011310213103131041310513106131071310813109131101311113112131131311413115131161311713118131191312013121131221312313124131251312613127131281312913130131311313213133131341313513136131371313813139131401314113142131431314413145131461314713148131491315013151131521315313154131551315613157131581315913160131611316213163131641316513166131671316813169131701317113172131731317413175131761317713178131791318013181131821318313184131851318613187131881318913190131911319213193131941319513196131971319813199132001320113202132031320413205132061320713208132091321013211132121321313214132151321613217132181321913220132211322213223132241322513226132271322813229132301323113232132331323413235132361323713238132391324013241132421324313244132451324613247132481324913250132511325213253132541325513256132571325813259132601326113262132631326413265132661326713268132691327013271132721327313274132751327613277132781327913280132811328213283132841328513286132871328813289132901329113292132931329413295132961329713298132991330013301133021330313304133051330613307133081330913310133111331213313133141331513316133171331813319133201332113322133231332413325133261332713328133291333013331133321333313334133351333613337133381333913340133411334213343133441334513346133471334813349133501335113352133531335413355133561335713358133591336013361133621336313364133651336613367133681336913370133711337213373133741337513376133771337813379133801338113382133831338413385133861338713388133891339013391133921339313394133951339613397133981339913400134011340213403134041340513406134071340813409134101341113412134131341413415134161341713418 |
- #include "llama-model.h"
- #include "llama-impl.h"
- #include "llama-mmap.h"
- #include "llama-batch.h"
- #include "llama-cparams.h"
- #include "llama-model-loader.h"
- #include "llama-kv-cache.h"
- #include "ggml-cpp.h"
- #include <algorithm>
- #include <cassert>
- #include <cmath>
- #include <cfloat>
- #include <cstring>
- #include <cmath>
- #include <functional>
- #include <map>
- #include <regex>
- #include <sstream>
- #include <stdexcept>
- const char * llm_type_name(llm_type type) {
- switch (type) {
- case LLM_TYPE_14M: return "14M";
- case LLM_TYPE_17M: return "17M";
- case LLM_TYPE_22M: return "22M";
- case LLM_TYPE_33M: return "33M";
- case LLM_TYPE_60M: return "60M";
- case LLM_TYPE_70M: return "70M";
- case LLM_TYPE_80M: return "80M";
- case LLM_TYPE_109M: return "109M";
- case LLM_TYPE_137M: return "137M";
- case LLM_TYPE_160M: return "160M";
- case LLM_TYPE_190M: return "190M";
- case LLM_TYPE_220M: return "220M";
- case LLM_TYPE_250M: return "250M";
- case LLM_TYPE_270M: return "270M";
- case LLM_TYPE_335M: return "335M";
- case LLM_TYPE_410M: return "410M";
- case LLM_TYPE_450M: return "450M";
- case LLM_TYPE_475M: return "475M";
- case LLM_TYPE_770M: return "770M";
- case LLM_TYPE_780M: return "780M";
- case LLM_TYPE_0_5B: return "0.5B";
- case LLM_TYPE_0_6B: return "0.6B";
- case LLM_TYPE_1B: return "1B";
- case LLM_TYPE_1_3B: return "1.3B";
- case LLM_TYPE_1_4B: return "1.4B";
- case LLM_TYPE_1_5B: return "1.5B";
- case LLM_TYPE_1_6B: return "1.6B";
- case LLM_TYPE_1_7B: return "1.7B";
- case LLM_TYPE_1_8B: return "1.8B";
- case LLM_TYPE_2B: return "2B";
- case LLM_TYPE_2_8B: return "2.8B";
- case LLM_TYPE_2_9B: return "2.9B";
- case LLM_TYPE_3B: return "3B";
- case LLM_TYPE_4B: return "4B";
- case LLM_TYPE_6B: return "6B";
- case LLM_TYPE_6_9B: return "6.9B";
- case LLM_TYPE_7B: return "7B";
- case LLM_TYPE_8B: return "8B";
- case LLM_TYPE_9B: return "9B";
- case LLM_TYPE_11B: return "11B";
- case LLM_TYPE_12B: return "12B";
- case LLM_TYPE_13B: return "13B";
- case LLM_TYPE_14B: return "14B";
- case LLM_TYPE_15B: return "15B";
- case LLM_TYPE_16B: return "16B";
- case LLM_TYPE_20B: return "20B";
- case LLM_TYPE_27B: return "27B";
- case LLM_TYPE_30B: return "30B";
- case LLM_TYPE_32B: return "32B";
- case LLM_TYPE_34B: return "34B";
- case LLM_TYPE_35B: return "35B";
- case LLM_TYPE_40B: return "40B";
- case LLM_TYPE_65B: return "65B";
- case LLM_TYPE_70B: return "70B";
- case LLM_TYPE_236B: return "236B";
- case LLM_TYPE_290B: return "290B";
- case LLM_TYPE_314B: return "314B";
- case LLM_TYPE_405B: return "405B";
- case LLM_TYPE_671B: return "671B";
- case LLM_TYPE_SMALL: return "0.1B";
- case LLM_TYPE_MEDIUM: return "0.4B";
- case LLM_TYPE_LARGE: return "0.8B";
- case LLM_TYPE_XL: return "1.5B";
- case LLM_TYPE_A1_7B: return "A1.7B";
- case LLM_TYPE_A2_7B: return "A2.7B";
- case LLM_TYPE_8x7B: return "8x7B";
- case LLM_TYPE_8x22B: return "8x22B";
- case LLM_TYPE_16x12B: return "16x12B";
- case LLM_TYPE_16x3_8B: return "16x3.8B";
- case LLM_TYPE_10B_128x3_66B: return "10B+128x3.66B";
- case LLM_TYPE_57B_A14B: return "57B.A14B";
- case LLM_TYPE_17B_16E: return "17Bx16E (Scout)";
- case LLM_TYPE_17B_128E: return "17Bx128E (Maverick)";
- case LLM_TYPE_30B_A3B: return "30B.A3B";
- case LLM_TYPE_235B_A22B: return "235B.A22B";
- default: return "?B";
- }
- }
- static const char * llama_expert_gating_func_name(llama_expert_gating_func_type type) {
- switch (type) {
- case LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX: return "softmax";
- case LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID: return "sigmoid";
- default: return "unknown";
- }
- }
- static const std::map<llama_rope_scaling_type, const char *> LLAMA_ROPE_SCALING_TYPES = {
- { LLAMA_ROPE_SCALING_TYPE_NONE, "none" },
- { LLAMA_ROPE_SCALING_TYPE_LINEAR, "linear" },
- { LLAMA_ROPE_SCALING_TYPE_YARN, "yarn" },
- { LLAMA_ROPE_SCALING_TYPE_LONGROPE, "longrope" },
- };
- static llama_rope_scaling_type llama_rope_scaling_type_from_string(const std::string & name) {
- for (const auto & kv : LLAMA_ROPE_SCALING_TYPES) {
- if (kv.second == name) {
- return (llama_rope_scaling_type) kv.first;
- }
- }
- return LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
- }
- // checks if the weight tensor can be used with the specified buffer type and device
- static bool weight_buft_supported(const llama_hparams & hparams, ggml_tensor * w, ggml_op op, ggml_backend_buffer_type_t buft, ggml_backend_dev_t dev) {
- GGML_ASSERT(w != nullptr);
- if (op == GGML_OP_NONE) {
- return true;
- }
- ggml_init_params params = {
- /*.mem_size =*/ ggml_tensor_overhead()*8,
- /*.mem_buffer =*/ NULL,
- /*.no_alloc =*/ true,
- };
- ggml_context_ptr ctx_ptr { ggml_init(params) };
- if (!ctx_ptr) {
- throw std::runtime_error(format("failed to create ggml context"));
- }
- ggml_context * ctx = ctx_ptr.get();
- ggml_tensor * op_tensor = nullptr;
- switch (op) {
- case GGML_OP_GET_ROWS:
- {
- ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 512);
- op_tensor = ggml_get_rows(ctx, w, b);
- } break;
- case GGML_OP_MUL_MAT:
- {
- ggml_tensor * b = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], 512, w->ne[2], w->ne[3]);
- op_tensor = ggml_mul_mat(ctx, w, b);
- } break;
- case GGML_OP_MUL_MAT_ID:
- {
- int n_expert_used = hparams.n_expert_used;
- ggml_tensor * b = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, w->ne[0], n_expert_used, 512);
- ggml_tensor * ids = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, n_expert_used, 512);
- op_tensor = ggml_mul_mat_id(ctx, w, b, ids);
- } break;
- case GGML_OP_ADD:
- {
- ggml_tensor * a = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], w->ne[1], w->ne[2], w->ne[3]);
- op_tensor = ggml_add(ctx, a, w);
- } break;
- case GGML_OP_MUL:
- {
- ggml_tensor * a = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], w->ne[1], w->ne[2], w->ne[3]);
- op_tensor = ggml_mul(ctx, a, w);
- } break;
- case GGML_OP_DIV:
- {
- ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, w->ne[0]);
- op_tensor = ggml_div(ctx, a, w);
- } break;
- case GGML_OP_ROPE:
- {
- int n_embd_head = hparams.n_embd_head_v;
- int n_head = hparams.n_head();
- ggml_tensor * a = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_embd_head, n_head, 512);
- ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 512);
- op_tensor = ggml_rope_ext(
- ctx, a, b, w,
- 0, 0, 0, 0, 0,
- 0, 0, 0, 0
- );
- } break;
- case GGML_OP_SSM_CONV:
- {
- // FIXME
- ggml_tensor * conv_x = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 12345, w->ne[1], 6789);
- op_tensor = ggml_ssm_conv(ctx, conv_x, w);
- } break;
- case GGML_OP_SSM_SCAN:
- {
- // FIXME
- const int64_t d_state = w->ne[0];
- const int64_t d_inner = w->ne[1];
- const int64_t n_seq_tokens = 512;
- const int64_t n_seqs = 1;
- ggml_tensor * s = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, d_state, d_inner, n_seqs);
- ggml_tensor * x = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, d_inner, n_seq_tokens, n_seqs);
- ggml_tensor * dt = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, d_inner, n_seq_tokens, n_seqs);
- ggml_tensor * B = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, d_state, n_seq_tokens, n_seqs);
- ggml_tensor * C = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, d_state, n_seq_tokens, n_seqs);
- op_tensor = ggml_ssm_scan(ctx, s, x, dt, w, B, C);
- } break;
- case GGML_OP_RWKV_WKV6:
- {
- // FIXME
- const int64_t S = 123;
- const int64_t H = 123;
- const int64_t n_tokens = 123;
- const int64_t n_seqs = 123;
- ggml_tensor * k = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens);
- ggml_tensor * v = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens);
- ggml_tensor * r = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens);
- ggml_tensor * tf = w;
- ggml_tensor * td = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens);
- ggml_tensor * state = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, S, n_seqs, S, H);
- op_tensor = ggml_rwkv_wkv6(ctx, k, v, r, tf, td, state);
- } break;
- case GGML_OP_IM2COL:
- {
- const int n_embd = hparams.n_embd;
- ggml_tensor * b = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, n_embd, w->ne[1], 1, 1);
- op_tensor = ggml_im2col(ctx, w, b, 1, 0, 0, 0, 1, 0, false, GGML_TYPE_F16);
- } break;
- default:
- GGML_ABORT("%s: missing test for op %s for tensor %s", __func__, ggml_op_name(op), w->name);
- }
- // create a temporary dummy buffer for the weight so that supports_op can check the buffer type
- GGML_ASSERT(w->buffer == nullptr);
- w->buffer = ggml_backend_buft_alloc_buffer(buft, 0);
- bool op_supported = ggml_backend_dev_supports_op(dev, op_tensor);
- ggml_backend_buffer_free(w->buffer);
- w->buffer = nullptr;
- return op_supported;
- }
- // ordered list of (device, buffer type) candidates used for each layer; select_weight_buft() returns the first entry that supports a tensor
- using buft_list_t = std::vector<std::pair<ggml_backend_dev_t, ggml_backend_buffer_type_t>>;
- // pick the buffer type for a weight: walks buft_list in order and returns the buffer type of the
- // first (device, buffer type) entry for which weight_buft_supported() accepts `tensor` used with `op`
- // buft_list must not be empty; returns nullptr when no entry accepts the tensor
- static ggml_backend_buffer_type_t select_weight_buft(const llama_hparams & hparams, ggml_tensor * tensor, ggml_op op, const buft_list_t & buft_list) {
- GGML_ASSERT(!buft_list.empty());
- for (auto it = buft_list.begin(); it != buft_list.end(); ++it) {
- ggml_backend_dev_t candidate_dev = it->first;
- ggml_backend_buffer_type_t candidate_buft = it->second;
- const bool usable = weight_buft_supported(hparams, tensor, op, candidate_buft, candidate_dev);
- if (!usable) {
- continue;
- }
- return candidate_buft;
- }
- // nothing in the list can be used for this weight
- return nullptr;
- }
- // CPU: ACCEL -> GPU host -> CPU extra -> CPU
- // builds the ordered list of (device, buffer type) candidates for weights that stay on the CPU side;
- // entries are appended in priority order (select_weight_buft() takes the first one that fits)
- // devices: searched in order for a host buffer type; only the first one found is added
- // throws std::runtime_error if no CPU backend device is available
- static buft_list_t make_cpu_buft_list(const std::vector<ggml_backend_dev_t> & devices) {
- buft_list_t buft_list;
- // add ACCEL buffer types
- for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
- ggml_backend_dev_t dev = ggml_backend_dev_get(i);
- if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_ACCEL) {
- auto * buft = ggml_backend_dev_buffer_type(dev);
- // skip ACCEL devices that only expose the plain CPU buffer type
- if (buft != ggml_backend_cpu_buffer_type()) {
- buft_list.emplace_back(dev, buft);
- }
- }
- }
- // add a host buffer type
- // storing the tensors in a host buffer is useful when the processing of large batches
- // is offloaded to a GPU device, since it reduces the time spent on data transfers
- // generally, this will be done using the first device in the list
- // a better approach would be to handle this on a weight-by-weight basis using the offload_op
- // function of the device to determine if it would benefit from being stored in a host buffer
- for (auto * dev : devices) {
- ggml_backend_buffer_type_t buft = ggml_backend_dev_host_buffer_type(dev);
- if (buft) {
- buft_list.emplace_back(dev, buft);
- break;
- }
- }
- // add the extra buffer types of the CPU backend, after the host buffer type
- // NOTE(review): this used to read "only if no GPU device is present", but no such check is made here;
- // only the list order gives a host buffer type priority over the extra types - confirm the intent
- // ref: https://github.com/ggml-org/llama.cpp/issues/12481#issuecomment-2743136094
- auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
- if (cpu_dev == nullptr) {
- // without this check the registry lookup below would be handed a null device
- throw std::runtime_error(format("%s: no CPU backend found", __func__));
- }
- auto * cpu_reg = ggml_backend_dev_backend_reg(cpu_dev);
- // optional entry point of the CPU backend, looked up by name
- auto ggml_backend_dev_get_extra_bufts_fn = (ggml_backend_dev_get_extra_bufts_t)
- ggml_backend_reg_get_proc_address(cpu_reg, "ggml_backend_dev_get_extra_bufts");
- if (ggml_backend_dev_get_extra_bufts_fn) {
- // the returned array is walked until its first null entry
- ggml_backend_buffer_type_t * extra_bufts = ggml_backend_dev_get_extra_bufts_fn(cpu_dev);
- while (extra_bufts && *extra_bufts) {
- buft_list.emplace_back(cpu_dev, *extra_bufts);
- ++extra_bufts;
- }
- }
- // add the CPU buffer type
- for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
- ggml_backend_dev_t dev = ggml_backend_dev_get(i);
- if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_CPU) {
- buft_list.emplace_back(dev, ggml_backend_dev_buffer_type(dev));
- }
- }
- return buft_list;
- }
- // GPU: split if LLAMA_SPLIT_MODE_ROW -> GPU
- // builds the ordered buffer type list for one device: first the backend's split buffer type
- // (only in row split mode, and only if the backend exports one and returns a non-null type),
- // then the device's default buffer type
- // throws std::runtime_error if dev is not listed by its own backend registry
- static buft_list_t make_gpu_buft_list(ggml_backend_dev_t dev, llama_split_mode split_mode, const float * tensor_split) {
- buft_list_t result;
- const bool want_split = split_mode == LLAMA_SPLIT_MODE_ROW;
- if (want_split) {
- ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev);
- // optional entry point of the backend, looked up by name
- auto split_buft_fn = (ggml_backend_split_buffer_type_t)
- ggml_backend_reg_get_proc_address(reg, "ggml_backend_split_buffer_type");
- if (split_buft_fn != nullptr) {
- // find the position of dev within its registry; the split entry point takes that index
- size_t dev_index = 0;
- bool found = false;
- while (dev_index < ggml_backend_reg_dev_count(reg)) {
- if (ggml_backend_reg_dev_get(reg, dev_index) == dev) {
- found = true;
- break;
- }
- ++dev_index;
- }
- if (!found) {
- throw std::runtime_error(format("device %s not found in its backend reg", ggml_backend_dev_name(dev)));
- }
- ggml_backend_buffer_type_t split_buft = split_buft_fn(dev_index, tensor_split);
- if (split_buft != nullptr) {
- result.emplace_back(dev, split_buft);
- }
- }
- }
- // the device default buffer type always goes last
- result.emplace_back(dev, ggml_backend_dev_buffer_type(dev));
- return result;
- }
- // private state of llama_model, held through the pimpl member
- struct llama_model::impl {
- impl() = default;
- ~impl() = default;
- // element and byte counters, copied from the model loader in load_stats()
- uint64_t n_elements = 0;
- size_t n_bytes = 0;
- std::string desc_str;
- // model memory mapped files
- llama_mmaps mappings;
- // objects representing data potentially being locked in memory
- llama_mlocks mlock_bufs;
- llama_mlocks mlock_mmaps;
- // contexts where the model tensors metadata is stored
- std::vector<ggml_context_ptr> ctxs;
- // the model memory buffers for the tensor data
- std::vector<ggml_backend_buffer_ptr> bufs;
- // buffer type candidates: one list for the CPU and one list per device
- buft_list_t cpu_buft_list;
- std::map<ggml_backend_dev_t, buft_list_t> gpu_buft_list;
- // a device paired with a non-owning pointer to a buffer type list
- struct layer_dev {
- ggml_backend_dev_t dev = nullptr;
- buft_list_t * buft_list = nullptr;
- };
- layer_dev dev_input = {};
- layer_dev dev_output = {};
- std::vector<layer_dev> dev_layer;
- // set by the llama_model constructor; initialized here so it never holds an indeterminate value
- bool has_tensor_overrides = false;
- };
- // initializes the params member from the argument and creates the private impl state
- llama_model::llama_model(const llama_model_params & params) : params(params), pimpl(std::make_unique<impl>()) {
- // overrides count as present when the override array exists and its first entry has a pattern
- const auto * overrides = params.tensor_buft_overrides;
- const bool first_has_pattern = overrides != nullptr && overrides[0].pattern != nullptr;
- pimpl->has_tensor_overrides = first_has_pattern;
- }
- // out-of-line destructor; NOTE(review): presumably kept in this file because impl is only complete here - confirm
- llama_model::~llama_model() = default;
- // copies the loader's n_elements and n_bytes counters into the model's private state
- void llama_model::load_stats(llama_model_loader & ml) {
- impl & state = *pimpl;
- state.n_bytes = ml.n_bytes;
- state.n_elements = ml.n_elements;
- }
- // reads the architecture from the loader into `arch`
- // throws std::runtime_error (naming the architecture string from the loader) if it is LLM_ARCH_UNKNOWN
- void llama_model::load_arch(llama_model_loader & ml) {
- arch = ml.get_arch();
- if (arch != LLM_ARCH_UNKNOWN) {
- return;
- }
- throw std::runtime_error("unknown model architecture: '" + ml.get_arch_name() + "'");
- }
- void llama_model::load_hparams(llama_model_loader & ml) {
- const gguf_context * ctx = ml.meta.get();
- // get metadata as string
- for (int i = 0; i < gguf_get_n_kv(ctx); i++) {
- gguf_type type = gguf_get_kv_type(ctx, i);
- if (type == GGUF_TYPE_ARRAY) {
- continue;
- }
- const char * name = gguf_get_key(ctx, i);
- const std::string value = gguf_kv_to_str(ctx, i);
- gguf_kv.emplace(name, value);
- }
- // get general kv
- ml.get_key(LLM_KV_GENERAL_NAME, name, false);
- // everything past this point is not vocab-related
- if (hparams.vocab_only) {
- return;
- }
- ml.get_key(LLM_KV_CONTEXT_LENGTH, hparams.n_ctx_train);
- ml.get_key(LLM_KV_EMBEDDING_LENGTH, hparams.n_embd);
- ml.get_key(LLM_KV_BLOCK_COUNT, hparams.n_layer);
- ml.get_key(LLM_KV_EXPERT_COUNT, hparams.n_expert, false);
- ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used, false);
- if (arch == LLM_ARCH_WAVTOKENIZER_DEC) {
- ml.get_key(LLM_KV_FEATURES_LENGTH, hparams.n_embd_features);
- ml.get_key(LLM_KV_POSNET_EMBEDDING_LENGTH, hparams.posnet.n_embd);
- ml.get_key(LLM_KV_POSNET_BLOCK_COUNT, hparams.posnet.n_layer);
- ml.get_key(LLM_KV_CONVNEXT_EMBEDDING_LENGTH, hparams.convnext.n_embd);
- ml.get_key(LLM_KV_CONVNEXT_BLOCK_COUNT, hparams.convnext.n_layer);
- }
- GGML_ASSERT(hparams.n_expert <= LLAMA_MAX_EXPERTS);
- GGML_ASSERT(hparams.n_expert_used <= hparams.n_expert);
- if (hparams.n_expert > 0) {
- GGML_ASSERT(hparams.n_expert_used > 0);
- } else {
- GGML_ASSERT(hparams.n_expert_used == 0);
- }
- // zero-out the array hparams
- std::fill(hparams.n_head_arr.begin(), hparams.n_head_arr.end(), 0);
- std::fill(hparams.n_head_kv_arr.begin(), hparams.n_head_kv_arr.end(), 0);
- std::fill(hparams.n_ff_arr.begin(), hparams.n_ff_arr.end(), 0);
- ml.get_key_or_arr(LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff_arr, hparams.n_layer, false);
- ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head_arr, hparams.n_layer, false);
- // n_head_kv is optional, default to n_head
- hparams.n_head_kv_arr = hparams.n_head_arr;
- ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT_KV, hparams.n_head_kv_arr, hparams.n_layer, false);
- bool rope_finetuned = false;
- ml.get_key(LLM_KV_ROPE_SCALING_FINETUNED, rope_finetuned, false);
- hparams.rope_finetuned = rope_finetuned;
- hparams.n_ctx_orig_yarn = hparams.n_ctx_train;
- ml.get_key(LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, hparams.n_ctx_orig_yarn, false);
- // rope_freq_base (optional)
- hparams.rope_freq_base_train = 10000.0f;
- ml.get_key(LLM_KV_ROPE_FREQ_BASE, hparams.rope_freq_base_train, false);
- std::string rope_scaling("linear");
- ml.get_key(LLM_KV_ROPE_SCALING_TYPE, rope_scaling, false);
- hparams.rope_scaling_type_train = llama_rope_scaling_type_from_string(rope_scaling);
- GGML_ASSERT(hparams.rope_scaling_type_train != LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED);
- // rope_freq_scale (inverse of the kv) is optional
- float ropescale = 0.0f;
- if (!ml.get_key(LLM_KV_ROPE_SCALING_FACTOR, ropescale, false)) {
- // try the old key name
- ml.get_key(LLM_KV_ROPE_SCALE_LINEAR, ropescale, false);
- }
- hparams.rope_freq_scale_train = ropescale == 0.0f ? 1.0f : 1.0f/ropescale;
- // by default assume that the sliding-window layers use the same scaling type as the non-sliding-window layers
- hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
- hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
- ml.get_key(LLM_KV_ROPE_SCALING_ATTN_FACTOR, hparams.rope_attn_factor, false);
- // non-transformer models do not have attention heads
- if (hparams.n_head() > 0) {
- // gpt-neox n_rot = rotary_pct * (n_embd / n_head)
- // gpt-j n_rot = rotary_dim
- hparams.n_embd_head_k = hparams.n_embd / hparams.n_head();
- ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH, hparams.n_embd_head_k, false);
- hparams.n_embd_head_v = hparams.n_embd / hparams.n_head();
- ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH, hparams.n_embd_head_v, false);
- // sanity check for n_rot (optional)
- hparams.n_rot = hparams.n_embd_head_k;
- ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot, false);
- if (arch == LLM_ARCH_LLAMA || arch == LLM_ARCH_DECI || arch == LLM_ARCH_FALCON) {
- if (hparams.n_rot != hparams.n_embd_head_k) {
- throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd_head_k));
- }
- }
- } else {
- hparams.n_rot = 0;
- hparams.n_embd_head_k = 0;
- hparams.n_embd_head_v = 0;
- }
- // for differentiating model types
- uint32_t n_vocab = 0;
- ml.get_key(LLM_KV_VOCAB_SIZE, n_vocab, false) || ml.get_arr_n(LLM_KV_TOKENIZER_LIST, n_vocab, false);
- // arch-specific KVs
- switch (arch) {
- case LLM_ARCH_LLAMA:
- {
- ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
- if (hparams.n_expert == 8) {
- switch (hparams.n_layer) {
- case 32: type = LLM_TYPE_8x7B; break;
- case 56: type = LLM_TYPE_8x22B; break;
- default: type = LLM_TYPE_UNKNOWN;
- }
- } else {
- switch (hparams.n_layer) {
- case 16: type = LLM_TYPE_1B; break; // Llama 3.2 1B
- case 22: type = LLM_TYPE_1B; break;
- case 26: type = LLM_TYPE_3B; break;
- case 28: type = LLM_TYPE_3B; break; // Llama 3.2 3B
- // granite uses a vocab with len 49152
- case 32: type = n_vocab == 49152 ? LLM_TYPE_3B : (n_vocab < 40000 ? LLM_TYPE_7B : LLM_TYPE_8B); break;
- case 36: type = LLM_TYPE_8B; break; // granite
- case 40: type = LLM_TYPE_13B; break;
- case 48: type = LLM_TYPE_34B; break;
- case 60: type = LLM_TYPE_30B; break;
- case 80: type = hparams.n_head() == hparams.n_head_kv() ? LLM_TYPE_65B : LLM_TYPE_70B; break;
- default: type = LLM_TYPE_UNKNOWN;
- }
- }
- } break;
- case LLM_ARCH_LLAMA4:
- {
- ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
- ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
- ml.get_key(LLM_KV_INTERLEAVE_MOE_LAYER_STEP, hparams.n_moe_layer_step);
- hparams.n_swa_pattern = 4; // pattern: 3 chunked - 1 full
- hparams.n_attn_chunk = 8192; // should this be a gguf kv? currently it's the same for Scout and Maverick
- hparams.n_swa = 1; // TODO @ngxson : this is added to trigger the SWA branch (we store the chunked attn mask in the SWA tensor), will need to clean this up later
- switch (hparams.n_expert) {
- case 16: type = LLM_TYPE_17B_16E; break;
- case 128: type = LLM_TYPE_17B_128E; break;
- default: type = LLM_TYPE_UNKNOWN;
- }
- if (type == LLM_TYPE_17B_128E) {
- hparams.use_kq_norm = false;
- }
- } break;
- case LLM_ARCH_DECI:
- {
- ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
- switch (hparams.n_layer) {
- case 32: type = LLM_TYPE_7B; break;
- case 80: type = LLM_TYPE_70B; break;
- case 162: type = LLM_TYPE_405B; break;
- default: type = LLM_TYPE_UNKNOWN;
- }
- } break;
- case LLM_ARCH_MINICPM:
- {
- ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
- ml.get_key(LLM_KV_EMBEDDING_SCALE, hparams.f_embedding_scale);
- ml.get_key(LLM_KV_RESIDUAL_SCALE, hparams.f_residual_scale);
- ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
- switch (hparams.n_layer) {
- case 52: type = LLM_TYPE_1B; break;
- case 40: type = LLM_TYPE_2B; break;
- default: type = LLM_TYPE_UNKNOWN;
- }
- } break;
- case LLM_ARCH_MINICPM3:
- {
- ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
- ml.get_key(LLM_KV_ATTENTION_Q_LORA_RANK, hparams.n_lora_q);
- ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK, hparams.n_lora_kv);
- switch (hparams.n_layer) {
- case 62: type = LLM_TYPE_4B; break;
- default: type = LLM_TYPE_UNKNOWN;
- }
- } break;
- case LLM_ARCH_GROK:
- {
- ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
- switch (hparams.n_layer) {
- case 64: type = LLM_TYPE_314B; break;
- default: type = LLM_TYPE_UNKNOWN;
- }
- } break;
- case LLM_ARCH_FALCON:
- {
- ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
- switch (hparams.n_layer) {
- case 32: type = LLM_TYPE_7B; break;
- case 60: type = LLM_TYPE_40B; break;
- default: type = LLM_TYPE_UNKNOWN;
- }
- } break;
- case LLM_ARCH_BAICHUAN:
- {
- ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
- switch (hparams.n_layer) {
- case 32: type = LLM_TYPE_7B; break;
- case 40: type = LLM_TYPE_13B; break;
- default: type = LLM_TYPE_UNKNOWN;
- }
- if (type == LLM_TYPE_13B) {
- // TODO: become GGUF KV parameter
- hparams.f_max_alibi_bias = 8.0f;
- }
- } break;
- case LLM_ARCH_STARCODER:
- {
- ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
- switch (hparams.n_layer) {
- case 24: type = LLM_TYPE_1B; break;
- case 36: type = LLM_TYPE_3B; break;
- case 42: type = LLM_TYPE_7B; break;
- case 40: type = LLM_TYPE_15B; break;
- default: type = LLM_TYPE_UNKNOWN;
- }
- } break;
- case LLM_ARCH_REFACT:
- {
- ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
- switch (hparams.n_layer) {
- case 32: type = LLM_TYPE_1B; break;
- default: type = LLM_TYPE_UNKNOWN;
- }
- // TODO: become GGUF KV parameter
- hparams.f_max_alibi_bias = 8.0f;
- } break;
- case LLM_ARCH_BERT:
- {
- ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
- ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
- ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
- switch (hparams.n_layer) {
- case 3:
- type = LLM_TYPE_17M; break; // bge-micro
- case 6:
- type = LLM_TYPE_22M; break; // MiniLM-L6
- case 12:
- switch (hparams.n_embd) {
- case 384: type = LLM_TYPE_33M; break; // MiniLM-L12, bge-small
- case 768: type = LLM_TYPE_109M; break; // bge-base
- default: type = LLM_TYPE_UNKNOWN;
- } break;
- case 24:
- type = LLM_TYPE_335M; break; // bge-large
- default: type = LLM_TYPE_UNKNOWN;
- }
- } break;
- case LLM_ARCH_JINA_BERT_V2:
- {
- ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
- ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
- ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
- hparams.f_max_alibi_bias = 8.0f;
- switch (hparams.n_layer) {
- case 4: type = LLM_TYPE_33M; break; // jina-embeddings-small
- case 12: type = LLM_TYPE_137M; break; // jina-embeddings-base
- default: type = LLM_TYPE_UNKNOWN;
- }
- } break;
- case LLM_ARCH_NOMIC_BERT:
- case LLM_ARCH_NOMIC_BERT_MOE:
- {
- ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
- ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
- ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
- ml.get_key(LLM_KV_MOE_EVERY_N_LAYERS, hparams.moe_every_n_layers, 0);
- if (hparams.n_layer == 12 && hparams.n_embd == 768) {
- if (arch == LLM_ARCH_NOMIC_BERT) {
- type = LLM_TYPE_137M;
- } else if (arch == LLM_ARCH_NOMIC_BERT_MOE && hparams.moe_every_n_layers == 2) {
- type = LLM_TYPE_475M;
- }
- }
- } break;
- case LLM_ARCH_BLOOM:
- {
- ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
- switch (hparams.n_layer) {
- case 24: type = LLM_TYPE_1B; break;
- case 30:
- switch (hparams.n_embd) {
- case 2560: type = LLM_TYPE_3B; break;
- case 4096: type = LLM_TYPE_7B; break;
- default: type = LLM_TYPE_UNKNOWN;
- } break;
- default: type = LLM_TYPE_UNKNOWN;
- }
- // TODO: become GGUF KV parameter
- hparams.f_max_alibi_bias = 8.0f;
- } break;
- case LLM_ARCH_MPT:
- {
- ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
- ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV, hparams.f_clamp_kqv, false);
- ml.get_key(LLM_KV_ATTENTION_MAX_ALIBI_BIAS, hparams.f_max_alibi_bias);
- switch (hparams.n_layer) {
- case 32: type = LLM_TYPE_7B; break;
- case 48: type = LLM_TYPE_30B; break;
- default: type = LLM_TYPE_UNKNOWN;
- }
- } break;
- case LLM_ARCH_STABLELM:
- {
- ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
- switch (hparams.n_layer) {
- case 24: type = LLM_TYPE_1B; break;
- case 32: type = LLM_TYPE_3B; break;
- case 40: type = LLM_TYPE_12B; break;
- default: type = LLM_TYPE_UNKNOWN;
- }
- } break;
- case LLM_ARCH_QWEN:
- {
- ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
- switch (hparams.n_layer) {
- case 32: type = LLM_TYPE_7B; break;
- case 40: type = LLM_TYPE_13B; break;
- default: type = LLM_TYPE_UNKNOWN;
- }
- } break;
- case LLM_ARCH_QWEN2VL:
- {
- ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, true);
- }
- // fall through
- case LLM_ARCH_QWEN2:
- {
- ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
- ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
- switch (hparams.n_layer) {
- case 24: type = hparams.n_embd == 1024 ? LLM_TYPE_0_5B : LLM_TYPE_1B; break;
- case 28: type = hparams.n_embd == 1536 ? LLM_TYPE_1_5B : LLM_TYPE_7B; break;
- case 32: type = LLM_TYPE_7B; break;
- case 36: type = LLM_TYPE_3B; break;
- case 40: type = hparams.n_head() == 20 ? LLM_TYPE_4B : LLM_TYPE_13B; break;
- case 48: type = LLM_TYPE_14B; break;
- case 64: type = LLM_TYPE_32B; break;
- case 80: type = LLM_TYPE_70B; break;
- default: type = LLM_TYPE_UNKNOWN;
- }
- } break;
- case LLM_ARCH_QWEN2MOE:
- {
- ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
- ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false);
- ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
- switch (hparams.n_layer) {
- case 24: type = LLM_TYPE_A2_7B; break;
- case 28: type = LLM_TYPE_57B_A14B; break;
- default: type = LLM_TYPE_UNKNOWN;
- }
- } break;
- case LLM_ARCH_QWEN3:
- {
- ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
- switch (hparams.n_layer) {
- case 28: type = hparams.n_embd == 1024 ? LLM_TYPE_0_6B : LLM_TYPE_1_7B; break;
- case 36: type = hparams.n_embd == 2560 ? LLM_TYPE_4B : LLM_TYPE_8B; break;
- case 40: type = LLM_TYPE_14B; break;
- case 64: type = LLM_TYPE_32B; break;
- default: type = LLM_TYPE_UNKNOWN;
- }
- } break;
- case LLM_ARCH_QWEN3MOE:
- {
- ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
- ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
- switch (hparams.n_layer) {
- case 48: type = LLM_TYPE_30B_A3B; break;
- case 94: type = LLM_TYPE_235B_A22B; break;
- default: type = LLM_TYPE_UNKNOWN;
- }
- } break;
- case LLM_ARCH_PHI2:
- {
- ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
- switch (hparams.n_layer) {
- case 24: type = LLM_TYPE_1B; break;
- case 32: type = LLM_TYPE_3B; break;
- default: type = LLM_TYPE_UNKNOWN;
- }
- } break;
- case LLM_ARCH_PHI3:
- {
- ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
- switch (hparams.n_layer) {
- case 24: type = LLM_TYPE_1B; break;
- case 32: type = LLM_TYPE_3B; break;
- case 40: type = LLM_TYPE_14B; break;
- default: type = LLM_TYPE_UNKNOWN;
- }
- // for backward compatibility ; see: https://github.com/ggerganov/llama.cpp/pull/8931
- if ((hparams.n_layer == 32 || hparams.n_layer == 40) && hparams.n_ctx_train == 4096) {
- // default value for Phi-3-mini-4k-instruct and Phi-3-medium-4k-instruct
- hparams.n_swa = 2047;
- } else if (hparams.n_layer == 32 && hparams.n_head_kv(0) == 32 && hparams.n_ctx_train == 131072) {
- // default value for Phi-3-mini-128k-instruct
- // note: this seems incorrect because the window is bigger than the train context?
- hparams.n_swa = 262144;
- } else if (hparams.n_layer == 40 && hparams.n_ctx_train == 131072) {
- // default value for Phi-3-medium-128k-instruct
- // note: this seems incorrect because the window is equal to the train context?
- hparams.n_swa = 131072;
- }
- bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
- if (!found_swa && hparams.n_swa == 0) {
- throw std::runtime_error("invalid value for sliding_window");
- }
- } break;
- case LLM_ARCH_PHIMOE:
- {
- ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
- switch (hparams.n_layer) {
- case 32: type = LLM_TYPE_16x3_8B; break;
- default: type = LLM_TYPE_UNKNOWN;
- }
- } break;
- case LLM_ARCH_PLAMO:
- {
- ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
- switch (hparams.n_layer) {
- case 40: type = LLM_TYPE_13B; break;
- default: type = LLM_TYPE_UNKNOWN;
- }
- } break;
- case LLM_ARCH_GPT2:
- {
- ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
- switch (hparams.n_layer) {
- case 12: type = LLM_TYPE_SMALL; break;
- case 24: type = LLM_TYPE_MEDIUM; break;
- case 36: type = LLM_TYPE_LARGE; break;
- case 48: type = LLM_TYPE_XL; break;
- default: type = LLM_TYPE_UNKNOWN;
- }
- } break;
- case LLM_ARCH_CODESHELL:
- {
- ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
- switch (hparams.n_layer) {
- case 42: type = LLM_TYPE_7B; break;
- default: type = LLM_TYPE_UNKNOWN;
- }
- } break;
- case LLM_ARCH_ORION:
- {
- ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
- switch (hparams.n_layer) {
- case 40: type = LLM_TYPE_14B; break;
- default: type = LLM_TYPE_UNKNOWN;
- }
- } break;
- case LLM_ARCH_INTERNLM2:
- {
- ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
- switch (hparams.n_layer) {
- case 32: type = LLM_TYPE_7B; break;
- case 48: type = LLM_TYPE_20B; break;
- default: type = LLM_TYPE_UNKNOWN;
- }
- } break;
- case LLM_ARCH_GEMMA:
- {
- ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
- switch (hparams.n_layer) {
- case 18: type = LLM_TYPE_2B; break;
- case 28: type = LLM_TYPE_7B; break;
- default: type = LLM_TYPE_UNKNOWN;
- }
- } break;
- case LLM_ARCH_GEMMA2:
- {
- hparams.n_swa = 4096; // default value of gemma 2
- hparams.n_swa_pattern = 2;
- hparams.attn_soft_cap = true;
- ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
- ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
- ml.get_key(LLM_KV_ATTN_LOGIT_SOFTCAPPING, hparams.f_attn_logit_softcapping, false);
- ml.get_key(LLM_KV_FINAL_LOGIT_SOFTCAPPING, hparams.f_final_logit_softcapping, false);
- switch (hparams.n_layer) {
- case 26: type = LLM_TYPE_2B; break;
- case 42: type = LLM_TYPE_9B; break;
- case 46: type = LLM_TYPE_27B; break;
- default: type = LLM_TYPE_UNKNOWN;
- }
- } break;
- case LLM_ARCH_GEMMA3:
- {
- hparams.n_swa_pattern = 6;
- hparams.rope_freq_base_train_swa = 10000.0f;
- hparams.rope_freq_scale_train_swa = 1.0f;
- ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
- ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
- switch (hparams.n_layer) {
- case 26: type = LLM_TYPE_1B; break;
- case 34: type = LLM_TYPE_4B; break;
- case 48: type = LLM_TYPE_12B; break;
- case 62: type = LLM_TYPE_27B; break;
- default: type = LLM_TYPE_UNKNOWN;
- }
- hparams.f_attention_scale = type == LLM_TYPE_27B
- ? 1.0f / std::sqrt(float(hparams.n_embd / hparams.n_head(0)))
- : 1.0f / std::sqrt(float(hparams.n_embd_head_k));
- } break;
- case LLM_ARCH_STARCODER2:
- {
- ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
- switch (hparams.n_layer) {
- case 30: type = LLM_TYPE_3B; break;
- case 32: type = LLM_TYPE_7B; break;
- case 40: type = LLM_TYPE_15B; break;
- case 52: type = LLM_TYPE_20B; break; // granite
- case 88: type = LLM_TYPE_34B; break; // granite
- default: type = LLM_TYPE_UNKNOWN;
- }
- } break;
- case LLM_ARCH_MAMBA:
- {
- ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv);
- ml.get_key(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner);
- ml.get_key(LLM_KV_SSM_STATE_SIZE, hparams.ssm_d_state);
- ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
- ml.get_key(LLM_KV_SSM_DT_B_C_RMS, hparams.ssm_dt_b_c_rms, false);
- ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
- switch (hparams.n_layer) {
- case 24:
- switch (hparams.n_embd) {
- case 768: type = LLM_TYPE_SMALL; break;
- default: type = LLM_TYPE_UNKNOWN;
- } break;
- case 48:
- switch (hparams.n_embd) {
- case 1024: type = LLM_TYPE_MEDIUM; break;
- case 1536: type = LLM_TYPE_LARGE; break;
- case 2048: type = LLM_TYPE_XL; break;
- default: type = LLM_TYPE_UNKNOWN;
- } break;
- case 64:
- switch (hparams.n_embd) {
- case 2560: type = LLM_TYPE_3B; break;
- default: type = LLM_TYPE_UNKNOWN;
- } break;
- default: type = LLM_TYPE_UNKNOWN;
- }
- } break;
- case LLM_ARCH_XVERSE:
- {
- ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
- switch (hparams.n_layer) {
- case 32: type = LLM_TYPE_7B; break;
- case 40: type = LLM_TYPE_13B; break;
- case 80: type = LLM_TYPE_65B; break;
- default: type = LLM_TYPE_UNKNOWN;
- }
- } break;
- case LLM_ARCH_COMMAND_R:
- {
- ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
- ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
- switch (hparams.n_layer) {
- case 40: type = LLM_TYPE_35B; break;
- default: type = LLM_TYPE_UNKNOWN;
- }
- } break;
- case LLM_ARCH_COHERE2:
- {
- hparams.n_swa_pattern = 4;
- ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
- ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
- ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
- switch (hparams.n_layer) {
- case 32: type = LLM_TYPE_8B; break;
- default: type = LLM_TYPE_UNKNOWN;
- }
- } break;
- case LLM_ARCH_DBRX:
- {
- ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
- ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV, hparams.f_clamp_kqv);
- switch (hparams.n_layer) {
- case 40: type = LLM_TYPE_16x12B; break;
- default: type = LLM_TYPE_UNKNOWN;
- }
- } break;
- case LLM_ARCH_OLMO:
- {
- ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
- ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV, hparams.f_clamp_kqv, false);
- switch (hparams.n_layer) {
- case 22: type = LLM_TYPE_1B; break;
- case 32: type = LLM_TYPE_7B; break;
- case 80: type = LLM_TYPE_70B; break;
- default: type = LLM_TYPE_UNKNOWN;
- }
- } break;
- case LLM_ARCH_OLMO2:
- {
- ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
- switch (hparams.n_layer) {
- case 16: type = LLM_TYPE_1B; break;
- case 32: type = LLM_TYPE_7B; break;
- case 40: type = LLM_TYPE_13B; break;
- case 64: type = LLM_TYPE_32B; break;
- default: type = LLM_TYPE_UNKNOWN;
- }
- } break;
- case LLM_ARCH_OLMOE:
- {
- ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
- switch (hparams.n_layer) {
- case 16: type = LLM_TYPE_A1_7B; break;
- default: type = LLM_TYPE_UNKNOWN;
- }
- } break;
- case LLM_ARCH_OPENELM:
- {
- ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
- switch (hparams.n_layer) {
- case 16: type = LLM_TYPE_270M; break;
- case 20: type = LLM_TYPE_450M; break;
- case 28: type = LLM_TYPE_1B; break;
- case 36: type = LLM_TYPE_3B; break;
- default: type = LLM_TYPE_UNKNOWN;
- }
- } break;
- case LLM_ARCH_GPTNEOX:
- {
- ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
- ml.get_key(LLM_KV_USE_PARALLEL_RESIDUAL, hparams.use_par_res);
- switch (hparams.n_layer) {
- case 6:
- switch (hparams.n_ff()) {
- case 512: type = LLM_TYPE_14M; break;
- case 2048: type = LLM_TYPE_70M; break;
- default: type = LLM_TYPE_UNKNOWN;
- } break;
- case 12:
- switch (hparams.n_ff()) {
- case 3072: type = LLM_TYPE_160M; break;
- default: type = LLM_TYPE_UNKNOWN;
- } break;
- case 16:
- switch (hparams.n_ff()) {
- case 8192: type = LLM_TYPE_1B; break;
- default: type = LLM_TYPE_UNKNOWN;
- } break;
- case 24:
- switch (hparams.n_ff()) {
- case 4096: type = LLM_TYPE_410M; break;
- case 8192: type = LLM_TYPE_1_4B; break;
- default: type = LLM_TYPE_UNKNOWN;
- } break;
- case 32:
- switch (hparams.n_ff()) {
- case 10240: type = LLM_TYPE_2_8B; break;
- case 16384: type = LLM_TYPE_6_9B; break;
- default: type = LLM_TYPE_UNKNOWN;
- } break;
- case 36:
- switch (hparams.n_ff()) {
- case 20480: type = LLM_TYPE_12B; break;
- default: type = LLM_TYPE_UNKNOWN;
- } break;
- case 44:
- switch (hparams.n_ff()) {
- case 24576: type = LLM_TYPE_20B; break;
- default: type = LLM_TYPE_UNKNOWN;
- } break;
- default: type = LLM_TYPE_UNKNOWN;
- }
- } break;
- case LLM_ARCH_ARCTIC:
- {
- ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
- if (hparams.n_expert == 128) {
- switch (hparams.n_layer) {
- case 35: type = LLM_TYPE_10B_128x3_66B; break;
- default: type = LLM_TYPE_UNKNOWN;
- }
- } else {
- type = LLM_TYPE_UNKNOWN;
- }
- } break;
- case LLM_ARCH_DEEPSEEK:
- {
- ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
- ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
- ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
- ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
- ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
- switch (hparams.n_layer) {
- case 28: type = LLM_TYPE_20B; break;
- default: type = LLM_TYPE_UNKNOWN;
- }
- } break;
- case LLM_ARCH_DEEPSEEK2:
- {
- bool is_lite = (hparams.n_layer == 27);
- ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
- ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
- if (!is_lite) {
- ml.get_key(LLM_KV_ATTENTION_Q_LORA_RANK, hparams.n_lora_q);
- }
- ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK, hparams.n_lora_kv);
- ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH_MLA, hparams.n_embd_head_k_mla, false);
- ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH_MLA, hparams.n_embd_head_v_mla, false);
- ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
- ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
- ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
- ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
- ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false);
- if (hparams.expert_gating_func == LLAMA_EXPERT_GATING_FUNC_TYPE_NONE) {
- // for compatibility with existing DeepSeek V2 and V2.5 GGUFs
- // that have no expert_gating_func model parameter set
- hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX;
- }
- ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul);
- switch (hparams.n_layer) {
- case 27: type = LLM_TYPE_16B; break;
- case 60: type = LLM_TYPE_236B; break;
- case 61: type = LLM_TYPE_671B; break;
- default: type = LLM_TYPE_UNKNOWN;
- }
- } break;
- case LLM_ARCH_PLM:
- {
- ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
- ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK, hparams.n_lora_kv);
- switch (hparams.n_layer) {
- case 32: type = LLM_TYPE_1_8B; break;
- default: type = LLM_TYPE_UNKNOWN;
- }
- } break;
- case LLM_ARCH_CHATGLM:
- {
- ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
- switch (hparams.n_layer) {
- case 28: {
- if (hparams.n_head(0) == 16) {
- type = LLM_TYPE_1_5B;
- } else {
- type = LLM_TYPE_6B;
- }
- } break;
- case 40: {
- if (hparams.n_head(0) == 24) {
- type = LLM_TYPE_4B;
- } else {
- type = LLM_TYPE_9B;
- }
- } break;
- default: type = LLM_TYPE_UNKNOWN;
- }
- } break;
- case LLM_ARCH_GLM4:
- {
- ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
- switch (hparams.n_layer) {
- case 40: type = LLM_TYPE_9B; break;
- case 61: type = LLM_TYPE_32B; break;
- default: type = LLM_TYPE_UNKNOWN;
- }
- } break;
- case LLM_ARCH_BITNET:
- {
- ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
- switch (hparams.n_layer) {
- case 26: type = LLM_TYPE_3B; break;
- default: type = LLM_TYPE_UNKNOWN;
- }
- } break;
- case LLM_ARCH_T5:
- {
- ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
- ml.get_key(LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, hparams.n_rel_attn_bkts);
- uint32_t dec_start_token_id;
- if (ml.get_key(LLM_KV_DECODER_START_TOKEN_ID, dec_start_token_id, false)) {
- hparams.dec_start_token_id = dec_start_token_id;
- }
- switch (hparams.n_layer) {
- case 6: type = LLM_TYPE_60M; break; // t5-small
- case 8: type = LLM_TYPE_80M; break; // flan-t5-small
- case 12:
- switch (hparams.n_ff()) {
- case 3072: type = LLM_TYPE_220M; break; // t5-base
- case 2048: type = LLM_TYPE_250M; break; // flan-t5-base
- default: type = LLM_TYPE_UNKNOWN;
- } break;
- case 24:
- switch (hparams.n_ff()) {
- case 4096: type = LLM_TYPE_770M; break; // t5-large
- case 2816: type = LLM_TYPE_780M; break; // flan-t5-large
- case 16384: type = LLM_TYPE_3B; break; // t5-3b
- case 5120: type = LLM_TYPE_3B; break; // flan-t5-xl
- case 65536: type = LLM_TYPE_11B; break; // t5-11b
- case 10240: type = LLM_TYPE_11B; break; // flan-t5-xxl
- default: type = LLM_TYPE_UNKNOWN;
- } break;
- default: type = LLM_TYPE_UNKNOWN;
- }
- } break;
- case LLM_ARCH_T5ENCODER:
- {
- ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
- ml.get_key(LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, hparams.n_rel_attn_bkts);
- type = LLM_TYPE_UNKNOWN;
- } break;
- case LLM_ARCH_JAIS:
- {
- ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
- ml.get_key(LLM_KV_ATTENTION_MAX_ALIBI_BIAS, hparams.f_max_alibi_bias);
- switch (hparams.n_layer) {
- case 24: type = LLM_TYPE_1_3B; break;
- case 40: type = LLM_TYPE_13B; break;
- /* TODO: add variants */
- default: type = LLM_TYPE_UNKNOWN;
- }
- } break;
- case LLM_ARCH_NEMOTRON:
- {
- ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
- switch (hparams.n_layer) {
- case 32: type = LLM_TYPE_4B; break;
- default: type = LLM_TYPE_UNKNOWN;
- }
- } break;
- case LLM_ARCH_EXAONE:
- {
- ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
- switch (hparams.n_layer) {
- case 32: type = LLM_TYPE_8B; break;
- default: type = LLM_TYPE_UNKNOWN;
- }
- } break;
- case LLM_ARCH_RWKV6:
- case LLM_ARCH_RWKV6QWEN2:
- {
- ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps, false);
- ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps, false);
- ml.get_key(LLM_KV_WKV_HEAD_SIZE, hparams.wkv_head_size);
- ml.get_key(LLM_KV_TIME_MIX_EXTRA_DIM, hparams.time_mix_extra_dim);
- ml.get_key(LLM_KV_TIME_DECAY_EXTRA_DIM, hparams.time_decay_extra_dim);
- ml.get_key(LLM_KV_RESCALE_EVERY_N_LAYERS, hparams.rescale_every_n_layers, false);
- ml.get_key(LLM_KV_TOKEN_SHIFT_COUNT, hparams.token_shift_count, false);
- switch (hparams.n_layer) {
- case 24: type = LLM_TYPE_1_6B; break;
- case 32:
- switch (hparams.n_embd) {
- case 2560: type = LLM_TYPE_3B; break;
- case 4096: type = LLM_TYPE_7B; break;
- default: type = LLM_TYPE_UNKNOWN;
- } break;
- case 61: type = LLM_TYPE_14B; break;
- case 64: type = LLM_TYPE_32B; break;
- default: type = LLM_TYPE_UNKNOWN;
- }
- } break;
- case LLM_ARCH_RWKV7:
- case LLM_ARCH_ARWKV7:
- {
- ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps, false);
- ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps, false);
- ml.get_key(LLM_KV_WKV_HEAD_SIZE, hparams.wkv_head_size);
- ml.get_key(LLM_KV_ATTENTION_DECAY_LORA_RANK, hparams.n_lora_decay);
- ml.get_key(LLM_KV_ATTENTION_ICLR_LORA_RANK, hparams.n_lora_iclr);
- ml.get_key(LLM_KV_ATTENTION_VALUE_RESIDUAL_MIX_LORA_RANK, hparams.n_lora_value_res_mix);
- ml.get_key(LLM_KV_ATTENTION_GATE_LORA_RANK, hparams.n_lora_gate, false);
- ml.get_key(LLM_KV_TOKEN_SHIFT_COUNT, hparams.token_shift_count, false);
- switch (hparams.n_layer) {
- case 12: type = LLM_TYPE_190M; break;
- case 24:
- switch (hparams.n_embd) {
- case 1024: type = LLM_TYPE_450M; break;
- case 2048: type = LLM_TYPE_1_5B; break;
- default: type = LLM_TYPE_UNKNOWN;
- } break;
- case 28:
- switch (hparams.n_embd) {
- case 1536: type = LLM_TYPE_1_5B; break;
- case 3584: type = LLM_TYPE_7B; break;
- default: type = LLM_TYPE_UNKNOWN;
- } break;
- case 32: type = LLM_TYPE_2_9B; break; // RWKV-7-World
- default: type = LLM_TYPE_UNKNOWN;
- }
- } break;
- case LLM_ARCH_GRANITE:
- case LLM_ARCH_GRANITE_MOE:
- {
- ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
- ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
- ml.get_key(LLM_KV_RESIDUAL_SCALE, hparams.f_residual_scale);
- ml.get_key(LLM_KV_EMBEDDING_SCALE, hparams.f_embedding_scale);
- ml.get_key(LLM_KV_ATTENTION_SCALE, hparams.f_attention_scale);
- switch (hparams.n_layer) {
- case 32: type = LLM_TYPE_3B; break;
- case 40: type = LLM_TYPE_3B; break;
- // Add additional layer/vocab/etc checks here for other model sizes
- default: type = LLM_TYPE_UNKNOWN;
- }
- } break;
- case LLM_ARCH_CHAMELEON:
- {
- ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
- hparams.f_norm_eps = 1e-5; // eps for qk-norm, torch default
- ml.get_key(LLM_KV_SWIN_NORM, hparams.swin_norm);
- switch (hparams.n_layer) {
- case 32: type = LLM_TYPE_7B; break;
- case 48: type = LLM_TYPE_34B; break;
- default: type = LLM_TYPE_UNKNOWN;
- }
- } break;
- case LLM_ARCH_WAVTOKENIZER_DEC:
- {
- ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
- ml.get_key(LLM_KV_ATTENTION_GROUPNORM_EPS, hparams.f_norm_group_eps);
- ml.get_key(LLM_KV_ATTENTION_GROUPNORM_GROUPS, hparams.n_norm_groups);
- ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
- } break;
- case LLM_ARCH_BAILINGMOE:
- {
- ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
- ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
- ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
- ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
- ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
- ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
- switch (hparams.n_layer) {
- case 28: type = LLM_TYPE_16B; break;
- case 88: type = LLM_TYPE_290B; break;
- default: type = LLM_TYPE_UNKNOWN;
- }
- } break;
- default: throw std::runtime_error("unsupported model architecture");
- }
- pimpl->n_bytes = ml.n_bytes;
- pimpl->desc_str = arch_name() + " " + type_name() + " " + ml.ftype_name();
- if (hparams.f_max_alibi_bias > 0.0f) {
- hparams.use_alibi = true;
- }
- hparams.rope_type = llama_model_rope_type(this);
- }
- // Loads the vocabulary through the supplied model loader.
- // An LLM_KV helper is constructed from this model's `arch` and passed,
- // together with `ml`, to `vocab.load`; the helper is only needed for the
- // duration of that call, so it is created in place as a temporary.
- void llama_model::load_vocab(llama_model_loader & ml) {
- vocab.load(ml, LLM_KV(arch));
- }
- bool llama_model::load_tensors(llama_model_loader & ml) {
- const auto & split_mode = params.split_mode;
- const auto & n_gpu_layers = params.n_gpu_layers;
- const auto & use_mlock = params.use_mlock;
- const auto & tensor_split = params.tensor_split;
- const int n_layer = hparams.n_layer;
- const bool use_mmap_buffer = true;
- LLAMA_LOG_INFO("%s: loading model tensors, this can take a while... (mmap = %s)\n", __func__, ml.use_mmap ? "true" : "false");
- // build a list of buffer types for the CPU and GPU devices
- pimpl->cpu_buft_list = make_cpu_buft_list(devices);
- for (auto * dev : devices) {
- buft_list_t buft_list = make_gpu_buft_list(dev, split_mode, tensor_split);
- // add CPU buffer types as a fallback
- buft_list.insert(buft_list.end(), pimpl->cpu_buft_list.begin(), pimpl->cpu_buft_list.end());
- pimpl->gpu_buft_list.emplace(dev, std::move(buft_list));
- }
- // calculate the split points
- bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + n_devices(), [](float x) { return x == 0.0f; });
- std::vector<float> splits(n_devices());
- if (all_zero) {
- // default split, by free memory
- for (size_t i = 0; i < n_devices(); ++i) {
- ggml_backend_dev_t dev = devices[i];
- size_t total;
- size_t free;
- ggml_backend_dev_memory(dev, &free, &total);
- splits[i] = free;
- }
- } else {
- std::copy(tensor_split, tensor_split + n_devices(), splits.begin());
- }
- // sum and normalize the splits to get the split points
- float split_sum = 0.0f;
- for (size_t i = 0; i < n_devices(); ++i) {
- split_sum += splits[i];
- splits[i] = split_sum;
- }
- for (size_t i = 0; i < n_devices(); ++i) {
- splits[i] /= split_sum;
- }
- ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
- const int i_gpu_start = std::max((int) hparams.n_layer - n_gpu_layers, (int) 0);
- const int act_gpu_layers = devices.empty() ? 0 : std::min(n_gpu_layers, (int)n_layer + 1);
- auto get_layer_buft_list = [&](int il) -> llama_model::impl::layer_dev {
- const bool is_swa = il < (int) hparams.n_layer && hparams.is_swa(il);
- if (il < i_gpu_start || (il - i_gpu_start) >= act_gpu_layers) {
- LLAMA_LOG_DEBUG("load_tensors: layer %3d assigned to device %s, is_swa = %d\n", il, ggml_backend_dev_name(cpu_dev), is_swa);
- return {cpu_dev, &pimpl->cpu_buft_list};
- }
- const int layer_gpu = std::upper_bound(splits.begin(), splits.begin() + n_devices(), float(il - i_gpu_start)/act_gpu_layers) - splits.begin();
- auto * dev = devices.at(layer_gpu);
- LLAMA_LOG_DEBUG("load_tensors: layer %3d assigned to device %s, is_swa = %d\n", il, ggml_backend_dev_name(dev), is_swa);
- return {dev, &pimpl->gpu_buft_list.at(dev)};
- };
- // assign the input layer
- // there is very little benefit to offloading the input layer, so always keep it on the CPU
- pimpl->dev_input = { cpu_dev, &pimpl->cpu_buft_list };
- // assign the repeating layers to the devices according to the splits
- pimpl->dev_layer.resize(n_layer);
- for (int il = 0; il < n_layer; ++il) {
- pimpl->dev_layer[il] = get_layer_buft_list(il);
- }
- // assign the output layer
- pimpl->dev_output = get_layer_buft_list(n_layer);
- // one ggml context per buffer type
- int max_n_tensors = ml.n_tensors;
- max_n_tensors += 1; // duplicated output tensor
- max_n_tensors += n_layer*2; // duplicated rope freq tensors
- const size_t ctx_size = ggml_tensor_overhead()*max_n_tensors;
- std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
- auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * {
- auto it = ctx_map.find(buft);
- if (it == ctx_map.end()) {
- ggml_init_params params = {
- /*.mem_size =*/ ctx_size,
- /*.mem_buffer =*/ NULL,
- /*.no_alloc =*/ true,
- };
- ggml_context * ctx = ggml_init(params);
- if (!ctx) {
- throw std::runtime_error(format("failed to create ggml context"));
- }
- ctx_map[buft] = ctx;
- pimpl->ctxs.emplace_back(ctx);
- return ctx;
- }
- return it->second;
- };
- const auto TENSOR_DUPLICATED = llama_model_loader::TENSOR_DUPLICATED;
- const auto TENSOR_NOT_REQUIRED = llama_model_loader::TENSOR_NOT_REQUIRED;
- // create tensors for the weights
- {
- // note: cast to int64_t since we will use these for the tensor dimensions
- const int64_t n_head = hparams.n_head();
- const int64_t n_head_kv = hparams.n_head_kv();
- const int64_t n_embd = hparams.n_embd;
- const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
- const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();
- const int64_t n_embd_head_k = hparams.n_embd_head_k;
- const int64_t n_embd_head_v = hparams.n_embd_head_v;
- const int64_t n_ff = hparams.n_ff();
- const int64_t n_embd_gqa = n_embd_v_gqa;
- const int64_t n_vocab = vocab.n_tokens();
- const int64_t n_token_types = vocab.n_token_types();
- const int64_t n_rot = hparams.n_rot;
- const int64_t n_expert = hparams.n_expert;
- const int64_t n_expert_used = hparams.n_expert_used;
- const int64_t n_ctx_train = hparams.n_ctx_train;
- if (n_expert > 0 && hparams.n_expert_used == 0) {
- throw std::runtime_error("model has expert layers but no expert layers are used");
- }
- int n_moved_tensors = 0;
- ggml_tensor * first_moved_tensor = nullptr;
- ggml_backend_buffer_type_t first_moved_from_buft = nullptr;
- ggml_backend_buffer_type_t first_moved_to_buft = nullptr;
- auto create_tensor = [&](const LLM_TN_IMPL & tn, const std::initializer_list<int64_t> & ne, int flags) -> ggml_tensor * {
- ggml_tensor * t_meta = ml.get_tensor_meta(tn.str().c_str());
- if (!t_meta) {
- if (flags & TENSOR_NOT_REQUIRED) {
- return nullptr;
- }
- throw std::runtime_error(format("missing tensor '%s'", tn.str().c_str()));
- }
- // some models use the token embedding tensor as the output, but since these are used in different layers and with different ops
- // the tensor is duplicated
- // to handle this, we check if the tensor is duplicated, and if so, we assume that it is being loaded as the output tensor
- llm_tensor tn_tensor = tn.tensor;
- if (tn.tensor == LLM_TENSOR_TOKEN_EMBD && flags & TENSOR_DUPLICATED) {
- tn_tensor = LLM_TENSOR_OUTPUT;
- }
- llm_tensor_info info;
- try {
- info = llm_tensor_info_for(tn_tensor);
- } catch (const std::out_of_range & e) {
- throw std::runtime_error(format("missing tensor info mapping for %s", tn.str().c_str()));
- }
- // skip unused tensors
- if (info.op == GGML_OP_NONE) {
- const size_t nbytes = ggml_nbytes(t_meta);
- LLAMA_LOG_WARN("model has unused tensor %s (size = %zu bytes) -- ignoring\n", tn.str().c_str(), nbytes);
- ml.size_data -= nbytes;
- ml.n_created++;
- return nullptr;
- }
- // tensors with "bias" suffix are always used with GGML_OP_ADD
- ggml_op op;
- bool bias = tn.suffix != nullptr && strcmp(tn.suffix, "bias") == 0;
- if (bias) {
- op = GGML_OP_ADD;
- } else {
- op = info.op;
- }
- // sanity checks
- if (info.layer == LLM_TENSOR_LAYER_INPUT || info.layer == LLM_TENSOR_LAYER_OUTPUT) {
- if (tn.bid != -1) {
- GGML_ABORT("input/output layer tensor %s used with a layer number", tn.str().c_str());
- }
- } else {
- if (tn.bid == -1) {
- GGML_ABORT("repeating layer tensor %s used without a layer number", tn.str().c_str());
- }
- }
- // select the buffer type for this tensor
- buft_list_t * buft_list;
- switch (info.layer) {
- case LLM_TENSOR_LAYER_INPUT:
- buft_list = pimpl->dev_input.buft_list;
- break;
- case LLM_TENSOR_LAYER_OUTPUT:
- buft_list = pimpl->dev_output.buft_list;
- break;
- case LLM_TENSOR_LAYER_REPEATING:
- buft_list = pimpl->dev_layer.at(tn.bid).buft_list;
- break;
- default:
- GGML_ABORT("invalid layer %d for tensor %s", info.layer, tn.str().c_str());
- }
- ggml_backend_buffer_type_t buft = nullptr;
- // check overrides
- if (ml.tensor_buft_overrides) {
- std::string tensor_name = tn.str();
- for (const auto * overrides = ml.tensor_buft_overrides; overrides->pattern != nullptr; ++overrides) {
- std::regex pattern(overrides->pattern);
- if (std::regex_search(tensor_name, pattern)) {
- LLAMA_LOG_DEBUG("tensor %s buffer type overriden to %s\n", tensor_name.c_str(), ggml_backend_buft_name(overrides->buft));
- buft = overrides->buft;
- break;
- }
- }
- }
- if (!buft) {
- buft = select_weight_buft(hparams, t_meta, op, *buft_list);
- if (!buft) {
- throw std::runtime_error(format("failed to find a compatible buffer type for tensor %s", tn.str().c_str()));
- }
- }
- // avoid using a host buffer when using mmap
- auto * buft_dev = ggml_backend_buft_get_device(buft);
- if (ml.use_mmap && buft_dev && buft == ggml_backend_dev_host_buffer_type(buft_dev)) {
- auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
- buft = ggml_backend_dev_buffer_type(cpu_dev);
- }
- if (buft != buft_list->front().second) {
- n_moved_tensors++;
- if (!first_moved_tensor) {
- first_moved_tensor = t_meta;
- first_moved_from_buft = buft_list->front().second;
- first_moved_to_buft = buft;
- }
- }
- ggml_context * ctx = ctx_for_buft(buft);
- // if duplicated, check if the original tensor was allocated in the same buffer type context and avoid creating a new one
- if (flags & TENSOR_DUPLICATED) {
- ggml_tensor * t = ggml_get_tensor(ctx, tn.str().c_str());
- if (t) {
- return t;
- }
- }
- return ml.create_tensor(ctx, tn, ne, flags);
- };
- layers.resize(n_layer);
- // TODO: move to a separate function
- const auto tn = LLM_TN(arch);
- switch (arch) {
- case LLM_ARCH_LLAMA:
- case LLM_ARCH_REFACT:
- case LLM_ARCH_MINICPM:
- case LLM_ARCH_GRANITE:
- case LLM_ARCH_GRANITE_MOE:
- {
- tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
- // output
- output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
- output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
- // if output is NULL, init from the input tok embed
- if (output == NULL) {
- output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
- }
- for (int i = 0; i < n_layer; ++i) {
- auto & layer = layers[i];
- layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
- layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
- layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
- layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
- layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
- // optional bias tensors
- layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
- layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
- layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
- layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
- layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
- if (hparams.rope_scaling_type_train == LLAMA_ROPE_SCALING_TYPE_LONGROPE) {
- layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
- layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
- }
- else {
- layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
- }
- if (n_expert == 0) {
- layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
- layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
- layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
- // optional MLP bias
- layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
- layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
- layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
- } else {
- layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
- layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, TENSOR_NOT_REQUIRED);
- layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert}, 0);
- layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
- }
- }
- } break;
- case LLM_ARCH_LLAMA4:
- {
- tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
- // output
- output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
- output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
- // if output is NULL, init from the input tok embed
- if (output == NULL) {
- output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
- }
- GGML_ASSERT(hparams.n_moe_layer_step > 0 && "Llama 4 requires n_moe_layer_step > 0");
- for (int i = 0; i < n_layer; ++i) {
- bool is_moe_layer = (i + 1) % hparams.n_moe_layer_step == 0;
- auto & layer = layers[i];
- layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
- layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
- layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
- layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
- layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
- layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
- layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
- if (is_moe_layer) {
- int n_ff_exp = hparams.n_ff_exp;
- layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
- layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, 0);
- layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff_exp, n_embd, n_expert}, 0);
- layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, 0);
- // Shared expert
- const int64_t n_ff_shexp = n_ff_exp;
- layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), { n_embd, n_ff_shexp}, 0);
- layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp, n_embd }, 0);
- layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), { n_embd, n_ff_shexp}, 0);
- } else {
- layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
- layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
- layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
- }
- }
- } break;
- case LLM_ARCH_DECI:
- {
- tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
- // output
- output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
- output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
- // if output is NULL, init from the input tok embed
- if (output == NULL) {
- output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
- }
- for (int i = 0; i < n_layer; ++i) {
- auto & layer = layers[i];
- const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(i);
- const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(i);
- const int64_t n_embd_gqa = hparams.n_embd_v_gqa(i);
- const int64_t n_ff = hparams.n_ff(i);
- const int64_t n_head = hparams.n_head(i);
- const int64_t n_head_kv = hparams.n_head_kv(i);
- if (n_head_kv == 0 && n_head > 0) {
- // linear attention for DeciLMCausalModel
- layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
- layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
- }
- else if (n_head_kv > 0) {
- layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
- layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
- layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
- layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
- layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
- }
- // optional bias tensors
- layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
- layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
- layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
- layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
- if (n_ff > 0) {
- layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
- }
- if (hparams.rope_scaling_type_train == LLAMA_ROPE_SCALING_TYPE_LONGROPE) {
- layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
- layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
- }
- else {
- layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
- }
- if (n_ff > 0) {
- layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
- layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
- layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
- }
- // optional MLP bias
- layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
- layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
- layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
- }
- } break;
- case LLM_ARCH_MINICPM3:
- {
- const int64_t n_embd_head_qk_rope = hparams.n_rot;
- const int64_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
- const int64_t q_lora_rank = hparams.n_lora_q;
- const int64_t kv_lora_rank = hparams.n_lora_kv;
- tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
- // output
- output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
- output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
- // if output is NULL, init from the input tok embed
- if (output == NULL) {
- output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
- }
- for (int i = 0; i < n_layer; ++i) {
- auto & layer = layers[i];
- layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
- layer.attn_q_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_A_NORM, "weight", i), {q_lora_rank}, 0);
- layer.attn_kv_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_NORM, "weight", i), {kv_lora_rank}, 0);
- layer.wq_a = create_tensor(tn(LLM_TENSOR_ATTN_Q_A, "weight", i), {n_embd, q_lora_rank}, 0);
- layer.wq_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_B, "weight", i), {q_lora_rank, n_head * n_embd_head_k}, 0);
- layer.wkv_a_mqa = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + (n_embd_head_qk_rope)}, 0);
- layer.wkv_b = create_tensor(tn(LLM_TENSOR_ATTN_KV_B, "weight", i), {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)}, 0);
- layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_head * ( n_embd_head_v), n_embd}, 0);
- layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
- layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
- layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
- layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
- layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), { n_embd_head_qk_rope/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
- layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), { n_embd_head_qk_rope/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
- }
- } break;
- case LLM_ARCH_GROK:
- {
- if (n_expert == 0) {
- throw std::runtime_error("Grok model cannot have zero experts");
- }
- tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
- // output
- output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
- output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
- // if output is NULL, init from the input tok embed
- if (output == NULL) {
- output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
- }
- for (int i = 0; i < n_layer; ++i) {
- auto & layer = layers[i];
- layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
- layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
- layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
- layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
- layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
- layer.attn_out_norm = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}, 0);
- layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
- layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
- layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, TENSOR_NOT_REQUIRED);
- layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert}, 0);
- layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
- layer.layer_out_norm = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd}, 0);
- }
- } break;
- case LLM_ARCH_DBRX:
- {
- if (n_expert == 0) {
- throw std::runtime_error("DBRX model cannot have zero experts");
- }
- tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
- // output
- output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
- output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
- for (int i = 0; i < n_layer; ++i) {
- auto & layer = layers[i];
- layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
- layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
- layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
- layer.attn_out_norm = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}, 0);
- layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
- layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
- layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff, n_embd, n_expert}, 0);
- layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
- }
- } break;
- case LLM_ARCH_BAICHUAN:
- {
- tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
- {
- output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
- output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
- }
- for (int i = 0; i < n_layer; ++i) {
- auto & layer = layers[i];
- layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
- layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
- layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
- layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
- layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
- layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
- layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
- layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
- layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
- }
- } break;
- case LLM_ARCH_FALCON:
- {
- tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
- // output
- {
- output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
- output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
- output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
- if (!output) {
- output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); // needs to be on GPU
- }
- }
- for (int i = 0; i < n_layer; ++i) {
- auto & layer = layers[i];
- layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
- layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
- layer.attn_norm_2 = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
- layer.attn_norm_2_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
- layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
- layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
- layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
- layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
- }
- } break;
- case LLM_ARCH_STARCODER:
- {
- tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
- pos_embd = create_tensor(tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, n_ctx_train}, 0);
- // output
- {
- output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
- output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
- output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
- if (!output) {
- // needs to be on GPU
- output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
- }
- }
- for (int i = 0; i < n_layer; ++i) {
- auto & layer = layers[i];
- layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
- layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
- layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
- layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
- layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
- layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
- layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
- layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
- layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
- layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
- layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
- layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
- }
- } break;
- case LLM_ARCH_BERT:
- case LLM_ARCH_NOMIC_BERT:
- case LLM_ARCH_NOMIC_BERT_MOE:
- {
- tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
- type_embd = create_tensor(tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_token_types}, 0);
- if (arch == LLM_ARCH_BERT) {
- pos_embd = create_tensor(tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, n_ctx_train}, 0);
- cls = create_tensor(tn(LLM_TENSOR_CLS, "weight"), {n_embd, n_embd}, TENSOR_NOT_REQUIRED);
- cls_b = create_tensor(tn(LLM_TENSOR_CLS, "bias"), {n_embd}, TENSOR_NOT_REQUIRED);
- cls_out = create_tensor(tn(LLM_TENSOR_CLS_OUT, "weight"), {n_embd, 1}, TENSOR_NOT_REQUIRED);
- cls_out_b = create_tensor(tn(LLM_TENSOR_CLS_OUT, "bias"), {1}, TENSOR_NOT_REQUIRED);
- }
- tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0);
- tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}, 0);
- for (int i = 0; i < n_layer; ++i) {
- auto & layer = layers[i];
- if (arch == LLM_ARCH_BERT) {
- layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
- layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, 0);
- layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
- layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, 0);
- layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
- layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, 0);
- } else {
- layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
- }
- if (arch == LLM_ARCH_NOMIC_BERT_MOE) {
- layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
- }
- layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
- layer.attn_out_norm = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}, 0);
- layer.attn_out_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "bias", i), {n_embd}, 0);
- if (hparams.moe_every_n_layers > 0 && i % hparams.moe_every_n_layers == 1) {
- layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
- layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff, n_expert}, 0);
- layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert}, 0);
- layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
- } else {
- layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
- layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
- if (arch == LLM_ARCH_BERT || arch == LLM_ARCH_NOMIC_BERT_MOE) {
- layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
- layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
- layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
- } else {
- layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
- }
- }
- layer.layer_out_norm = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd}, 0);
- layer.layer_out_norm_b = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "bias", i), {n_embd}, 0);
- }
- } break;
- case LLM_ARCH_JINA_BERT_V2:
- {
- tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); // word_embeddings
- type_embd = create_tensor(tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_token_types}, 0); // token_type_embeddings
- tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0); // LayerNorm
- tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}, 0); //LayerNorm bias
- cls = create_tensor(tn(LLM_TENSOR_CLS, "weight"), {n_embd, 1}, TENSOR_NOT_REQUIRED);
- cls_b = create_tensor(tn(LLM_TENSOR_CLS, "bias"), {1}, TENSOR_NOT_REQUIRED);
- for (int i = 0; i < n_layer; ++i) {
- auto & layer = layers[i]; // JinaBertLayer
- layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
- layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, 0);
- layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
- layer.attn_q_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
- layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
- layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, 0);
- layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
- layer.attn_k_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
- layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
- layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, 0);
- layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0); //output_dens
- layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0); //output_dens
- layer.attn_out_norm = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}, 0); //output_norm
- layer.attn_out_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "bias", i), {n_embd}, 0);
- layer.attn_norm_2 = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
- layer.attn_norm_2_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
- layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
- layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
- layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
- layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
- layer.layer_out_norm = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd}, 0);
- layer.layer_out_norm_b = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "bias", i), {n_embd}, 0);
- }
- } break;
- case LLM_ARCH_BLOOM:
- {
- tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
- tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0);
- tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}, 0);
- // output
- output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
- output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
- output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
- // if output is NULL, init from the input tok embed
- if (output == NULL) {
- output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
- }
- for (int i = 0; i < n_layer; ++i) {
- auto & layer = layers[i];
- layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
- layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
- layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
- layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
- layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
- layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
- layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
- layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
- layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
- layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
- layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
- layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
- }
- } break;
- case LLM_ARCH_MPT:
- {
- tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
- pos_embd = create_tensor(tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, n_ctx_train}, TENSOR_NOT_REQUIRED);
- // output
- output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
- output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, TENSOR_NOT_REQUIRED);
- output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
- if (!output) {
- output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); // needs to be on GPU
- }
- for (int i = 0; i < n_layer; ++i) {
- auto & layer = layers[i];
- layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
- layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
- layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
- layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
- layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
- layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
- layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
- layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
- layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
- layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
- layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
- layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
- layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
- layer.attn_q_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
- layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
- layer.attn_k_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
- // AWQ ScaleActivation layer
- layer.ffn_act = create_tensor(tn(LLM_TENSOR_FFN_ACT, "scales", i), {n_ff}, TENSOR_NOT_REQUIRED);
- }
- } break;
- case LLM_ARCH_STABLELM:
- {
- tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
- // output
- output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
- output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
- output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
- for (int i = 0; i < n_layer; ++i) {
- auto & layer = layers[i];
- layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
- layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
- layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
- layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
- layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
- layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
- // optional bias tensors, present in Stable LM 2 1.6B
- layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
- layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
- layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
- // optional q and k layernorms, present in StableLM 2 12B
- layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k, n_head}, TENSOR_NOT_REQUIRED);
- layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k, n_head_kv}, TENSOR_NOT_REQUIRED);
- // optional FFN norm, not present in StableLM 2 12B which uses parallel residual
- layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
- layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
- layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
- layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
- layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
- }
- } break;
- case LLM_ARCH_QWEN:
- {
- tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
- // output
- output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
- output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
- for (int i = 0; i < n_layer; ++i) {
- auto & layer = layers[i];
- layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
- layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd*3}, 0);
- layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd*3}, 0);
- layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
- layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
- layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff/2}, 0);
- layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff/2, n_embd}, 0);
- layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff/2}, 0);
- }
- } break;
- case LLM_ARCH_QWEN2:
- case LLM_ARCH_QWEN2VL:
- {
- tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
- // output
- output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
- output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
- // if output is NULL, init from the input tok embed
- if (output == NULL) {
- output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
- }
- for (int i = 0; i < n_layer; ++i) {
- auto & layer = layers[i];
- layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
- layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
- layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
- layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
- layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
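- // bias tensors: created with flags 0 (not TENSOR_NOT_REQUIRED), so they are not marked optional here -- NOTE(review): confirm intent; the QWEN2MOE case marks the same biases TENSOR_NOT_REQUIRED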
- layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, 0);
- layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, 0);
- layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, 0);
- layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
- layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
- layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
- layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
- }
- } break;
- case LLM_ARCH_QWEN2MOE:
- {
- tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
- // output
- output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
- output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
- for (int i = 0; i < n_layer; ++i) {
- auto & layer = layers[i];
- layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
- layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
- layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
- layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
- layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
- // optional bias tensors
- layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
- layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
- layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
- layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
- layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
- if (n_expert == 0) {
- throw std::runtime_error("n_expert must be > 0 for QWEN2MOE");
- }
- if (n_expert_used == 0) {
- throw std::runtime_error("n_expert_used must be > 0 for QWEN2MOE");
- }
- // MoE branch
- const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
- layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
- layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
- layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
- // Shared expert branch
- const int64_t n_ff_shexp = hparams.n_ff_shexp ? hparams.n_ff_shexp : n_ff;
- layer.ffn_gate_inp_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP_SHEXP, "weight", i), {n_embd}, 0);
- layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), { n_embd, n_ff_shexp}, 0);
- layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp, n_embd}, 0);
- layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), { n_embd, n_ff_shexp}, 0);
- }
- } break;
- case LLM_ARCH_QWEN3:
- {
- tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
- // output
- output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
- output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
- // if output is NULL, init from the input tok embed
- if (output == NULL) {
- output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
- }
- for (int i = 0; i < n_layer; ++i) {
- auto & layer = layers[i];
- layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
- layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
- layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
- layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
- layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
- layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
- layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
- layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
- layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
- layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
- layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
- }
- } break;
- case LLM_ARCH_QWEN3MOE:
- {
- tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
- // output
- output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
- output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
- for (int i = 0; i < n_layer; ++i) {
- auto & layer = layers[i];
- layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
- layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
- layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
- layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
- layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
- layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
- layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
- layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
- layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
- if (n_expert == 0) {
- throw std::runtime_error("n_expert must be > 0 for QWEN3MOE");
- }
- if (n_expert_used == 0) {
- throw std::runtime_error("n_expert_used must be > 0 for QWEN3MOE");
- }
- // MoE branch
- const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
- layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
- layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
- layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
- }
- } break;
- case LLM_ARCH_PHI2:
- {
- tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
- // output
- output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
- output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
- output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
- output_b = create_tensor(tn(LLM_TENSOR_OUTPUT, "bias"), {n_vocab}, 0);
- for (int i = 0; i < n_layer; ++i) {
- auto & layer = layers[i];
- layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
- layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
- layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
- layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
- if (layer.wqkv == nullptr) {
- layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
- layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, 0);
- layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
- layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, 0);
- layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
- layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, 0);
- }
- layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
- layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
- layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
- layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
- layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
- layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
- }
- } break;
- case LLM_ARCH_PHI3:
- {
- tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
- // output
- output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
- output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
- // if output is NULL, init from the input tok embed
- if (output == NULL) {
- output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
- }
- for (int i = 0; i < n_layer; ++i) {
- auto & layer = layers[i];
- layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);
- layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), { n_embd, n_embd + 2 * n_embd_gqa }, TENSOR_NOT_REQUIRED);
- layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd, n_embd }, 0);
- layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd }, 0);
- layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd }, 0);
- layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, 2 * n_ff }, 0);
- layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), { n_rot/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
- layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), { n_rot/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
- }
- } break;
- case LLM_ARCH_PHIMOE:
- {
- const int64_t n_embd_head = n_embd / n_head;
- tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
- // output
- output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
- output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
- output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab }, 0);
- output_b = create_tensor(tn(LLM_TENSOR_OUTPUT, "bias"), { n_vocab }, 0);
- for (int i = 0; i < n_layer; ++i) {
- auto & layer = layers[i];
- layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);
- layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), { n_embd }, 0);
- layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), { n_embd, n_embd + 2 * n_embd_gqa }, TENSOR_NOT_REQUIRED);
- if (layer.wqkv == nullptr) {
- layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
- layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, 0);
- layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
- layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, 0);
- layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
- layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, 0);
- }
- layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd, n_embd }, 0);
- layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), { n_embd }, 0);
- layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd }, 0);
- layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), { n_embd }, 0);
- layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
- layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
- layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff, n_embd, n_expert}, 0);
- layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
- layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), { n_embd_head/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
- layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), { n_embd_head/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
- }
- } break;
- case LLM_ARCH_PLAMO:
- {
- tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
- // output
- output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
- output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
- for (int i = 0; i < n_layer; ++i) {
- auto & layer = layers[i];
- layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
- layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
- layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
- layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
- layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
- layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
- layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
- layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
- }
- } break;
- case LLM_ARCH_GPT2: // creates the GPT-2 weights: token + position embeddings, output head, and per-layer fused-QKV attention and FFN tensors with biases
- {
- tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
- pos_embd = create_tensor(tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, n_ctx_train}, 0); // position table sized by n_ctx_train
- // output: final norm (weight + bias) and the output projection
- output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
- output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
- output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
- // if the output tensor was not found (NULL), reuse the token embedding tensor as the output head
- if (output == NULL) {
- output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
- }
- for (int i = 0; i < n_layer; ++i) {
- auto & layer = layers[i];
- layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
- layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
- layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0); // single fused tensor: n_embd (Q) + 2*n_embd_gqa (K and V) columns
- layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
- layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
- layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
- layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
- layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
- layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0); // FFN has only up/down projections here (no gate tensor)
- layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
- layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
- layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
- }
- } break;
- case LLM_ARCH_CODESHELL: // creates the CodeShell weights; same per-layer tensor set as the GPT-2 case, but here the token embedding is the optional tensor
- {
- tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
- // if the token embedding was not found (NULL), reuse the output tensor for it
- if (tok_embd == NULL) {
- tok_embd = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
- }
- // output: final norm (weight + bias) and the output projection, created with flags == 0
- output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
- output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
- output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
- for (int i = 0; i < n_layer; ++i) {
- auto & layer = layers[i];
- layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
- layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
- layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0); // single fused tensor: n_embd (Q) + 2*n_embd_gqa (K and V) columns
- layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
- layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
- layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
- layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
- layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
- layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0); // FFN has only up/down projections here (no gate tensor)
- layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
- layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
- layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
- }
- } break;
- case LLM_ARCH_ORION: // creates the Orion weights: norms carry weight + bias, attention and FFN matrices carry no bias
- {
- tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
- output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
- output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
- output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0); // separate output projection (flags == 0, no fallback to tok_embd)
- for (int i = 0; i < n_layer; ++i) {
- auto & layer = layers[i];
- layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
- layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
- layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
- layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0); // K and V use the n_embd_gqa width
- layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
- layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
- layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
- layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
- layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
- layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
- layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
- }
- } break;
- case LLM_ARCH_INTERNLM2: // creates the InternLM2 weights: bias-free split Q/K/V attention and a gate/down/up FFN
- {
- tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
- // output: final norm weight plus a separate output projection (flags == 0, no fallback to tok_embd in this case)
- output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
- output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
- for (int i = 0; i < n_layer; ++i) {
- auto & layer = layers[i];
- layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
- // disabled fused-QKV variant, kept for reference; the split Q/K/V tensors below are what is actually created:
- // layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
- layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
- layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
- layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
- layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
- layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
- layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
- layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
- layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
- }
- } break;
- case LLM_ARCH_GEMMA: // creates the Gemma weights; the output head always reuses the token embedding tensor
- {
- tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
- // output: final norm weight; no LLM_TENSOR_OUTPUT tensor is looked up in this case
- output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
- output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); // same as tok_embd, duplicated to allow offloading
- for (int i = 0; i < n_layer; ++i) {
- auto & layer = layers[i];
- layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
- layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0); // Q width is sized as head size * head count rather than n_embd
- layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
- layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
- layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0); // input width matches the Q width above
- layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
- layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
- layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
- layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
- }
- } break;
- case LLM_ARCH_GEMMA2: // creates the Gemma 2 weights: the Gemma tensor set plus post-attention and post-FFN norm weights per layer
- {
- tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
- // output: final norm weight; no LLM_TENSOR_OUTPUT tensor is looked up in this case
- output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
- output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); // same as tok_embd, duplicated to allow offloading
- for (int i = 0; i < n_layer; ++i) {
- auto & layer = layers[i];
- layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
- layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0); // Q width is sized as head size * head count rather than n_embd
- layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
- layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
- layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0); // input width matches the Q width above
- layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0); // additional norm weight not present in the GEMMA case
- layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
- layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
- layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
- layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
- layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0); // additional norm weight not present in the GEMMA case
- }
- } break;
- case LLM_ARCH_GEMMA3: // creates the Gemma 3 weights: the Gemma 2 tensor set plus per-head Q/K norm weights, with an optional separate output head
- {
- tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
- // output: final norm weight and an optional output projection
- output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
- output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
- // if the output tensor was not found (NULL), reuse the token embedding tensor as the output head
- if (output == NULL) {
- output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
- }
- for (int i = 0; i < n_layer; ++i) {
- auto & layer = layers[i];
- layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
- layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0); // Q width is sized as head size * head count rather than n_embd
- layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
- layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
- layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0); // input width matches the Q width above
- layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
- layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0); // Q/K norm weights are one head wide (n_embd_head_k)
- layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
- layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
- layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
- layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
- layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
- layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
- }
- } break;
- case LLM_ARCH_STARCODER2: // creates the StarCoder2 weights: split Q/K/V attention and an up/down FFN, with bias tensors for norms, attention and FFN
- {
- tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
- // output: final norm (weight + bias) and an optional output projection
- output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
- output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
- output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
- // if the output tensor was not found (NULL), reuse the token embedding tensor as the output head
- if (output == NULL) {
- output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
- }
- for (int i = 0; i < n_layer; ++i) {
- auto & layer = layers[i];
- layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
- layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
- layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
- layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
- layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
- layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
- // attention bias tensors. NOTE(review): previously labelled "optional", but they are created with flags == 0 (not TENSOR_NOT_REQUIRED), so they appear to be required - confirm
- layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, 0);
- layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, 0);
- layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, 0);
- layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
- layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
- layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
- layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
- layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
- // FFN bias tensors. NOTE(review): previously labelled "optional", but they are created with flags == 0 as well - confirm
- layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
- layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP , "bias", i), { n_ff}, 0);
- }
- } break;
- case LLM_ARCH_MAMBA: // creates the Mamba weights: embeddings/output head plus the per-layer SSM tensors, all sized from the hparams.ssm_* values
- {
- const int64_t d_conv = hparams.ssm_d_conv;
- const int64_t d_inner = hparams.ssm_d_inner;
- const int64_t d_state = hparams.ssm_d_state;
- const int64_t dt_rank = hparams.ssm_dt_rank;
- // only an expansion factor of 2 is supported for now (d_inner must equal 2 * n_embd, otherwise loading is aborted)
- if (2 * n_embd != d_inner) {
- throw std::runtime_error("only an expansion factor of 2 is supported for now");
- }
- tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
- // output: final norm weight and an optional output projection
- output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
- output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
- // if output is NULL, init from the input tok embed, duplicated to allow offloading
- if (output == NULL) {
- output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
- }
- for (int i = 0; i < n_layer; ++i) {
- auto & layer = layers[i];
- // norm (stored in the attn_norm slot; no attention tensors are created in this case)
- layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
- layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), {n_embd, 2*d_inner}, 0); // output width is 2*d_inner; presumably two d_inner-wide projections packed together - confirm
- layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {d_conv, d_inner}, 0);
- layer.ssm_conv1d_b = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "bias", i), {d_inner}, 0);
- layer.ssm_x = create_tensor(tn(LLM_TENSOR_SSM_X, "weight", i), {d_inner, dt_rank + 2*d_state}, 0); // output width is dt_rank + 2*d_state
- layer.ssm_dt = create_tensor(tn(LLM_TENSOR_SSM_DT, "weight", i), {dt_rank, d_inner}, 0);
- layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {d_inner}, 0);
- // no "weight" suffix for these: tn() is called with the layer index only
- layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {d_state, d_inner}, 0);
- layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {d_inner}, 0);
- // out_proj: maps d_inner back to n_embd
- layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {d_inner, n_embd}, 0);
- }
- } break;
- case LLM_ARCH_XVERSE: // creates the XVERSE weights: bias-free split Q/K/V attention and a gate/down/up FFN
- {
- tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
- output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
- output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0); // separate output projection (flags == 0, no fallback to tok_embd)
- for (int i = 0; i < n_layer; ++i) {
- auto & layer = layers[i];
- layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
- layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
- layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0); // K and V use the n_embd_gqa width
- layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
- layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
- layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
- layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
- layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
- layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
- }
- } break;
- case LLM_ARCH_COMMAND_R: // creates the Command-R weights; the output head always reuses the token embedding, and Q/K norm weights exist only for deep models
- {
- tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
- // output: final norm weight; no LLM_TENSOR_OUTPUT tensor is looked up in this case
- output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
- // init output from the input tok embed
- output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
- for (int i = 0; i < n_layer; ++i) {
- auto & layer = layers[i];
- layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0); // the only per-layer norm created unconditionally (no ffn_norm)
- if (n_layer >= 64){ // NOTE(review): Q/K norms are keyed purely on layer count; presumably this identifies the larger model variant - confirm
- layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k, n_head}, 0); // 2-D: one n_embd_head_k-wide weight per head
- layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k, n_head_kv}, 0);
- }
- layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
- layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
- layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
- layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
- layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
- layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
- layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
- }
- } break;
- case LLM_ARCH_COHERE2: // creates the Cohere2 weights: same tensor set as the COMMAND_R case minus the conditional Q/K norm weights
- {
- tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
- // output: final norm weight; no LLM_TENSOR_OUTPUT tensor is looked up in this case
- output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
- // init output from the input tok embed
- output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab },
- TENSOR_DUPLICATED);
- for (int i = 0; i < n_layer; ++i) {
- auto & layer = layers[i];
- layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0); // the only per-layer norm created in this case (no ffn_norm)
- layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd }, 0);
- layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_gqa }, 0);
- layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_gqa }, 0);
- layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd, n_embd }, 0);
- layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), { n_embd, n_ff }, 0);
- layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd }, 0);
- layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, n_ff }, 0);
- }
- }
- break;
- case LLM_ARCH_OLMO: // adapted from LLM_ARCH_LLAMA with norm params removed: no output_norm, attn_norm or ffn_norm tensors are created here
- {
- tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
- // output: optional output projection only (this case creates no final norm weight)
- output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
- // if the output tensor was not found (NULL), reuse the token embedding tensor as the output head
- if (output == NULL) {
- output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
- }
- for (int i = 0; i < n_layer; ++i) {
- auto & layer = layers[i];
- layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
- layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0); // K and V use the n_embd_gqa width
- layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
- layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
- layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
- layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
- layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
- }
- } break;
- case LLM_ARCH_OLMO2: // creates the OLMo 2 weights: Q/K norm weights plus post-attention and post-FFN norms; no attn_norm or ffn_norm tensors are created
- {
- const int64_t n_embd_head = n_embd / n_head; // per-head width, derived locally by integer division
- tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
- // output: final norm weight plus a separate output projection (flags == 0, no fallback to tok_embd in this case)
- output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
- output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
- for (int i = 0; i < n_layer; ++i) {
- auto & layer = layers[i];
- layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
- layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
- layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
- layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
- layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, 0); // Q norm weight spans the full n_embd width
- layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_head_kv * n_embd_head}, 0); // K norm weight spans n_head_kv heads of n_embd_head each
- layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
- layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
- layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
- layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
- layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
- }
- } break;
- case LLM_ARCH_OLMOE: // creates the OLMoE weights: attention with Q/K norm weights and a routed mixture-of-experts FFN in every layer
- {
- tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
- // output: final norm weight plus a separate output projection (flags == 0, no fallback to tok_embd in this case)
- output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
- output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
- for (int i = 0; i < n_layer; ++i) {
- auto & layer = layers[i];
- layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
- layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
- layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
- layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
- layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
- layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, 0);
- layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, 0);
- layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
- // validate the expert hyperparameters BEFORE n_expert is used as a tensor dimension,
- // so a bad configuration is reported with these messages rather than by a tensor-creation failure
- if (n_expert == 0) {
- throw std::runtime_error("n_expert must be > 0");
- }
- if (n_expert_used == 0) {
- throw std::runtime_error("n_expert_used must be > 0");
- }
- // router: one output column per expert
- layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
- // MoE branch: each expert tensor stacks n_expert matrices along the last dimension
- layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
- layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff, n_embd, n_expert}, 0);
- layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
- }
- } break;
- case LLM_ARCH_OPENELM: // creates the OpenELM weights; head count and FFN width are read per layer, so tensor shapes can differ between layers
- {
- tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
- // output: final norm weight; no LLM_TENSOR_OUTPUT tensor is looked up in this case
- output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
- // init output from the input tok embed
- output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
- for (int i = 0; i < n_layer; ++i) {
- const int64_t n_head = hparams.n_head(i); // per-layer value; declared inside the loop, so it hides any outer n_head for this iteration
- const int64_t n_head_qkv = 2*hparams.n_head_kv(i) + n_head; // total heads in the fused QKV tensor: n_head query heads + n_head_kv each for K and V
- const int64_t n_ff = hparams.n_ff(i); // per-layer FFN width; likewise hides any outer n_ff
- auto & layer = layers[i];
- layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
- layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_head_qkv*n_embd_head_k}, 0);
- layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0); // Q/K norm weights are one head wide (n_embd_head_k)
- layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
- layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_head*n_embd_head_k, n_embd}, 0);
- layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
- layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
- layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
- layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
- }
- } break;
- case LLM_ARCH_GPTNEOX: // creates the GPT-NeoX weights: fused-QKV attention and an up/down FFN, with bias tensors throughout
- {
- tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
- // output: final norm (weight + bias) plus a separate output projection (flags == 0, no fallback to tok_embd in this case)
- output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
- output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
- output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
- for (int i = 0; i < n_layer; ++i) {
- auto & layer = layers[i];
- layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
- layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
- layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0); // single fused tensor: n_embd (Q) + 2*n_embd_gqa (K and V) columns
- layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
- layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
- layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
- layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
- layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
- layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0); // FFN has only up/down projections here (no gate tensor)
- layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
- layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
- layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
- }
- } break;
- case LLM_ARCH_ARCTIC: // creates the Arctic weights: each layer has both a dense n_embd-wide FFN and a routed mixture-of-experts FFN
- {
- tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
- // output: final norm weight and an optional output projection
- output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
- output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
- // if the output tensor was not found (NULL), reuse the token embedding tensor as the output head
- if (output == NULL) {
- output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
- }
- for (int i = 0; i < n_layer; ++i) {
- auto & layer = layers[i];
- layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
- layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
- layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
- layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
- layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
- layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
- // dense FFN: note all three matrices are n_embd x n_embd here (n_ff is only used for the expert tensors below)
- layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_embd}, 0);
- layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_embd, n_embd}, 0);
- layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_embd}, 0);
- // MoE branch: router, a dedicated norm weight, and expert tensors stacking n_expert matrices along the last dimension
- layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
- layer.ffn_norm_exps = create_tensor(tn(LLM_TENSOR_FFN_NORM_EXPS, "weight", i), {n_embd}, 0);
- layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0); // flags argument: 0 like every sibling call (was the bool literal `false`)
- layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert}, 0);
- layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
- }
- } break;
- case LLM_ARCH_DEEPSEEK: // creates the DeepSeek weights: the first n_layer_dense_lead layers use a dense FFN, the rest use routed + shared experts
- {
- const int64_t n_ff_exp = hparams.n_ff_exp; // FFN width used by the expert tensors
- const int64_t n_expert_shared = hparams.n_expert_shared; // multiplier for the shared-expert tensor width
- tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
- // output: final norm weight plus a separate output projection (flags == 0, no fallback to tok_embd in this case)
- output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
- output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
- for (int i = 0; i < n_layer; ++i) {
- auto & layer = layers[i];
- layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
- layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
- layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
- layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
- layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
- layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
- if (i < (int) hparams.n_layer_dense_lead) {
- // leading dense layers: plain gate/down/up FFN of width n_ff
- layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
- layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
- layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
- } else {
- // validate the expert hyperparameters BEFORE n_expert is used as a tensor dimension,
- // so a bad configuration is reported with these messages rather than by a tensor-creation failure;
- // the checks stay inside this branch so a model with only dense layers is not rejected
- if (n_expert == 0) {
- throw std::runtime_error("n_expert must be > 0");
- }
- if (n_expert_used == 0) {
- throw std::runtime_error("n_expert_used must be > 0");
- }
- // router: one output column per expert
- layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
- // MoE branch: each expert tensor stacks n_expert matrices along the last dimension
- layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
- layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
- layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
- // Shared expert branch: a single 2-D tensor of width n_ff_exp * n_expert_shared per projection
- layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
- layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_exp * n_expert_shared, n_embd}, 0);
- layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
- }
- }
- } break;
- case LLM_ARCH_DEEPSEEK2: // creates the DeepSeek2 weights: low-rank (LoRA-style) attention projections, leading dense FFN layers, then routed + shared experts
- {
- const bool is_lite = (hparams.n_layer == 27); // NOTE(review): the "lite" variant is detected purely by layer count - confirm this holds for all supported checkpoints
- const bool is_mla = (hparams.n_embd_head_k_mla != 0 && hparams.n_embd_head_v_mla != 0); // MLA layout is used only when both MLA head sizes are set
- // note: these are the actual head sizes you get when treating as MHA or after "decompression" using wv_b for MLA
- const int64_t n_embd_head_k_mla = is_mla ? hparams.n_embd_head_k_mla : hparams.n_embd_head_k;
- const int64_t n_embd_head_v_mla = is_mla ? hparams.n_embd_head_v_mla : hparams.n_embd_head_v;
- const int64_t n_embd_head_qk_rope = hparams.n_rot;
- const int64_t n_embd_head_qk_nope = n_embd_head_k_mla - n_embd_head_qk_rope; // remainder of the K head size after the n_rot part
- const int64_t q_lora_rank = hparams.n_lora_q;
- const int64_t kv_lora_rank = hparams.n_lora_kv;
- const int64_t n_ff_exp = hparams.n_ff_exp; // FFN width used by the expert tensors
- const int64_t n_expert_shared = hparams.n_expert_shared; // multiplier for the shared-expert tensor width
- tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
- // output: final norm weight plus a separate output projection (flags == 0, no fallback to tok_embd in this case)
- output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
- output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
- for (int i = 0; i < n_layer; ++i) {
- auto & layer = layers[i];
- layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
- if (!is_lite) {
- layer.attn_q_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_A_NORM, "weight", i), {q_lora_rank}, 0);
- }
- layer.attn_kv_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_NORM, "weight", i), {kv_lora_rank}, 0);
- if (!is_lite) {
- // non-lite: Q is factored into two matrices through a q_lora_rank-wide bottleneck
- layer.wq_a = create_tensor(tn(LLM_TENSOR_ATTN_Q_A, "weight", i), {n_embd, q_lora_rank}, 0);
- layer.wq_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_B, "weight", i), {q_lora_rank, n_head * n_embd_head_k_mla}, 0);
- } else {
- // lite: a single full Q projection
- layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_head * n_embd_head_k_mla}, 0);
- }
- layer.wkv_a_mqa = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + n_embd_head_qk_rope}, 0);
- // note: only old legacy GGUF files will have the unsplit wkv_b tensor; MLA files carry the split wk_b / wv_b tensors instead
- if (is_mla) {
- layer.wk_b = create_tensor(tn(LLM_TENSOR_ATTN_K_B, "weight", i), {n_embd_head_qk_nope, kv_lora_rank, n_head}, 0);
- layer.wv_b = create_tensor(tn(LLM_TENSOR_ATTN_V_B, "weight", i), {kv_lora_rank, n_embd_head_v_mla, n_head}, 0);
- } else {
- layer.wkv_b = create_tensor(tn(LLM_TENSOR_ATTN_KV_B, "weight", i), {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v_mla)}, 0);
- }
- layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_head * n_embd_head_v_mla, n_embd}, 0);
- layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
- if (i < (int) hparams.n_layer_dense_lead) {
- // leading dense layers: plain gate/down/up FFN of width n_ff
- layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
- layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
- layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
- } else {
- // validate the expert hyperparameters BEFORE n_expert is used as a tensor dimension,
- // so a bad configuration is reported with these messages rather than by a tensor-creation failure;
- // the checks stay inside this branch so a model with only dense layers is not rejected
- if (n_expert == 0) {
- throw std::runtime_error("n_expert must be > 0");
- }
- if (n_expert_used == 0) {
- throw std::runtime_error("n_expert_used must be > 0");
- }
- // router: one output column per expert, plus an optional per-expert bias (left NULL when absent)
- layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
- layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, TENSOR_NOT_REQUIRED);
- // MoE branch: each expert tensor stacks n_expert matrices along the last dimension
- layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
- layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
- layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
- // Shared expert branch: a single 2-D tensor of width n_ff_exp * n_expert_shared per projection
- layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
- layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_exp * n_expert_shared, n_embd}, 0);
- layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
- }
- }
- } break;
- case LLM_ARCH_PLM: // Creates the model-level and per-layer tensors for the PLM architecture.
- {
- const int64_t n_embd_head_qk_rope = hparams.n_rot; // equals n_rot; added to kv_lora_rank to size wkv_a_mqa below
- const int64_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot; // K head size minus n_rot; combined with n_embd_head_v to size wkv_b
- const int64_t kv_lora_rank = hparams.n_lora_kv; // first dimension of wkv_b and length of attn_kv_a_norm
- tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
- // output
- output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
- // output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
- output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); // output is always created from the token-embedding weights; the dedicated OUTPUT tensor (commented out above) is not loaded
- for (int i = 0; i < n_layer; ++i) { // per-layer tensors
- auto & layer = layers[i];
- layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
- layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
- layer.wkv_a_mqa = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + (n_embd_head_qk_rope)}, 0);
- layer.attn_kv_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_NORM, "weight", i), {kv_lora_rank}, 0);
- layer.wkv_b = create_tensor(tn(LLM_TENSOR_ATTN_KV_B, "weight", i), {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)}, 0);
- layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_head * ( n_embd_head_v), n_embd}, 0);
- layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
- layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0); // this case creates only ffn_down and ffn_up (no ffn_gate)
- layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
- }
- } break;
- case LLM_ARCH_BITNET: // Creates the tensors for the BitNet architecture; this case does not create a separate output tensor.
- {
- tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
- // output (final norm only)
- output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
- for (int i = 0; i < n_layer; ++i) { // per-layer tensors
- auto & layer = layers[i];
- layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
- layer.attn_sub_norm = create_tensor(tn(LLM_TENSOR_ATTN_SUB_NORM, "weight", i), {n_embd}, 0);
- layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
- layer.wq_scale = create_tensor(tn(LLM_TENSOR_ATTN_Q, "scale", i), {1}, TENSOR_NOT_REQUIRED); // each projection weight has an optional single-element "scale" companion
- layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
- layer.wk_scale = create_tensor(tn(LLM_TENSOR_ATTN_K, "scale", i), {1}, TENSOR_NOT_REQUIRED);
- layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
- layer.wv_scale = create_tensor(tn(LLM_TENSOR_ATTN_V, "scale", i), {1}, TENSOR_NOT_REQUIRED);
- layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
- layer.wo_scale = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "scale", i), {1}, TENSOR_NOT_REQUIRED);
- layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
- layer.ffn_sub_norm = create_tensor(tn(LLM_TENSOR_FFN_SUB_NORM, "weight", i), {n_ff}, 0); // sized n_ff, unlike attn_sub_norm which is sized n_embd
- layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
- layer.ffn_gate_scale = create_tensor(tn(LLM_TENSOR_FFN_GATE, "scale", i), {1}, TENSOR_NOT_REQUIRED);
- layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
- layer.ffn_down_scale = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "scale", i), {1}, TENSOR_NOT_REQUIRED);
- layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
- layer.ffn_up_scale = create_tensor(tn(LLM_TENSOR_FFN_UP, "scale", i), {1}, TENSOR_NOT_REQUIRED);
- }
- } break;
- case LLM_ARCH_T5: // Creates the tensors for T5: encoder (*_enc), decoder self-attention, decoder cross-attention (*_cross) and decoder FFN.
- {
- const auto n_rel_attn_bkts = hparams.n_rel_attn_bkts; // second dimension of the optional attn_rel_b* tensors
- tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
- // output: separate final norms for the encoder and the decoder
- output_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_OUTPUT_NORM, "weight"), {n_embd}, 0);
- output_norm = create_tensor(tn(LLM_TENSOR_DEC_OUTPUT_NORM, "weight"), {n_embd}, 0);
- output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
- // if output is NULL, init from the input tok embed
- if (output == NULL) {
- output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
- }
- for (int i = 0; i < n_layer; ++i) { // the same n_layer count is used for encoder and decoder tensors
- auto & layer = layers[i];
- layer.attn_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_NORM, "weight", i), {n_embd}, 0); // encoder tensors start here
- layer.attn_rel_b_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_REL_B, "weight", i), {n_head, n_rel_attn_bkts}, TENSOR_NOT_REQUIRED);
- layer.wq_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_Q, "weight", i), {n_embd, n_embd_k_gqa}, 0);
- layer.wk_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
- layer.wv_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
- layer.wo_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_OUT, "weight", i), {n_embd_v_gqa, n_embd}, 0);
- layer.ffn_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_NORM, "weight", i), {n_embd}, 0);
- layer.ffn_gate_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_GATE, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED); // encoder FFN gate is optional
- layer.ffn_down_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
- layer.ffn_up_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
- layer.attn_norm = create_tensor(tn(LLM_TENSOR_DEC_ATTN_NORM, "weight", i), {n_embd}, 0); // decoder self-attention tensors start here
- layer.attn_rel_b = create_tensor(tn(LLM_TENSOR_DEC_ATTN_REL_B, "weight", i), {n_head, n_rel_attn_bkts}, TENSOR_NOT_REQUIRED);
- layer.wq = create_tensor(tn(LLM_TENSOR_DEC_ATTN_Q, "weight", i), {n_embd, n_embd_k_gqa}, 0);
- layer.wk = create_tensor(tn(LLM_TENSOR_DEC_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
- layer.wv = create_tensor(tn(LLM_TENSOR_DEC_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
- layer.wo = create_tensor(tn(LLM_TENSOR_DEC_ATTN_OUT, "weight", i), {n_embd_v_gqa, n_embd}, 0);
- layer.attn_norm_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_NORM, "weight", i), {n_embd}, 0); // decoder cross-attention tensors start here
- // this tensor seems to be unused in HF transformers implementation
- layer.attn_rel_b_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_REL_B, "weight", i), {n_head, n_rel_attn_bkts}, TENSOR_NOT_REQUIRED);
- layer.wq_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_Q, "weight", i), {n_embd, n_embd_k_gqa}, 0);
- layer.wk_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
- layer.wv_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
- layer.wo_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_OUT, "weight", i), {n_embd_v_gqa, n_embd}, 0);
- layer.ffn_norm = create_tensor(tn(LLM_TENSOR_DEC_FFN_NORM, "weight", i), {n_embd}, 0); // decoder FFN tensors start here
- layer.ffn_gate = create_tensor(tn(LLM_TENSOR_DEC_FFN_GATE, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED); // decoder FFN gate is optional
- layer.ffn_down = create_tensor(tn(LLM_TENSOR_DEC_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
- layer.ffn_up = create_tensor(tn(LLM_TENSOR_DEC_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
- }
- } break;
- case LLM_ARCH_T5ENCODER: // Creates the tensors for the encoder-only T5 variant: only the *_enc tensors plus the output tensor.
- {
- const auto n_rel_attn_bkts = hparams.n_rel_attn_bkts; // second dimension of the optional attn_rel_b_enc tensor
- tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
- // output: encoder final norm only (no decoder norm is created in this case)
- output_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_OUTPUT_NORM, "weight"), {n_embd}, 0);
- output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
- // if output is NULL, init from the input tok embed
- if (output == NULL) {
- output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
- }
- for (int i = 0; i < n_layer; ++i) { // per-layer encoder tensors
- auto & layer = layers[i];
- layer.attn_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_NORM, "weight", i), {n_embd}, 0);
- layer.attn_rel_b_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_REL_B, "weight", i), {n_head, n_rel_attn_bkts}, TENSOR_NOT_REQUIRED);
- layer.wq_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_Q, "weight", i), {n_embd, n_embd_k_gqa}, 0);
- layer.wk_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
- layer.wv_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
- layer.wo_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_OUT, "weight", i), {n_embd_v_gqa, n_embd}, 0);
- layer.ffn_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_NORM, "weight", i), {n_embd}, 0);
- layer.ffn_gate_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_GATE, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED); // FFN gate is optional
- layer.ffn_down_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
- layer.ffn_up_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
- }
- } break;
- case LLM_ARCH_JAIS: // Creates the tensors for JAIS; the output norm and every per-layer weight are paired with a required bias tensor.
- {
- tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
- // output
- output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
- output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
- output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
- for (int i = 0; i < n_layer; ++i) { // per-layer tensors
- auto & layer = layers[i];
- layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
- layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
- layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0); // single fused QKV weight; second dimension is n_embd + 2*n_embd_gqa
- layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
- layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
- layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
- layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
- layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
- layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
- layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
- layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
- layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, 0);
- layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
- layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
- }
- } break;
- case LLM_ARCH_CHATGLM: // Creates the tensors for ChatGLM; attention weights are taken either as one fused QKV tensor or as separate Q/K/V tensors.
- {
- tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
- // output
- output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
- output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
- for (int i = 0; i < n_layer; ++i) { // per-layer tensors
- auto & layer = layers[i];
- layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
- layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED); // fused QKV weight and bias are both optional
- layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
- if (layer.wqkv == nullptr) { // no fused weight: separate Q/K/V weights become required, their biases stay optional
- layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
- layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
- layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
- layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
- layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
- layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
- }
- layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
- layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
- layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff * 2}, 0); // second dimension is n_ff * 2; no separate ffn_gate tensor is created in this case
- layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
- }
- } break;
- case LLM_ARCH_GLM4: // Creates the tensors for GLM4: optional fused QKV with separate Q/K/V fallback, plus post-attention and post-FFN norms.
- {
- tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
- // output
- output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
- output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
- // if output is NULL, init from the input tok embed
- if (output == NULL) {
- output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
- }
- for (int i = 0; i < n_layer; ++i) { // per-layer tensors
- auto & layer = layers[i];
- layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
- layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED); // fused QKV weight and bias are both optional
- layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
- if (layer.wqkv == nullptr) { // no fused weight: separate Q/K/V weights become required, their biases stay optional
- layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
- layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
- layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
- layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
- layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
- layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
- }
- layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
- layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0); // required post-attention norm
- layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
- layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
- layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff * 2}, 0); // second dimension is n_ff * 2; no separate ffn_gate tensor is created in this case
- layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0); // required post-FFN norm
- }
- } break;
- case LLM_ARCH_NEMOTRON: // Creates the tensors for Nemotron: norms carry required biases; attention and MLP biases are optional.
- {
- tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
- // output
- output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
- output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
- output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
- for (int i = 0; i < n_layer; ++i) { // per-layer tensors
- auto & layer = layers[i];
- layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
- layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
- layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
- layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
- layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
- layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
- // optional bias tensors
- layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
- layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
- layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
- layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
- layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
- layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
- layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0); // this case creates only ffn_down and ffn_up (no ffn_gate)
- layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
- // optional MLP bias
- layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
- layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
- }
- } break;
- case LLM_ARCH_EXAONE: // Creates the tensors for EXAONE: separate Q/K/V/O weights, gated FFN and an optional per-layer rope_freqs tensor.
- {
- tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
- // output
- output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
- output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
- // if output is NULL, init from the input tok embed
- if (output == NULL) {
- output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
- }
- for (int i = 0; i < n_layer; ++i) { // per-layer tensors
- auto & layer = layers[i];
- layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
- layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
- layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
- layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
- layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
- layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
- layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0)); // optional; additionally flagged TENSOR_DUPLICATED for every layer except layer 0
- layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
- layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
- layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
- }
- } break;
- case LLM_ARCH_RWKV6: // Creates the tensors for RWKV6: token-embedding norm, output tensors and per-layer time-mix / channel-mix tensors.
- {
- tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
- // Block 0, LN0
- tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0);
- tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}, 0);
- // output
- output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
- output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
- output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
- const int time_mix_extra_dim = hparams.time_mix_extra_dim;
- const int time_decay_extra_dim = hparams.time_decay_extra_dim;
- const int head_size = hparams.wkv_head_size;
- const int attn_hidden_size = n_embd; // identical to n_embd in this case
- const int ffn_size = hparams.n_ff_arr[0]; // entry 0 of n_ff_arr is used for every layer
- for (int i = 0; i < n_layer; ++i) { // per-layer tensors
- auto & layer = layers[i];
- layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
- layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
- layer.attn_norm_2 = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, 0);
- layer.attn_norm_2_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, 0);
- layer.time_mix_w1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W1, "weight", i), {n_embd, time_mix_extra_dim * 5}, 0);
- layer.time_mix_w2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W2, "weight", i), {time_mix_extra_dim, n_embd, 5}, 0);
- layer.time_mix_lerp_x = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_X, "weight", i), {n_embd, 1, 1}, 0);
- layer.time_mix_lerp_w = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_W, "weight", i), {n_embd, 1, 1}, TENSOR_NOT_REQUIRED); // the five individual lerp tensors (w/k/v/r/g) are optional
- layer.time_mix_lerp_k = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_K, "weight", i), {n_embd, 1, 1}, TENSOR_NOT_REQUIRED);
- layer.time_mix_lerp_v = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_V, "weight", i), {n_embd, 1, 1}, TENSOR_NOT_REQUIRED);
- layer.time_mix_lerp_r = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_R, "weight", i), {n_embd, 1, 1}, TENSOR_NOT_REQUIRED);
- layer.time_mix_lerp_g = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_G, "weight", i), {n_embd, 1, 1}, TENSOR_NOT_REQUIRED);
- layer.time_mix_lerp_fused = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_FUSED, "weight", i), {n_embd, 1, 1, 5}, TENSOR_NOT_REQUIRED); // optional fused variant with a trailing dimension of 5
- GGML_ASSERT(!(layer.time_mix_lerp_fused == NULL && layer.time_mix_lerp_w == NULL)); // at least one of the fused tensor or time_mix_lerp_w must have been found
- layer.time_mix_first = create_tensor(tn(LLM_TENSOR_TIME_MIX_FIRST, "weight", i), {head_size, n_embd / head_size}, 0);
- layer.time_mix_decay = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY, "weight", i), {n_embd}, 0);
- layer.time_mix_decay_w1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY_W1, "weight", i), {n_embd, time_decay_extra_dim}, 0);
- layer.time_mix_decay_w2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY_W2, "weight", i), {time_decay_extra_dim, attn_hidden_size}, 0);
- layer.time_mix_key = create_tensor(tn(LLM_TENSOR_TIME_MIX_KEY, "weight", i), {attn_hidden_size, n_embd}, 0);
- layer.time_mix_value = create_tensor(tn(LLM_TENSOR_TIME_MIX_VALUE, "weight", i), {attn_hidden_size, n_embd}, 0);
- layer.time_mix_receptance = create_tensor(tn(LLM_TENSOR_TIME_MIX_RECEPTANCE, "weight", i), {attn_hidden_size, n_embd}, 0);
- layer.time_mix_gate = create_tensor(tn(LLM_TENSOR_TIME_MIX_GATE, "weight", i), {attn_hidden_size, n_embd}, 0);
- layer.time_mix_ln = create_tensor(tn(LLM_TENSOR_TIME_MIX_LN, "weight", i), {n_embd}, 0);
- layer.time_mix_ln_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_LN, "bias", i), {n_embd}, 0);
- layer.time_mix_output = create_tensor(tn(LLM_TENSOR_TIME_MIX_OUTPUT, "weight", i), {n_embd, attn_hidden_size}, 0);
- layer.channel_mix_lerp_k = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_LERP_K, "weight", i), {n_embd, 1, 1}, 0); // channel-mix tensors start here
- layer.channel_mix_lerp_r = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_LERP_R, "weight", i), {n_embd, 1, 1}, 0);
- layer.channel_mix_key = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_KEY, "weight", i), {n_embd, ffn_size}, 0);
- layer.channel_mix_value = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_VALUE, "weight", i), {ffn_size, n_embd}, 0);
- layer.channel_mix_receptance = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_RECEPTANCE, "weight", i), {n_embd, n_embd}, 0);
- }
- } break;
- case LLM_ARCH_RWKV6QWEN2: // Creates the tensors for RWKV6QWEN2: time-mix tensors combined with a gated ffn_gate/ffn_down/ffn_up FFN.
- {
- tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
- output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
- output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, TENSOR_NOT_REQUIRED); // output norm bias is optional in this case
- output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
- const int time_mix_extra_dim = hparams.time_mix_extra_dim;
- const int time_decay_extra_dim = hparams.time_decay_extra_dim;
- const int head_size = hparams.wkv_head_size;
- const int attn_hidden_size = n_embd; // identical to n_embd in this case
- const int n_head_kv = hparams.n_head_kv();
- int attn_key_value_size; // second dimension of the key/value weights and length of their optional biases
- if (n_head_kv == 0 || attn_hidden_size / head_size == n_head_kv) { // no KV head count given, or it equals attn_hidden_size / head_size: use the full hidden size
- attn_key_value_size = attn_hidden_size;
- } else { // otherwise size key/value by n_head_kv heads of head_size each
- attn_key_value_size = n_head_kv * head_size;
- }
- for (int i = 0; i < n_layer; ++i) { // per-layer tensors
- auto & layer = layers[i];
- layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
- layer.time_mix_w1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W1, "weight", i), {n_embd, time_mix_extra_dim * 5}, 0);
- layer.time_mix_w2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W2, "weight", i), {time_mix_extra_dim, n_embd, 5}, 0);
- layer.time_mix_lerp_x = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_X, "weight", i), {n_embd, 1, 1}, 0);
- layer.time_mix_lerp_fused = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_FUSED, "weight", i), {n_embd, 1, 1, 5}, 0); // the fused lerp tensor is required here
- layer.time_mix_first = create_tensor(tn(LLM_TENSOR_TIME_MIX_FIRST, "weight", i), {head_size, n_embd / head_size}, TENSOR_NOT_REQUIRED); // optional in this case
- layer.time_mix_decay = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY, "weight", i), {n_embd}, 0);
- layer.time_mix_decay_w1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY_W1, "weight", i), {n_embd, time_decay_extra_dim}, 0);
- layer.time_mix_decay_w2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY_W2, "weight", i), {time_decay_extra_dim, attn_hidden_size}, 0);
- layer.time_mix_key = create_tensor(tn(LLM_TENSOR_TIME_MIX_KEY, "weight", i), {n_embd, attn_key_value_size}, 0);
- layer.time_mix_value = create_tensor(tn(LLM_TENSOR_TIME_MIX_VALUE, "weight", i), {n_embd, attn_key_value_size}, 0);
- layer.time_mix_receptance = create_tensor(tn(LLM_TENSOR_TIME_MIX_RECEPTANCE, "weight", i), {attn_hidden_size, n_embd}, 0);
- layer.time_mix_gate = create_tensor(tn(LLM_TENSOR_TIME_MIX_GATE, "weight", i), {attn_hidden_size, n_embd}, 0);
- // optional bias tensors
- layer.time_mix_key_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_KEY, "bias", i), {attn_key_value_size}, TENSOR_NOT_REQUIRED);
- layer.time_mix_value_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_VALUE, "bias", i), {attn_key_value_size}, TENSOR_NOT_REQUIRED);
- layer.time_mix_receptance_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_RECEPTANCE, "bias", i), {attn_hidden_size}, TENSOR_NOT_REQUIRED);
- layer.time_mix_output = create_tensor(tn(LLM_TENSOR_TIME_MIX_OUTPUT, "weight", i), {n_embd, attn_hidden_size}, 0);
- layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
- layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
- layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
- layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
- }
- } break;
- case LLM_ARCH_RWKV7: // Creates the tensors for RWKV7: token-embedding norm, output tensors and per-layer time-mix / channel-mix tensors.
- {
- tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
- // Block 0, LN0
- tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0);
- tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}, 0);
- // output
- output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
- output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
- output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
- const int n_lora_decay = hparams.n_lora_decay; // inner dimension of the w1/w2 pair
- const int n_lora_iclr = hparams.n_lora_iclr; // inner dimension of the a1/a2 pair (and of v1/v2 on layer 0)
- const int n_lora_value_res_mix = hparams.n_lora_value_res_mix; // inner dimension of the v1/v2 pair on layers other than 0
- const int n_lora_gate = hparams.n_lora_gate; // inner dimension of the g1/g2 pair
- const int attn_hidden_size = n_embd; // identical to n_embd in this case
- const int ffn_size = hparams.n_ff_arr[0]; // entry 0 of n_ff_arr is used for every layer
- for (int i = 0; i < n_layer; ++i) { // per-layer tensors
- auto & layer = layers[i];
- layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
- layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
- layer.attn_norm_2 = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, 0);
- layer.attn_norm_2_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, 0);
- layer.time_mix_w0 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W0, "weight", i), {n_embd}, 0);
- layer.time_mix_w1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W1, "weight", i), {n_embd, n_lora_decay}, 0);
- layer.time_mix_w2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W2, "weight", i), {n_lora_decay, n_embd}, 0);
- layer.time_mix_a0 = create_tensor(tn(LLM_TENSOR_TIME_MIX_A0, "weight", i), {n_embd}, 0);
- layer.time_mix_a1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_A1, "weight", i), {n_embd, n_lora_iclr}, 0);
- layer.time_mix_a2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_A2, "weight", i), {n_lora_iclr, n_embd}, 0);
- if (i == 0) { // the two branches differ only in the inner dimension used for v1/v2
- // actually not used
- layer.time_mix_v0 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V0, "weight", i), {n_embd}, 0);
- layer.time_mix_v1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V1, "weight", i), {n_embd, n_lora_iclr}, 0);
- layer.time_mix_v2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V2, "weight", i), {n_lora_iclr, n_embd}, 0);
- } else {
- layer.time_mix_v0 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V0, "weight", i), {n_embd}, 0);
- layer.time_mix_v1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V1, "weight", i), {n_embd, n_lora_value_res_mix}, 0);
- layer.time_mix_v2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V2, "weight", i), {n_lora_value_res_mix, n_embd}, 0);
- }
- layer.time_mix_g1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_G1, "weight", i), {n_embd, n_lora_gate}, 0);
- layer.time_mix_g2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_G2, "weight", i), {n_lora_gate, n_embd}, 0);
- layer.time_mix_lerp_fused = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_FUSED, "weight", i), {n_embd, 1, 1, 6}, 0); // required fused lerp tensor with a trailing dimension of 6
- layer.time_mix_k_k = create_tensor(tn(LLM_TENSOR_TIME_MIX_K_K, "weight", i), {attn_hidden_size}, 0);
- layer.time_mix_k_a = create_tensor(tn(LLM_TENSOR_TIME_MIX_K_A, "weight", i), {attn_hidden_size}, 0);
- layer.time_mix_r_k = create_tensor(tn(LLM_TENSOR_TIME_MIX_R_K, "weight", i), {attn_hidden_size}, 0);
- layer.time_mix_key = create_tensor(tn(LLM_TENSOR_TIME_MIX_KEY, "weight", i), {attn_hidden_size, n_embd}, 0);
- layer.time_mix_value = create_tensor(tn(LLM_TENSOR_TIME_MIX_VALUE, "weight", i), {attn_hidden_size, n_embd}, 0);
- layer.time_mix_receptance = create_tensor(tn(LLM_TENSOR_TIME_MIX_RECEPTANCE, "weight", i), {attn_hidden_size, n_embd}, 0);
- layer.time_mix_ln = create_tensor(tn(LLM_TENSOR_TIME_MIX_LN, "weight", i), {n_embd}, 0);
- layer.time_mix_ln_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_LN, "bias", i), {n_embd}, 0);
- layer.time_mix_output = create_tensor(tn(LLM_TENSOR_TIME_MIX_OUTPUT, "weight", i), {n_embd, attn_hidden_size}, 0);
- layer.channel_mix_lerp_k = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_LERP_K, "weight", i), {n_embd, 1, 1}, 0); // channel-mix tensors start here
- layer.channel_mix_key = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_KEY, "weight", i), {n_embd, ffn_size}, 0);
- layer.channel_mix_value = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_VALUE, "weight", i), {ffn_size, n_embd}, 0);
- }
- } break;
- case LLM_ARCH_ARWKV7:
- {
- tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
- // output
- output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
- output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
- const int n_lora_decay = hparams.n_lora_decay;
- const int n_lora_iclr = hparams.n_lora_iclr;
- const int n_lora_value_res_mix = hparams.n_lora_value_res_mix;
- const int n_lora_gate = hparams.n_lora_gate;
- const int attn_hidden_size = n_embd;
- for (int i = 0; i < n_layer; ++i) {
- auto & layer = layers[i];
- layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
- layer.time_mix_w0 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W0, "weight", i), {n_embd}, 0);
- layer.time_mix_w1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W1, "weight", i), {n_embd, n_lora_decay}, 0);
- layer.time_mix_w2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W2, "weight", i), {n_lora_decay, n_embd}, 0);
- layer.time_mix_a0 = create_tensor(tn(LLM_TENSOR_TIME_MIX_A0, "weight", i), {n_embd}, 0);
- layer.time_mix_a1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_A1, "weight", i), {n_embd, n_lora_iclr}, 0);
- layer.time_mix_a2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_A2, "weight", i), {n_lora_iclr, n_embd}, 0);
- if (i == 0) {
- // actually not used
- layer.time_mix_v0 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V0, "weight", i), {n_embd}, 0);
- layer.time_mix_v1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V1, "weight", i), {n_embd, n_lora_iclr}, 0);
- layer.time_mix_v2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V2, "weight", i), {n_lora_iclr, n_embd}, 0);
- } else {
- layer.time_mix_v0 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V0, "weight", i), {n_embd}, 0);
- layer.time_mix_v1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V1, "weight", i), {n_embd, n_lora_value_res_mix}, 0);
- layer.time_mix_v2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V2, "weight", i), {n_lora_value_res_mix, n_embd}, 0);
- }
- layer.time_mix_g1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_G1, "weight", i), {n_embd, n_lora_gate}, TENSOR_NOT_REQUIRED);
- layer.time_mix_g2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_G2, "weight", i), {n_lora_gate, n_embd}, TENSOR_NOT_REQUIRED);
- try {
- layer.time_mix_lerp_fused = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_FUSED, "weight", i), {n_embd, 1, 1, 6}, 0);
- } catch(std::runtime_error & e) {
- // ARWKV models may not have gate tensors
- layer.time_mix_lerp_fused = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_FUSED, "weight", i), {n_embd, 1, 1, 5}, 0);
- }
- layer.time_mix_k_k = create_tensor(tn(LLM_TENSOR_TIME_MIX_K_K, "weight", i), {attn_hidden_size}, 0);
- layer.time_mix_k_a = create_tensor(tn(LLM_TENSOR_TIME_MIX_K_A, "weight", i), {attn_hidden_size}, 0);
- layer.time_mix_r_k = create_tensor(tn(LLM_TENSOR_TIME_MIX_R_K, "weight", i), {attn_hidden_size}, 0);
- layer.time_mix_key = create_tensor(tn(LLM_TENSOR_TIME_MIX_KEY, "weight", i), {attn_hidden_size, n_embd}, 0);
- layer.time_mix_value = create_tensor(tn(LLM_TENSOR_TIME_MIX_VALUE, "weight", i), {attn_hidden_size, n_embd}, 0);
- layer.time_mix_receptance = create_tensor(tn(LLM_TENSOR_TIME_MIX_RECEPTANCE, "weight", i), {attn_hidden_size, n_embd}, 0);
- layer.time_mix_ln = create_tensor(tn(LLM_TENSOR_TIME_MIX_LN, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
- layer.time_mix_ln_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_LN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
- layer.time_mix_output = create_tensor(tn(LLM_TENSOR_TIME_MIX_OUTPUT, "weight", i), {n_embd, attn_hidden_size}, 0);
- layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
- layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
- layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
- layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
- }
- } break;
- case LLM_ARCH_CHAMELEON:
- {
- tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
- // output
- output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
- output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
- // if output is NULL, init from the input tok embed
- if (output == NULL) {
- output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
- }
- for (int i = 0; i < n_layer; ++i) {
- auto & layer = layers[i];
- layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
- layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k, n_head}, 0);
- layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k, n_head_kv}, 0);
- layer.attn_q_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {n_embd_head_k, n_head}, TENSOR_NOT_REQUIRED);
- layer.attn_k_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {n_embd_head_k, n_head_kv}, TENSOR_NOT_REQUIRED);
- layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
- layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
- layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
- layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
- layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
- layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
- layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
- layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
- }
- } break;
- case LLM_ARCH_WAVTOKENIZER_DEC:
- {
- tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {hparams.n_embd_features, n_vocab}, 0);
- conv1d = create_tensor(tn(LLM_TENSOR_CONV1D, "weight"), {7, hparams.n_embd_features, hparams.posnet.n_embd}, 0);
- conv1d_b = create_tensor(tn(LLM_TENSOR_CONV1D, "bias"), {1, hparams.posnet.n_embd}, 0);
- // posnet
- {
- const int64_t n_embd = hparams.posnet.n_embd;
- for (uint32_t i = 0; i < hparams.posnet.n_layer; ++i) {
- auto & layer = layers[i].posnet;
- // posnet:
- //
- // - resnet
- // - resnet
- // - attn
- // - resnet
- // - resnet
- // - norm
- //
- switch (i) {
- case 0:
- case 1:
- case 3:
- case 4:
- {
- layer.norm1 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "weight", i), {1, n_embd}, 0);
- layer.norm1_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "bias", i), {1, n_embd}, 0);
- layer.conv1 = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "weight", i), {3, n_embd, n_embd}, 0);
- layer.conv1_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "bias", i), {1, n_embd}, 0);
- layer.norm2 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "weight", i), {1, n_embd}, 0);
- layer.norm2_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "bias", i), {1, n_embd}, 0);
- layer.conv2 = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "weight", i), {3, n_embd, n_embd}, 0);
- layer.conv2_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "bias", i), {1, n_embd}, 0);
- } break;
- case 2:
- {
- layer.attn_norm = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "weight", i), {1, n_embd}, 0);
- layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "bias", i), {1, n_embd}, 0);
- layer.attn_q = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_Q, "weight", i), {1, n_embd, n_embd}, 0);
- layer.attn_q_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_Q, "bias", i), {1, n_embd}, 0);
- layer.attn_k = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_K, "weight", i), {1, n_embd, n_embd}, 0);
- layer.attn_k_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_K, "bias", i), {1, n_embd}, 0);
- layer.attn_v = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_V, "weight", i), {1, n_embd, n_embd}, 0);
- layer.attn_v_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_V, "bias", i), {1, n_embd}, 0);
- layer.attn_o = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_OUT, "weight", i), {1, n_embd, n_embd}, 0);
- layer.attn_o_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_OUT, "bias", i), {1, n_embd}, 0);
- } break;
- case 5:
- {
- layer.norm = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "weight", i), {1, n_embd}, 0);
- layer.norm_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "bias", i), {1, n_embd}, 0);
- } break;
- default: GGML_ABORT("unknown posnet layer");
- };
- }
- }
- GGML_ASSERT(hparams.posnet.n_embd == hparams.convnext.n_embd);
- tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {hparams.posnet.n_embd}, 0);
- tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {hparams.posnet.n_embd}, 0);
- // convnext
- {
- const int64_t n_embd = hparams.convnext.n_embd;
- for (uint32_t i = 0; i < hparams.convnext.n_layer; ++i) {
- auto & layer = layers[i].convnext;
- layer.dw = create_tensor(tn(LLM_TENSOR_CONVNEXT_DW, "weight", i), {7, 1, n_embd}, 0);
- layer.dw_b = create_tensor(tn(LLM_TENSOR_CONVNEXT_DW, "bias", i), {1, n_embd}, 0);
- layer.norm = create_tensor(tn(LLM_TENSOR_CONVNEXT_NORM, "weight", i), {n_embd}, 0);
- layer.norm_b = create_tensor(tn(LLM_TENSOR_CONVNEXT_NORM, "bias", i), {n_embd}, 0);
- layer.pw1 = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW1, "weight", i), {n_embd, n_ff}, 0);
- layer.pw1_b = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW1, "bias", i), {n_ff}, 0);
- layer.pw2 = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW2, "weight", i), {n_ff, n_embd}, 0);
- layer.pw2_b = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW2, "bias", i), {n_embd}, 0);
- layer.gamma = create_tensor(tn(LLM_TENSOR_CONVNEXT_GAMMA, "weight", i), {n_embd}, 0);
- }
- // output
- output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
- output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
- }
- output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {hparams.convnext.n_embd, n_embd}, 0);
- output_b = create_tensor(tn(LLM_TENSOR_OUTPUT, "bias"), {n_embd}, 0);
- } break;
- case LLM_ARCH_BAILINGMOE:
- {
- const int64_t n_ff_exp = hparams.n_ff_exp;
- const int64_t n_expert_shared = hparams.n_expert_shared;
- tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
- // output
- output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
- output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
- for (int i = 0; i < n_layer; ++i) {
- auto & layer = layers[i];
- layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
- layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_head * n_rot}, 0);
- layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_head_kv * n_rot}, 0);
- layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_head_kv * n_rot}, 0);
- layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_head * n_rot, n_embd}, 0);
- layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
- layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
- if (n_expert == 0) {
- throw std::runtime_error("n_expert must be > 0");
- }
- if (n_expert_used == 0) {
- throw std::runtime_error("n_expert_used must be > 0");
- }
- layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
- layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
- layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
- layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
- layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_exp * n_expert_shared, n_embd}, 0);
- layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
- }
- } break;
- default:
- throw std::runtime_error("unknown architecture");
- }
- if (n_moved_tensors > 0) {
- LLAMA_LOG_DEBUG("%s: tensor '%s' (%s) (and %d others) cannot be used with preferred buffer type %s, using %s instead\n",
- __func__, first_moved_tensor->name, ggml_type_name(first_moved_tensor->type), n_moved_tensors - 1,
- ggml_backend_buft_name(first_moved_from_buft), ggml_backend_buft_name(first_moved_to_buft));
- }
- }
- ml.done_getting_tensors();
- ml.init_mappings(true, use_mlock ? &pimpl->mlock_mmaps : nullptr);
- pimpl->mappings.reserve(ml.mappings.size());
- // create the backend buffers
- std::vector<std::pair<ggml_context *, llama_buf_map>> ctx_bufs;
- ctx_bufs.reserve(ctx_map.size());
- // Ensure we have enough capacity for the maximum backend buffer we will potentially create
- const size_t n_max_backend_buffer = ctx_map.size() * ml.files.size();
- pimpl->bufs.reserve(n_max_backend_buffer);
- for (auto & it : ctx_map) {
- ggml_backend_buffer_type_t buft = it.first;
- ggml_context * ctx = it.second;
- // skip contexts without tensors
- if (ggml_get_first_tensor(ctx) == nullptr) {
- continue;
- }
- llama_buf_map buf_map;
- buf_map.reserve(n_max_backend_buffer);
- // check if it is possible to use buffer_from_host_ptr with this buffer type
- ggml_backend_dev_t dev = ggml_backend_buft_get_device(buft);
- if (!dev) {
- // FIXME: workaround for CPU backend buft having a NULL device
- dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
- }
- ggml_backend_dev_props props;
- ggml_backend_dev_get_props(dev, &props);
- bool buffer_from_host_ptr_supported = props.caps.buffer_from_host_ptr;
- bool is_default_buft = buft == ggml_backend_dev_buffer_type(dev);
- if (ml.use_mmap && use_mmap_buffer && buffer_from_host_ptr_supported && is_default_buft) {
- for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
- // only the mmap region containing the tensors in the model is mapped to the backend buffer
- // this is important for metal with apple silicon: if the entire model could be mapped to a metal buffer, then we could just use metal for all layers
- // this allows using partial offloading when the model size exceeds the metal buffer size, but not the RAM size
- void * addr = nullptr;
- size_t first, last; // NOLINT
- ml.get_mapping_range(&first, &last, &addr, idx, ctx);
- if (first >= last) {
- continue;
- }
- const size_t max_size = ggml_get_max_tensor_size(ctx);
- ggml_backend_buffer_t buf = ggml_backend_dev_buffer_from_host_ptr(dev, (char *) addr + first, last - first, max_size);
- if (buf == nullptr) {
- throw std::runtime_error(format("unable to allocate %s buffer", ggml_backend_buft_name(buft)));
- }
- pimpl->bufs.emplace_back(buf);
- buf_map.emplace(idx, buf);
- }
- }
- else {
- ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
- if (buf == nullptr) {
- throw std::runtime_error(format("unable to allocate %s buffer", ggml_backend_buft_name(buft)));
- }
- pimpl->bufs.emplace_back(buf);
- if (use_mlock && ggml_backend_buffer_is_host(buf)) {
- pimpl->mlock_bufs.emplace_back(new llama_mlock);
- auto & mlock_buf = pimpl->mlock_bufs.back();
- mlock_buf->init (ggml_backend_buffer_get_base(buf));
- mlock_buf->grow_to(ggml_backend_buffer_get_size(buf));
- }
- for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
- buf_map.emplace(idx, buf);
- }
- }
- if (pimpl->bufs.empty()) {
- throw std::runtime_error("failed to allocate buffer");
- }
- for (auto & buf : buf_map) {
- // indicate that this buffer contains weights
- // this is used by ggml_backend_sched to improve op scheduling: ops that use a weight are preferably scheduled to the backend that contains the weight
- ggml_backend_buffer_set_usage(buf.second, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
- }
- ctx_bufs.emplace_back(ctx, buf_map);
- }
- if (llama_supports_gpu_offload()) {
- const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
- LLAMA_LOG_INFO("%s: offloading %d repeating layers to GPU\n", __func__, n_gpu);
- if (n_gpu_layers > (int) hparams.n_layer) {
- LLAMA_LOG_INFO("%s: offloading output layer to GPU\n", __func__);
- }
- const int max_backend_supported_layers = hparams.n_layer + 1;
- const int max_offloadable_layers = hparams.n_layer + 1;
- LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n", __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers);
- }
- // print memory requirements per buffer type
- for (auto & buf : pimpl->bufs) {
- LLAMA_LOG_INFO("%s: %12s model buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf.get()), ggml_backend_buffer_get_size(buf.get()) / 1024.0 / 1024.0);
- }
- // populate tensors_by_name
- for (auto & ctx : pimpl->ctxs) {
- for (auto * cur = ggml_get_first_tensor(ctx.get()); cur != NULL; cur = ggml_get_next_tensor(ctx.get(), cur)) {
- tensors_by_name.emplace_back(ggml_get_name(cur), cur);
- }
- }
- // load tensor data
- for (auto & it : ctx_bufs) {
- ggml_context * ctx = it.first;
- auto & bufs = it.second;
- if (!ml.load_all_data(ctx, bufs, use_mlock ? &pimpl->mlock_mmaps : NULL, params.progress_callback, params.progress_callback_user_data)) {
- return false;
- }
- }
- if (use_mmap_buffer) {
- for (auto & mapping : ml.mappings) {
- pimpl->mappings.emplace_back(std::move(mapping));
- }
- }
- return true;
- }
- // Returns the name of this model's architecture, as produced by llm_arch_name(arch).
- std::string llama_model::arch_name() const {
-     std::string arch_str = llm_arch_name(arch);
-     return arch_str;
- }
- // Returns the name of this model's type, as produced by llm_type_name(type).
- std::string llama_model::type_name() const {
-     std::string type_str = llm_type_name(type);
-     return type_str;
- }
- // Returns a copy of the description string stored in the private implementation (desc_str).
- std::string llama_model::desc() const {
-     const std::string & description = pimpl->desc_str;
-     return description;
- }
- // Returns the byte count recorded in pimpl->n_bytes.
- // NOTE(review): presumably the total size of the model's tensor data - the value is set elsewhere.
- size_t llama_model::size() const {
-     const size_t total_bytes = pimpl->n_bytes;
-     return total_bytes;
- }
- // Returns the number of (name, tensor) entries currently held in tensors_by_name.
- size_t llama_model::n_tensors() const {
-     const auto & named_tensors = tensors_by_name;
-     return named_tensors.size();
- }
- // Returns the number of entries in the model's device list.
- size_t llama_model::n_devices() const {
-     const auto & device_list = devices;
-     return device_list.size();
- }
- // Returns the element count recorded in pimpl->n_elements
- // (print_info() reports this same value as "model params").
- uint64_t llama_model::n_elements() const {
-     const uint64_t total_elements = pimpl->n_elements;
-     return total_elements;
- }
- // Logs a summary of the model at INFO level: architecture, hyper-parameters (skipped when
- // hparams.vocab_only is set), model type, parameter count, general name, a few
- // architecture-specific values, and finally the vocabulary info via vocab.print_info().
- //
- // LLAMA_ROPE_SCALING_TYPES.at() is used for the rope scaling name, so an unknown
- // rope_scaling_type_train value propagates whatever that lookup throws.
- void llama_model::print_info() const {
-     const char * rope_scaling_type = LLAMA_ROPE_SCALING_TYPES.at(hparams.rope_scaling_type_train);
-     // Renders f(0) .. f(n-1) as a single value when every entry is equal, or as a bracketed,
-     // comma-separated list otherwise. An empty range (n == 0) renders as "[]".
-     auto print_f = [](const std::function<uint32_t(uint32_t)> & f, uint32_t n) {
-         if (n == 0) {
-             // nothing was sampled: bail out before v[0] is read on an empty vector
-             return std::string("[]");
-         }
-         bool is_var = false;
-         std::vector<uint32_t> v;
-         v.reserve(n);
-         for (uint32_t i = 0; i < n; ++i) {
-             v.push_back(f(i));
-             if (v[i] != v[0]) {
-                 is_var = true;
-             }
-         }
-         std::stringstream ss;
-         if (is_var) {
-             ss << "[";
-             for (uint32_t i = 0; i < n; ++i) {
-                 ss << v[i];
-                 if (i < n - 1) {
-                     ss << ", ";
-                 }
-             }
-             ss << "]";
-         } else {
-             ss << v[0];
-         }
-         return ss.str();
-     };
-     // hparams
-     LLAMA_LOG_INFO("%s: arch = %s\n", __func__, arch_name().c_str());
-     LLAMA_LOG_INFO("%s: vocab_only = %d\n", __func__, hparams.vocab_only);
-     if (!hparams.vocab_only) {
-         LLAMA_LOG_INFO("%s: n_ctx_train = %u\n", __func__, hparams.n_ctx_train);
-         LLAMA_LOG_INFO("%s: n_embd = %u\n", __func__, hparams.n_embd);
-         LLAMA_LOG_INFO("%s: n_layer = %u\n", __func__, hparams.n_layer);
-         // per-layer values are collapsed to a scalar by print_f when they are uniform
-         LLAMA_LOG_INFO("%s: n_head = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_head(il); }, hparams.n_layer).c_str());
-         LLAMA_LOG_INFO("%s: n_head_kv = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_head_kv(il); }, hparams.n_layer).c_str());
-         LLAMA_LOG_INFO("%s: n_rot = %u\n", __func__, hparams.n_rot);
-         LLAMA_LOG_INFO("%s: n_swa = %u\n", __func__, hparams.n_swa);
-         LLAMA_LOG_INFO("%s: n_swa_pattern = %u\n", __func__, hparams.n_swa_pattern);
-         LLAMA_LOG_INFO("%s: n_embd_head_k = %u\n", __func__, hparams.n_embd_head_k);
-         LLAMA_LOG_INFO("%s: n_embd_head_v = %u\n", __func__, hparams.n_embd_head_v);
-         LLAMA_LOG_INFO("%s: n_gqa = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_gqa(il); }, hparams.n_layer).c_str());
-         LLAMA_LOG_INFO("%s: n_embd_k_gqa = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_embd_k_gqa(il); }, hparams.n_layer).c_str());
-         LLAMA_LOG_INFO("%s: n_embd_v_gqa = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_embd_v_gqa(il); }, hparams.n_layer).c_str());
-         LLAMA_LOG_INFO("%s: f_norm_eps = %.1e\n", __func__, hparams.f_norm_eps);
-         LLAMA_LOG_INFO("%s: f_norm_rms_eps = %.1e\n", __func__, hparams.f_norm_rms_eps);
-         LLAMA_LOG_INFO("%s: f_clamp_kqv = %.1e\n", __func__, hparams.f_clamp_kqv);
-         LLAMA_LOG_INFO("%s: f_max_alibi_bias = %.1e\n", __func__, hparams.f_max_alibi_bias);
-         LLAMA_LOG_INFO("%s: f_logit_scale = %.1e\n", __func__, hparams.f_logit_scale);
-         LLAMA_LOG_INFO("%s: f_attn_scale = %.1e\n", __func__, hparams.f_attention_scale);
-         LLAMA_LOG_INFO("%s: n_ff = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_ff(il); }, hparams.n_layer).c_str());
-         LLAMA_LOG_INFO("%s: n_expert = %u\n", __func__, hparams.n_expert);
-         LLAMA_LOG_INFO("%s: n_expert_used = %u\n", __func__, hparams.n_expert_used);
-         LLAMA_LOG_INFO("%s: causal attn = %d\n", __func__, hparams.causal_attn);
-         LLAMA_LOG_INFO("%s: pooling type = %d\n", __func__, hparams.pooling_type);
-         LLAMA_LOG_INFO("%s: rope type = %d\n", __func__, hparams.rope_type);
-         LLAMA_LOG_INFO("%s: rope scaling = %s\n", __func__, rope_scaling_type);
-         LLAMA_LOG_INFO("%s: freq_base_train = %.1f\n", __func__, hparams.rope_freq_base_train);
-         LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train);
-         LLAMA_LOG_INFO("%s: n_ctx_orig_yarn = %u\n", __func__, hparams.n_ctx_orig_yarn);
-         LLAMA_LOG_INFO("%s: rope_finetuned = %s\n", __func__, hparams.rope_finetuned ? "yes" : "unknown");
-         LLAMA_LOG_INFO("%s: ssm_d_conv = %u\n", __func__, hparams.ssm_d_conv);
-         LLAMA_LOG_INFO("%s: ssm_d_inner = %u\n", __func__, hparams.ssm_d_inner);
-         LLAMA_LOG_INFO("%s: ssm_d_state = %u\n", __func__, hparams.ssm_d_state);
-         LLAMA_LOG_INFO("%s: ssm_dt_rank = %u\n", __func__, hparams.ssm_dt_rank);
-         LLAMA_LOG_INFO("%s: ssm_dt_b_c_rms = %d\n", __func__, hparams.ssm_dt_b_c_rms);
-     }
-     LLAMA_LOG_INFO("%s: model type = %s\n", __func__, type_name().c_str());
-     // parameter count, scaled to the largest unit (T/B/M/K) that applies
-     if (pimpl->n_elements >= 1e12) {
-         LLAMA_LOG_INFO("%s: model params = %.2f T\n", __func__, pimpl->n_elements*1e-12);
-     } else if (pimpl->n_elements >= 1e9) {
-         LLAMA_LOG_INFO("%s: model params = %.2f B\n", __func__, pimpl->n_elements*1e-9);
-     } else if (pimpl->n_elements >= 1e6) {
-         LLAMA_LOG_INFO("%s: model params = %.2f M\n", __func__, pimpl->n_elements*1e-6);
-     } else {
-         LLAMA_LOG_INFO("%s: model params = %.2f K\n", __func__, pimpl->n_elements*1e-3);
-     }
-     // general kv
-     LLAMA_LOG_INFO("%s: general.name = %s\n", __func__, name.c_str());
-     // architecture-specific extras
-     if (arch == LLM_ARCH_DEEPSEEK) {
-         LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
-         LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
-         LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
-         LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
-     }
-     if (arch == LLM_ARCH_DEEPSEEK2) {
-         LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
-         LLAMA_LOG_INFO("%s: n_lora_q = %d\n", __func__, hparams.n_lora_q);
-         LLAMA_LOG_INFO("%s: n_lora_kv = %d\n", __func__, hparams.n_lora_kv);
-         LLAMA_LOG_INFO("%s: n_embd_head_k_mla = %d\n", __func__, hparams.n_embd_head_k_mla);
-         LLAMA_LOG_INFO("%s: n_embd_head_v_mla = %d\n", __func__, hparams.n_embd_head_v_mla);
-         LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
-         LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
-         LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
-         LLAMA_LOG_INFO("%s: expert_weights_norm = %d\n", __func__, hparams.expert_weights_norm);
-         LLAMA_LOG_INFO("%s: expert_gating_func = %s\n", __func__, llama_expert_gating_func_name((llama_expert_gating_func_type) hparams.expert_gating_func));
-         LLAMA_LOG_INFO("%s: rope_yarn_log_mul = %.4f\n", __func__, hparams.rope_yarn_log_mul);
-     }
-     if (arch == LLM_ARCH_QWEN2MOE) {
-         LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
-         LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
-     }
-     if (arch == LLM_ARCH_QWEN3MOE) {
-         LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
-     }
-     if (arch == LLM_ARCH_MINICPM || arch == LLM_ARCH_GRANITE || arch == LLM_ARCH_GRANITE_MOE) {
-         LLAMA_LOG_INFO("%s: f_embedding_scale = %f\n", __func__, hparams.f_embedding_scale);
-         LLAMA_LOG_INFO("%s: f_residual_scale = %f\n", __func__, hparams.f_residual_scale);
-         LLAMA_LOG_INFO("%s: f_attention_scale = %f\n", __func__, hparams.f_attention_scale);
-     }
-     if (arch == LLM_ARCH_BAILINGMOE) {
-         LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
-         LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
-         LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
-         LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
-         LLAMA_LOG_INFO("%s: expert_weights_norm = %d\n", __func__, hparams.expert_weights_norm);
-     }
-     vocab.print_info();
- }
- // Returns the backend device recorded for layer `il`.
- // The lookup goes through dev_layer.at(il), so an invalid index is reported by at()
- // rather than silently read.
- ggml_backend_dev_t llama_model::dev_layer(int il) const {
-     const auto & layer_entry = pimpl->dev_layer.at(il);
-     return layer_entry.dev;
- }
- // Returns the backend device recorded for the output layer (pimpl->dev_output).
- ggml_backend_dev_t llama_model::dev_output() const {
-     const auto & output_entry = pimpl->dev_output;
-     return output_entry.dev;
- }
- // Reports whether device `dev` supports the op produced by `fn` when the op's source
- // tensors are attached to a buffer of type `buft`.
- //
- //   buft - buffer type to probe
- //   dev  - device queried through ggml_backend_dev_supports_op()
- //   fn   - callable taking a ggml_context * and returning the op tensor to test
- //
- // Throws std::runtime_error if the temporary ggml context cannot be created.
- template<typename F>
- static bool buft_supported(ggml_backend_buffer_type_t buft, ggml_backend_dev_t dev, F & fn) {
-     // no_alloc context with room for the metadata of 8 tensors
-     ggml_init_params init_params = {
-         /*.mem_size   =*/ ggml_tensor_overhead()*8,
-         /*.mem_buffer =*/ nullptr,
-         /*.no_alloc   =*/ true,
-     };
-     ggml_context_ptr probe_ctx { ggml_init(init_params) };
-     if (!probe_ctx) {
-         throw std::runtime_error(format("failed to create ggml context"));
-     }
-     // zero-sized buffer of the requested type; it only serves to tag the op's sources
-     ggml_backend_buffer_ptr probe_buf { ggml_backend_buft_alloc_buffer(buft, 0) };
-     ggml_tensor * op = fn(probe_ctx.get());
-     for (int s = 0; s < GGML_MAX_SRC; ++s) {
-         ggml_tensor * src = op->src[s];
-         if (src == nullptr) {
-             continue;
-         }
-         assert(src->buffer == nullptr);
-         src->buffer = probe_buf.get();
-     }
-     return ggml_backend_dev_supports_op(dev, op);
- }
- // Returns the buffer type of the first (device, buffer type) pair in `buft_list` for which
- // buft_supported() accepts the op built by `fn`.
- // Throws std::runtime_error when no pair in the list qualifies.
- template<typename F>
- static ggml_backend_buffer_type_t select_buft(const buft_list_t & buft_list, const F & fn) {
-     const auto match = std::find_if(buft_list.begin(), buft_list.end(),
-         [&](const auto & entry) {
-             // entry.first is the device, entry.second the buffer type
-             return buft_supported(entry.second, entry.first, fn);
-         });
-     if (match == buft_list.end()) {
-         throw std::runtime_error(format("no suitable buffer type found"));
-     }
-     return match->second;
- }
- // Picks a buffer type for layer `il` from that layer's buffer type list.
- // The probe op is an F32 add of two 1-D tensors of hparams.n_embd elements; the first
- // list entry that supports it is returned by ::select_buft().
- ggml_backend_buffer_type_t llama_model::select_buft(int il) const {
-     const auto build_probe_op = [&](ggml_context * ctx) {
-         ggml_tensor * cur       = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hparams.n_embd);
-         ggml_tensor * layer_dir = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hparams.n_embd);
-         return ggml_add(ctx, cur, layer_dir);
-     };
-     const auto & candidates = *pimpl->dev_layer.at(il).buft_list;
-     return ::select_buft(candidates, build_probe_op);
- }
- // Returns the has_tensor_overrides flag stored in the private implementation.
- bool llama_model::has_tensor_overrides() const {
-     const bool overridden = pimpl->has_tensor_overrides;
-     return overridden;
- }
- // Looks up a tensor by exact name with a linear scan over tensors_by_name.
- // Returns the first matching tensor, or nullptr when no entry has that name.
- const ggml_tensor * llama_model::get_tensor(const char * name) const {
-     for (const std::pair<std::string, ggml_tensor *> & entry : tensors_by_name) {
-         if (entry.first == name) {
-             return entry.second;
-         }
-     }
-     return nullptr;
- }
- // Selects the RoPE frequency-factor tensor for layer `il`.
- // A layer's rope_freqs tensor takes precedence when present; otherwise the long factors
- // are used when n_ctx_per_seq exceeds hparams.n_ctx_orig_yarn, and the short factors
- // when it does not. The returned pointer may be null if the chosen tensor is unset.
- ggml_tensor * llama_model::get_rope_factors(uint32_t n_ctx_per_seq, int il) const {
-     const auto & layer = layers[il];
-     if (layer.rope_freqs != nullptr) {
-         return layer.rope_freqs;
-     }
-     const bool beyond_orig_ctx = n_ctx_per_seq > hparams.n_ctx_orig_yarn;
-     return beyond_orig_ctx ? layer.rope_long : layer.rope_short;
- }
- // Graph builder for LLaMA-style decoder models. The constructor assembles the complete
- // forward graph: token embedding, n_layer blocks of (RMS norm -> self-attention -> residual
- // -> RMS norm -> FFN or MoE -> residual), final RMS norm and the lm_head projection.
- // It also carries the LLM_ARCH_LLAMA4 specifics (optional no-RoPE layers, attention
- // temperature scaling, shared experts) and the Granite scaling factors.
- struct llm_build_llama : public llm_graph_context {
-     // model  - source of all weight tensors
-     // params - graph parameters forwarded to llm_graph_context
-     // gf     - compute graph that receives the attention ops and the final logits
-     // On return res->t_embd holds the normed hidden state and res->t_logits the logits.
-     llm_build_llama(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
-         const int64_t n_embd_head = hparams.n_embd_head_v;
-         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-         GGML_ASSERT(n_embd_head == hparams.n_rot);
-         ggml_tensor * cur;
-         ggml_tensor * inpL;
-         inpL = build_inp_embd(model.tok_embd);
-         // inp_pos - contains the positions
-         ggml_tensor * inp_pos = build_inp_pos();
-         // temperature tuning
-         ggml_tensor * inp_attn_scale = nullptr;
-         if (arch == LLM_ARCH_LLAMA4) {
-             inp_attn_scale = build_inp_attn_scale();
-         }
-         auto * inp_attn = build_attn_inp_kv_unified();
-         // an f_attention_scale of exactly 0 means "not set": fall back to 1/sqrt(head dim)
-         const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
-         for (int il = 0; il < n_layer; ++il) {
-             ggml_tensor * inpSA = inpL;
-             // for llama4, every n_no_rope_layer_step-th layer skips RoPE
-             bool use_rope = arch == LLM_ARCH_LLAMA4
-                 ? (il + 1) % hparams.n_no_rope_layer_step != 0
-                 : true;
-             // norm
-             cur = build_norm(inpL,
-                     model.layers[il].attn_norm, NULL,
-                     LLM_NORM_RMS, il);
-             cb(cur, "attn_norm", il);
-             // self-attention
-             {
-                 // rope freq factors for llama3; may return nullptr for llama2 and other models
-                 ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
-                 // compute Q and K and RoPE them
-                 ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
-                 cb(Qcur, "Qcur", il);
-                 if (model.layers[il].bq) {
-                     Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
-                     cb(Qcur, "Qcur", il);
-                 }
-                 ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
-                 cb(Kcur, "Kcur", il);
-                 if (model.layers[il].bk) {
-                     Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
-                     cb(Kcur, "Kcur", il);
-                 }
-                 ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
-                 cb(Vcur, "Vcur", il);
-                 if (model.layers[il].bv) {
-                     Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
-                     cb(Vcur, "Vcur", il);
-                 }
-                 Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
-                 Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
-                 Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
-                 if (use_rope) {
-                     Qcur = ggml_rope_ext(
-                             ctx0, Qcur, inp_pos, rope_factors,
-                             n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
-                             ext_factor, attn_factor, beta_fast, beta_slow
-                             );
-                     Kcur = ggml_rope_ext(
-                             ctx0, Kcur, inp_pos, rope_factors,
-                             n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
-                             ext_factor, attn_factor, beta_fast, beta_slow
-                             );
-                 } else if (inp_attn_scale) {
-                     // no-RoPE layer: apply the attention temperature scale to Q instead
-                     Qcur = ggml_mul(ctx0, Qcur, inp_attn_scale);
-                 }
-                 cb(Qcur, "Qcur", il);
-                 cb(Kcur, "Kcur", il);
-                 cb(Vcur, "Vcur", il);
-                 if (arch == LLM_ARCH_LLAMA4 && use_rope && hparams.use_kq_norm) {
-                     // Llama4TextL2Norm
-                     Qcur = ggml_rms_norm(ctx0, Qcur, hparams.f_norm_rms_eps);
-                     Kcur = ggml_rms_norm(ctx0, Kcur, hparams.f_norm_rms_eps);
-                     cb(Qcur, "Qcur_normed", il);
-                     cb(Kcur, "Kcur_normed", il);
-                 }
-                 cur = build_attn(inp_attn, gf,
-                         model.layers[il].wo, model.layers[il].bo,
-                         Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
-                 cb(cur, "attn_out", il);
-             }
-             if (il == n_layer - 1) {
-                 // skip computing output for unused tokens
-                 ggml_tensor * inp_out_ids = build_inp_out_ids();
-                 cur = ggml_get_rows(ctx0, cur, inp_out_ids);
-                 inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
-             }
-             // For Granite architecture
-             if (hparams.f_residual_scale) {
-                 cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
-             }
-             ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
-             cb(ffn_inp, "ffn_inp", il);
-             // feed-forward network (non-MoE)
-             if (model.layers[il].ffn_gate_inp == nullptr) {
-                 cur = build_norm(ffn_inp,
-                         model.layers[il].ffn_norm, NULL,
-                         LLM_NORM_RMS, il);
-                 cb(cur, "ffn_norm", il);
-                 cur = build_ffn(cur,
-                         model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
-                         model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
-                         model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
-                         NULL,
-                         LLM_FFN_SILU, LLM_FFN_PAR, il);
-                 cb(cur, "ffn_out", il);
-             } else if (arch == LLM_ARCH_LLAMA4) {
-                 // llama4 MoE
-                 ggml_tensor * ffn_inp_normed = build_norm(ffn_inp,
-                         model.layers[il].ffn_norm, NULL,
-                         LLM_NORM_RMS, il);
-                 // label the freshly normed tensor; `cur` still refers to the attention output here
-                 cb(ffn_inp_normed, "ffn_norm", il);
-                 ggml_tensor * moe_out = build_moe_ffn(ffn_inp_normed,
-                         model.layers[il].ffn_gate_inp,
-                         model.layers[il].ffn_up_exps,
-                         model.layers[il].ffn_gate_exps,
-                         model.layers[il].ffn_down_exps,
-                         nullptr,
-                         n_expert, n_expert_used,
-                         LLM_FFN_SILU, false,
-                         false, 0.0,
-                         LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID,
-                         il);
-                 // Shared experts
-                 ggml_tensor * shexp_out = build_ffn(ffn_inp_normed,
-                         model.layers[il].ffn_up_shexp, NULL, NULL,
-                         model.layers[il].ffn_gate_shexp, NULL, NULL,
-                         model.layers[il].ffn_down_shexp, NULL, NULL,
-                         NULL,
-                         LLM_FFN_SILU, LLM_FFN_PAR, il);
-                 cb(shexp_out, "ffn_moe_shexp", il);
-                 // routed and shared expert outputs are summed
-                 cur = ggml_add(ctx0, moe_out, shexp_out);
-                 cb(cur, "ffn_moe_out_merged", il);
-             } else {
-                 // MoE branch
-                 cur = build_norm(ffn_inp,
-                         model.layers[il].ffn_norm, NULL,
-                         LLM_NORM_RMS, il);
-                 cb(cur, "ffn_norm", il);
-                 cur = build_moe_ffn(cur,
-                         model.layers[il].ffn_gate_inp,
-                         model.layers[il].ffn_up_exps,
-                         model.layers[il].ffn_gate_exps,
-                         model.layers[il].ffn_down_exps,
-                         nullptr,
-                         n_expert, n_expert_used,
-                         LLM_FFN_SILU, true,
-                         false, 0.0,
-                         LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
-                         il);
-                 cb(cur, "ffn_moe_out", il);
-             }
-             // For Granite architecture
-             if (hparams.f_residual_scale) {
-                 cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
-             }
-             cur = ggml_add(ctx0, cur, ffn_inp);
-             cb(cur, "ffn_out", il);
-             cur = build_cvec(cur, il);
-             cb(cur, "l_out", il);
-             // input for next layer
-             inpL = cur;
-         }
-         cur = inpL;
-         cur = build_norm(cur,
-                 model.output_norm, NULL,
-                 LLM_NORM_RMS, -1);
-         cb(cur, "result_norm", -1);
-         res->t_embd = cur;
-         // lm_head
-         cur = build_lora_mm(model.output, cur);
-         // For Granite architecture
-         if (hparams.f_logit_scale) {
-             cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_logit_scale);
-         }
-         cb(cur, "result_output", -1);
-         res->t_logits = cur;
-         ggml_build_forward_expand(gf, cur);
-     }
- };
- struct llm_build_deci : public llm_graph_context { // Builds the whole forward graph in its constructor; per-layer n_head / n_head_kv / n_ff select full attention, a wo-only projection, no attention, or a fully skipped layer.
- llm_build_deci(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { // model: source of all weights; params: forwarded to llm_graph_context; gf: graph that receives the final tensor
- const int64_t n_embd_head = hparams.n_embd_head_v;
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); // a single head size is used to reshape Q, K and V below
- GGML_ASSERT(n_embd_head == hparams.n_rot);
- ggml_tensor * cur;
- ggml_tensor * inpL;
- inpL = build_inp_embd(model.tok_embd);
- // inp_pos - contains the positions
- ggml_tensor * inp_pos = build_inp_pos();
- auto * inp_attn = build_attn_inp_kv_unified();
- const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale; // a zero f_attention_scale selects the default 1/sqrt(n_embd_head)
- for (int il = 0; il < n_layer; ++il) {
- ggml_tensor * inpSA = inpL; // layer input, kept for the residual add after attention
- const int64_t n_head_kv = hparams.n_head_kv(il);
- const int64_t n_head = hparams.n_head(il);
- const int64_t n_ff = hparams.n_ff(il);
- if (n_head == 0) {
- // attention-free layer of Llama-3_1-Nemotron-51B
- cur = inpL; // no attn_norm is applied on this path
- } else {
- // norm
- cur = build_norm(inpL,
- model.layers[il].attn_norm, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "attn_norm", il);
- }
- if (n_head > 0 && n_head_kv == 0) {
- // "linear attention" of Llama-3_1-Nemotron-51B
- cur = build_lora_mm(model.layers[il].wo, cur); // only the wo projection is applied to the normed input
- cb(cur, "wo", il);
- } else if (n_head > 0) {
- // self-attention
- // rope freq factors for llama3; may return nullptr for llama2 and other models
- ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
- // compute Q and K and RoPE them
- ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
- cb(Qcur, "Qcur", il);
- if (model.layers[il].bq) { // Q/K/V biases are optional and added only when present
- Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
- cb(Qcur, "Qcur", il);
- }
- ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
- cb(Kcur, "Kcur", il);
- if (model.layers[il].bk) {
- Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
- cb(Kcur, "Kcur", il);
- }
- ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
- cb(Vcur, "Vcur", il);
- if (model.layers[il].bv) {
- Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
- cb(Vcur, "Vcur", il);
- }
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); // uses the per-layer n_head / n_head_kv read above
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
- Qcur = ggml_rope_ext(
- ctx0, Qcur, inp_pos, rope_factors,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
- Kcur = ggml_rope_ext(
- ctx0, Kcur, inp_pos, rope_factors,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
- cb(Qcur, "Qcur", il);
- cb(Kcur, "Kcur", il);
- cb(Vcur, "Vcur", il);
- cur = build_attn(inp_attn, gf,
- model.layers[il].wo, model.layers[il].bo,
- Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
- }
- if (il == n_layer - 1) {
- // skip computing output for unused tokens
- ggml_tensor * inp_out_ids = build_inp_out_ids();
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
- inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
- }
- // FFN-free layer of Llama-3_1-Nemotron-Ultra-253B
- if (n_head == 0 && n_ff == 0) {
- continue; // skips the rest of the layer, so inpL is carried over unchanged. NOTE(review): on the last layer the get_rows results above are discarded as well - confirm this is intended
- }
- // For Granite architecture
- if (hparams.f_residual_scale) { // applied only when the scale is non-zero
- cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
- }
- // modified to support attention-free layer of Llama-3_1-Nemotron-51B
- ggml_tensor * ffn_inp = cur; // without attention, the (possibly scaled) cur is itself the FFN input
- if (n_head > 0) {
- ffn_inp = ggml_add(ctx0, cur, inpSA);
- cb(ffn_inp, "ffn_inp", il);
- }
- // feed-forward network
- if (model.layers[il].ffn_gate_inp == nullptr) { // there is no else branch: when ffn_gate_inp is set, cur is left as it was
- cur = build_norm(ffn_inp,
- model.layers[il].ffn_norm, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "ffn_norm", il);
- cur = build_ffn(cur,
- model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
- model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
- model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
- NULL,
- LLM_FFN_SILU, LLM_FFN_PAR, il);
- cb(cur, "ffn_out", il);
- }
- // For Granite architecture
- if (hparams.f_residual_scale) {
- cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
- }
- cur = ggml_add(ctx0, cur, ffn_inp); // second residual add
- cb(cur, "ffn_out", il);
- cur = build_cvec(cur, il);
- cb(cur, "l_out", il);
- // input for next layer
- inpL = cur;
- }
- cur = inpL;
- cur = build_norm(cur,
- model.output_norm, NULL,
- LLM_NORM_RMS, -1);
- cb(cur, "result_norm", -1);
- res->t_embd = cur; // the normed hidden state is exposed as the embedding output
- // lm_head
- cur = build_lora_mm(model.output, cur);
- // For Granite architecture
- if (hparams.f_logit_scale) {
- cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_logit_scale); // logits are multiplied by the reciprocal of f_logit_scale
- }
- cb(cur, "result_output", -1);
- res->t_logits = cur;
- ggml_build_forward_expand(gf, cur);
- }
- };
- struct llm_build_baichuan : public llm_graph_context { // Builds the whole forward graph in its constructor; rotary embedding is applied only when model.type is LLM_TYPE_7B.
- llm_build_baichuan(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { // model: source of all weights; params: forwarded to llm_graph_context; gf: graph that receives the final tensor
- const int64_t n_embd_head = hparams.n_embd_head_v;
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); // a single head size is used to reshape Q, K and V below
- GGML_ASSERT(n_embd_head == hparams.n_rot);
- ggml_tensor * cur;
- ggml_tensor * inpL;
- inpL = build_inp_embd(model.tok_embd);
- // inp_pos - contains the positions; only created for LLM_TYPE_7B, the only case below that consumes it
- ggml_tensor * inp_pos = model.type == LLM_TYPE_7B ? build_inp_pos() : nullptr;
- auto * inp_attn = build_attn_inp_kv_unified();
- for (int il = 0; il < n_layer; ++il) {
- ggml_tensor * inpSA = inpL; // layer input, kept for the residual add after attention
- cur = build_norm(inpL,
- model.layers[il].attn_norm, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "attn_norm", il);
- // self-attention
- {
- ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
- cb(Qcur, "Qcur", il);
- ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
- cb(Kcur, "Kcur", il);
- ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
- cb(Vcur, "Vcur", il);
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
- switch (model.type) {
- case LLM_TYPE_7B: // rotate Q and K; no frequency-factor tensor is passed (nullptr)
- Qcur = ggml_rope_ext(
- ctx0, Qcur, inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
- Kcur = ggml_rope_ext(
- ctx0, Kcur, inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
- break;
- case LLM_TYPE_13B: // Q and K are left unrotated; NOTE(review): positional information presumably comes from elsewhere - confirm
- break;
- default: // any other model type is rejected
- GGML_ABORT("fatal error");
- }
- cb(Qcur, "Qcur", il);
- cb(Kcur, "Kcur", il);
- cb(Vcur, "Vcur", il);
- cur = build_attn(inp_attn, gf,
- model.layers[il].wo, NULL,
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
- }
- if (il == n_layer - 1) {
- // skip computing output for unused tokens
- ggml_tensor * inp_out_ids = build_inp_out_ids();
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
- inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); // keep the residual aligned with the selected rows
- }
- ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
- cb(ffn_inp, "ffn_inp", il);
- // feed-forward network
- {
- cur = build_norm(ffn_inp,
- model.layers[il].ffn_norm, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "ffn_norm", il);
- cur = build_ffn(cur,
- model.layers[il].ffn_up, NULL, NULL,
- model.layers[il].ffn_gate, NULL, NULL,
- model.layers[il].ffn_down, NULL, NULL,
- NULL,
- LLM_FFN_SILU, LLM_FFN_PAR, il);
- cb(cur, "ffn_out", il);
- }
- cur = ggml_add(ctx0, cur, ffn_inp); // second residual add
- cur = build_cvec(cur, il);
- cb(cur, "l_out", il);
- // input for next layer
- inpL = cur;
- }
- cur = inpL;
- cur = build_norm(cur,
- model.output_norm, NULL,
- LLM_NORM_RMS, -1);
- cb(cur, "result_norm", -1);
- res->t_embd = cur; // the normed hidden state is exposed as the embedding output
- // lm_head
- cur = build_lora_mm(model.output, cur);
- cb(cur, "result_output", -1);
- res->t_logits = cur;
- ggml_build_forward_expand(gf, cur);
- }
- };
- struct llm_build_xverse : public llm_graph_context { // Builds the whole forward graph in its constructor: RMS-normed attention with RoPE followed by a gated SiLU FFN, each with a residual add.
- llm_build_xverse(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { // model: source of all weights; params: forwarded to llm_graph_context; gf: graph that receives the final tensor
- const int64_t n_embd_head = hparams.n_embd_head_v;
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); // a single head size is used to reshape Q, K and V below
- GGML_ASSERT(n_embd_head == hparams.n_rot);
- ggml_tensor * cur;
- ggml_tensor * inpL;
- inpL = build_inp_embd(model.tok_embd);
- // inp_pos - contains the positions
- ggml_tensor * inp_pos = build_inp_pos();
- auto * inp_attn = build_attn_inp_kv_unified();
- for (int il = 0; il < n_layer; ++il) {
- ggml_tensor * inpSA = inpL; // layer input, kept for the residual add after attention
- cur = build_norm(inpL,
- model.layers[il].attn_norm, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "attn_norm", il);
- // self-attention
- {
- ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
- cb(Qcur, "Qcur", il);
- ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
- cb(Kcur, "Kcur", il);
- ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
- cb(Vcur, "Vcur", il);
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
- Qcur = ggml_rope_ext( // Q and K are rotated; V is not. No frequency-factor tensor is passed (nullptr)
- ctx0, Qcur, inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
- Kcur = ggml_rope_ext(
- ctx0, Kcur, inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
- cb(Qcur, "Qcur", il);
- cb(Kcur, "Kcur", il);
- cb(Vcur, "Vcur", il);
- cur = build_attn(inp_attn, gf,
- model.layers[il].wo, NULL,
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
- }
- if (il == n_layer - 1) {
- // skip computing output for unused tokens
- ggml_tensor * inp_out_ids = build_inp_out_ids();
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
- inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); // keep the residual aligned with the selected rows
- }
- ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
- cb(ffn_inp, "ffn_inp", il);
- // feed-forward network
- {
- cur = build_norm(ffn_inp,
- model.layers[il].ffn_norm, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "ffn_norm", il);
- cur = build_ffn(cur,
- model.layers[il].ffn_up, NULL, NULL,
- model.layers[il].ffn_gate, NULL, NULL,
- model.layers[il].ffn_down, NULL, NULL,
- NULL,
- LLM_FFN_SILU, LLM_FFN_PAR, il);
- cb(cur, "ffn_out", il);
- }
- cur = ggml_add(ctx0, cur, ffn_inp); // second residual add
- cur = build_cvec(cur, il);
- cb(cur, "l_out", il);
- // input for next layer
- inpL = cur;
- }
- cur = inpL;
- cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
- cb(cur, "result_norm", -1);
- res->t_embd = cur; // the normed hidden state is exposed as the embedding output
- // lm_head
- cur = build_lora_mm(model.output, cur);
- cb(cur, "result_output", -1);
- res->t_logits = cur;
- ggml_build_forward_expand(gf, cur);
- }
- };
- struct llm_build_falcon : public llm_graph_context { // Builds the whole forward graph in its constructor; attention and FFN both read normed copies of the layer input and their outputs are summed with that input.
- llm_build_falcon(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { // model: source of all weights; params: forwarded to llm_graph_context; gf: graph that receives the final tensor
- const int64_t n_embd_head = hparams.n_embd_head_v;
- const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); // width of the K and V slices inside the fused wqkv output
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
- ggml_tensor * cur;
- ggml_tensor * inpL;
- inpL = build_inp_embd(model.tok_embd);
- // inp_pos - contains the positions
- ggml_tensor * inp_pos = build_inp_pos();
- auto * inp_attn = build_attn_inp_kv_unified();
- for (int il = 0; il < n_layer; ++il) {
- ggml_tensor * attn_norm; // kept separately because the FFN below consumes it
- attn_norm = build_norm(inpL,
- model.layers[il].attn_norm,
- model.layers[il].attn_norm_b,
- LLM_NORM, il);
- cb(attn_norm, "attn_norm", il);
- // self-attention
- {
- if (model.layers[il].attn_norm_2) {
- // Falcon-40B
- cur = build_norm(inpL, // a second norm of the same layer input feeds the attention
- model.layers[il].attn_norm_2,
- model.layers[il].attn_norm_2_b,
- LLM_NORM, il);
- cb(cur, "attn_norm_2", il);
- } else {
- cur = attn_norm;
- }
- cur = build_lora_mm(model.layers[il].wqkv, cur); // fused Q/K/V projection
- cb(cur, "wqkv", il);
- ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); // Q: n_embd values per row at offset 0; offsets are computed with sizeof(float)
- ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); // K: n_embd_gqa values starting right after Q
- ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); // V: n_embd_gqa values starting right after K
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
- // the rope mode is passed via rope_type (original note: "using mode = 2 for neox mode")
- Qcur = ggml_rope_ext(
- ctx0, Qcur, inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
- Kcur = ggml_rope_ext(
- ctx0, Kcur, inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
- cb(Qcur, "Qcur", il);
- cb(Kcur, "Kcur", il);
- cb(Vcur, "Vcur", il);
- cur = build_attn(inp_attn, gf,
- model.layers[il].wo, NULL,
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
- }
- if (il == n_layer - 1) {
- // skip computing output for unused tokens
- ggml_tensor * inp_out_ids = build_inp_out_ids();
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
- inpL = ggml_get_rows(ctx0, inpL, inp_out_ids); // inpL and attn_norm are both used below, so both are row-selected too
- attn_norm = ggml_get_rows(ctx0, attn_norm, inp_out_ids);
- }
- ggml_tensor * ffn_inp = cur; // despite the name this holds the attention output; it is only added back below
- // feed forward
- {
- cur = build_ffn(attn_norm, // !! use the attn norm, not the result
- model.layers[il].ffn_up, NULL, NULL,
- NULL, NULL, NULL,
- model.layers[il].ffn_down, NULL, NULL,
- NULL,
- LLM_FFN_GELU, LLM_FFN_SEQ, il);
- cb(cur, "ffn_out", il);
- }
- cur = ggml_add(ctx0, cur, ffn_inp); // FFN output + attention output
- cur = ggml_add(ctx0, cur, inpL); // + layer input (residual)
- cur = build_cvec(cur, il);
- cb(cur, "l_out", il);
- // input for next layer
- inpL = cur;
- }
- cur = inpL;
- // norm
- cur = build_norm(cur,
- model.output_norm,
- model.output_norm_b,
- LLM_NORM, -1);
- cb(cur, "result_norm", -1);
- res->t_embd = cur; // the normed hidden state is exposed as the embedding output
- cur = build_lora_mm(model.output, cur);
- cb(cur, "result_output", -1);
- res->t_logits = cur;
- ggml_build_forward_expand(gf, cur);
- }
- };
- struct llm_build_grok : public llm_graph_context { // Builds the whole forward graph in its constructor: scaled embeddings, RoPE attention, a GELU mixture-of-experts FFN with optional output norms, and scaled logits.
- llm_build_grok(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { // model: source of all weights; params: forwarded to llm_graph_context; gf: graph that receives the final tensor
- const int64_t n_embd_head = hparams.n_embd_head_v;
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); // a single head size is used to reshape Q, K and V below
- GGML_ASSERT(n_embd_head == hparams.n_rot);
- ggml_tensor * cur;
- ggml_tensor * inpL;
- inpL = build_inp_embd(model.tok_embd);
- // multiply by embedding_multiplier_scale of 78.38367176906169
- inpL = ggml_scale(ctx0, inpL, 78.38367176906169f); // hard-coded constant, not read from hparams
- // inp_pos - contains the positions
- ggml_tensor * inp_pos = build_inp_pos();
- auto * inp_attn = build_attn_inp_kv_unified();
- for (int il = 0; il < n_layer; ++il) {
- ggml_tensor * inpSA = inpL; // layer input, kept for the residual add after attention
- // norm
- cur = build_norm(inpL,
- model.layers[il].attn_norm, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "attn_norm", il);
- // self-attention
- {
- // compute Q and K and RoPE them
- ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
- cb(Qcur, "Qcur", il);
- if (model.layers[il].bq) { // Q/K/V biases are optional and added only when present
- Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
- cb(Qcur, "Qcur", il);
- }
- ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
- cb(Kcur, "Kcur", il);
- if (model.layers[il].bk) {
- Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
- cb(Kcur, "Kcur", il);
- }
- ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
- cb(Vcur, "Vcur", il);
- if (model.layers[il].bv) {
- Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
- cb(Vcur, "Vcur", il);
- }
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
- Qcur = ggml_rope_ext(
- ctx0, Qcur, inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
- Kcur = ggml_rope_ext(
- ctx0, Kcur, inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
- cb(Qcur, "Qcur", il);
- cb(Kcur, "Kcur", il);
- cb(Vcur, "Vcur", il);
- cur = build_attn(inp_attn, gf,
- model.layers[il].wo, model.layers[il].bo,
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il); // the scale argument is a literal 1.0f here, not 1/sqrt(n_embd_head)
- }
- if (il == n_layer - 1) {
- // skip computing output for unused tokens
- ggml_tensor * inp_out_ids = build_inp_out_ids();
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
- inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); // keep the residual aligned with the selected rows
- }
- // Grok
- // if attn_out_norm is present then apply it before adding the input
- if (model.layers[il].attn_out_norm) {
- cur = build_norm(cur,
- model.layers[il].attn_out_norm, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "attn_out_norm", il);
- }
- ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
- cb(ffn_inp, "ffn_inp", il);
- // feed-forward network
- // MoE branch
- cur = build_norm(ffn_inp,
- model.layers[il].ffn_norm, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "ffn_norm", il);
- cur = build_moe_ffn(cur, // every layer uses the expert FFN; there is no dense alternative in this builder
- model.layers[il].ffn_gate_inp,
- model.layers[il].ffn_up_exps,
- model.layers[il].ffn_gate_exps,
- model.layers[il].ffn_down_exps,
- nullptr,
- n_expert, n_expert_used,
- LLM_FFN_GELU, true,
- false, 0.0,
- LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
- il);
- cb(cur, "ffn_moe_out", il);
- // Grok
- // if layer_out_norm is present then apply it before adding the input
- // Idea: maybe ffn_out_norm is a better name
- if (model.layers[il].layer_out_norm) {
- cur = build_norm(cur,
- model.layers[il].layer_out_norm, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "layer_out_norm", il);
- }
- cur = ggml_add(ctx0, cur, ffn_inp); // second residual add
- cb(cur, "ffn_out", il);
- cur = build_cvec(cur, il);
- cb(cur, "l_out", il);
- // input for next layer
- inpL = cur;
- }
- cur = inpL;
- cur = build_norm(cur,
- model.output_norm, NULL,
- LLM_NORM_RMS, -1);
- cb(cur, "result_norm", -1);
- res->t_embd = cur; // the normed hidden state is exposed as the embedding output
- // lm_head
- cur = build_lora_mm(model.output, cur);
- // Grok
- // multiply logits by output_multiplier_scale of 0.5773502691896257
- cur = ggml_scale(ctx0, cur, 0.5773502691896257f); // hard-coded constant, not read from hparams
- cb(cur, "result_output", -1);
- res->t_logits = cur;
- ggml_build_forward_expand(gf, cur);
- }
- };
- struct llm_build_dbrx : public llm_graph_context { // Builds the whole forward graph in its constructor: fused and clamped QKV, RoPE attention, then a SiLU mixture-of-experts FFN.
- llm_build_dbrx(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { // model: source of all weights; params: forwarded to llm_graph_context; gf: graph that receives the final tensor
- const int64_t n_embd_head = hparams.n_embd_head_v;
- const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); // width of the K and V slices inside the fused wqkv output
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
- ggml_tensor * cur;
- ggml_tensor * inpL;
- inpL = build_inp_embd(model.tok_embd);
- // inp_pos - contains the positions
- ggml_tensor * inp_pos = build_inp_pos();
- auto * inp_attn = build_attn_inp_kv_unified();
- for (int il = 0; il < n_layer; ++il) {
- ggml_tensor * inpSA = inpL; // layer input, kept for the residual add after attention
- // norm
- cur = build_norm(inpL,
- model.layers[il].attn_norm, NULL,
- LLM_NORM, il);
- cb(cur, "attn_norm", il);
- // self-attention
- {
- ggml_tensor * Qcur = nullptr;
- ggml_tensor * Kcur = nullptr;
- ggml_tensor * Vcur = nullptr;
- cur = build_lora_mm(model.layers[il].wqkv, cur); // fused Q/K/V projection
- cb(cur, "wqkv", il);
- cur = ggml_clamp(ctx0, cur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv); // limit the fused output to [-f_clamp_kqv, f_clamp_kqv] before splitting
- cb(cur, "wqkv_clamped", il);
- Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); // Q: n_embd values per row at offset 0; offsets are computed with sizeof(float)
- Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); // K: n_embd_gqa values starting right after Q
- Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); // V: n_embd_gqa values starting right after K
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
- Qcur = ggml_rope_ext(
- ctx0, Qcur, inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
- Kcur = ggml_rope_ext(
- ctx0, Kcur, inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
- cb(Qcur, "Qcur", il);
- cb(Kcur, "Kcur", il);
- cb(Vcur, "Vcur", il);
- cur = build_attn(inp_attn, gf,
- model.layers[il].wo, NULL,
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
- }
- if (il == n_layer - 1) {
- // skip computing output for unused tokens
- ggml_tensor * inp_out_ids = build_inp_out_ids();
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
- inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); // keep the residual aligned with the selected rows
- }
- ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
- cb(ffn_inp, "ffn_inp", il);
- // feed-forward network
- // MoE branch
- cur = build_norm(ffn_inp, // the attn_out_norm weights serve as the norm in front of the expert FFN
- model.layers[il].attn_out_norm, NULL,
- LLM_NORM, il);
- cb(cur, "attn_out_norm", il);
- cur = build_moe_ffn(cur,
- model.layers[il].ffn_gate_inp,
- model.layers[il].ffn_up_exps,
- model.layers[il].ffn_gate_exps,
- model.layers[il].ffn_down_exps,
- nullptr,
- n_expert, n_expert_used,
- LLM_FFN_SILU, true,
- false, 0.0,
- LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
- il);
- cb(cur, "ffn_moe_out", il);
- cur = ggml_add(ctx0, cur, ffn_inp); // second residual add
- cb(cur, "ffn_out", il);
- cur = build_cvec(cur, il);
- cb(cur, "l_out", il);
- // input for next layer
- inpL = cur;
- }
- cur = inpL;
- cur = build_norm(cur,
- model.output_norm, NULL,
- LLM_NORM, -1);
- cb(cur, "result_norm", -1);
- res->t_embd = cur; // the normed hidden state is exposed as the embedding output
- // lm_head
- cur = build_lora_mm(model.output, cur);
- cb(cur, "result_output", -1);
- res->t_logits = cur;
- ggml_build_forward_expand(gf, cur);
- }
- };
- struct llm_build_starcoder : public llm_graph_context { // Builds the whole forward graph in its constructor: position embeddings added to the token embeddings, fused biased QKV attention without RoPE, and an ungated GELU FFN.
- llm_build_starcoder(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { // model: source of all weights; params: forwarded to llm_graph_context; gf: graph that receives the final tensor
- const int64_t n_embd_head = hparams.n_embd_head_v;
- const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); // width of the K and V slices inside the fused wqkv output
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- ggml_tensor * cur;
- ggml_tensor * inpL;
- inpL = build_inp_embd(model.tok_embd);
- // inp_pos - contains the positions
- ggml_tensor * inp_pos = build_inp_pos();
- auto * inp_attn = build_attn_inp_kv_unified();
- ggml_tensor * pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos); // look up one pos_embd row per position
- cb(pos, "pos_embd", -1);
- inpL = ggml_add(ctx0, inpL, pos);
- cb(inpL, "inpL", -1);
- for (int il = 0; il < n_layer; ++il) {
- cur = build_norm(inpL,
- model.layers[il].attn_norm,
- model.layers[il].attn_norm_b,
- LLM_NORM, il);
- cb(cur, "attn_norm", il);
- // self-attention
- {
- cur = build_lora_mm(model.layers[il].wqkv, cur); // fused Q/K/V projection
- cb(cur, "wqkv", il);
- cur = ggml_add(ctx0, cur, model.layers[il].bqkv); // the fused bias is added unconditionally
- cb(cur, "bqkv", il);
- ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); // Q: n_embd values per row at offset 0; offsets are computed with sizeof(float)
- ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); // K: n_embd_gqa values starting right after Q
- ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); // V: n_embd_gqa values starting right after K
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
- cb(Qcur, "Qcur", il);
- cb(Kcur, "Kcur", il);
- cb(Vcur, "Vcur", il);
- cur = build_attn(inp_attn, gf,
- model.layers[il].wo, model.layers[il].bo,
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
- }
- if (il == n_layer - 1) {
- // skip computing output for unused tokens
- ggml_tensor * inp_out_ids = build_inp_out_ids();
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
- inpL = ggml_get_rows(ctx0, inpL, inp_out_ids); // inpL is the residual here, so it is row-selected too
- }
- // add the input
- ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
- cb(ffn_inp, "ffn_inp", il);
- // FF
- {
- cur = build_norm(ffn_inp,
- model.layers[il].ffn_norm,
- model.layers[il].ffn_norm_b,
- LLM_NORM, il);
- cb(cur, "ffn_norm", il);
- cur = build_ffn(cur,
- model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
- NULL, NULL, NULL,
- model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
- NULL,
- LLM_FFN_GELU, LLM_FFN_SEQ, il);
- cb(cur, "ffn_out", il);
- }
- cur = ggml_add(ctx0, cur, ffn_inp); // second residual add
- cur = build_cvec(cur, il);
- cb(cur, "l_out", il);
- // input for next layer
- inpL = cur;
- }
- cur = build_norm(inpL,
- model.output_norm,
- model.output_norm_b,
- LLM_NORM, -1);
- cb(cur, "result_norm", -1);
- res->t_embd = cur; // the normed hidden state is exposed as the embedding output
- cur = build_lora_mm(model.output, cur);
- cb(cur, "result_output", -1);
- res->t_logits = cur;
- ggml_build_forward_expand(gf, cur);
- }
- };
- struct llm_build_refact : public llm_graph_context { // Builds the whole forward graph in its constructor: RMS-normed attention followed by a gated SiLU FFN; no position input or RoPE is created in this builder.
- llm_build_refact(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { // model: source of all weights; params: forwarded to llm_graph_context; gf: graph that receives the final tensor
- const int64_t n_embd_head = hparams.n_embd_head_v;
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); // a single head size is used to reshape Q, K and V below
- ggml_tensor * cur;
- ggml_tensor * inpL;
- inpL = build_inp_embd(model.tok_embd);
- auto * inp_attn = build_attn_inp_kv_unified();
- for (int il = 0; il < n_layer; ++il) {
- ggml_tensor * inpSA = inpL; // layer input, kept for the residual add after attention
- cur = build_norm(inpL,
- model.layers[il].attn_norm, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "attn_norm", il);
- // self-attention
- {
- ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
- cb(Qcur, "Qcur", il);
- ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
- cb(Kcur, "Kcur", il);
- ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
- cb(Vcur, "Vcur", il);
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); // Q/K/V go to build_attn unrotated; NOTE(review): positional handling presumably lives elsewhere - confirm
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
- cb(Qcur, "Qcur", il);
- cb(Kcur, "Kcur", il);
- cb(Vcur, "Vcur", il);
- cur = build_attn(inp_attn, gf,
- model.layers[il].wo, NULL,
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
- }
- if (il == n_layer - 1) {
- // skip computing output for unused tokens
- ggml_tensor * inp_out_ids = build_inp_out_ids();
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
- inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); // keep the residual aligned with the selected rows
- }
- ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
- cb(ffn_inp, "ffn_inp", il);
- // feed-forward network
- {
- cur = build_norm(ffn_inp,
- model.layers[il].ffn_norm, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "ffn_norm", il);
- cur = build_ffn(cur,
- model.layers[il].ffn_up, NULL, NULL,
- model.layers[il].ffn_gate, NULL, NULL,
- model.layers[il].ffn_down, NULL, NULL,
- NULL,
- LLM_FFN_SILU, LLM_FFN_PAR, il);
- cb(cur, "ffn_out", il);
- }
- cur = ggml_add(ctx0, cur, ffn_inp); // second residual add
- cur = build_cvec(cur, il);
- cb(cur, "l_out", il);
- // input for next layer
- inpL = cur;
- }
- cur = inpL;
- cur = build_norm(cur,
- model.output_norm, NULL,
- LLM_NORM_RMS, -1);
- cb(cur, "result_norm", -1);
- res->t_embd = cur; // the normed hidden state is exposed as the embedding output
- // lm_head
- cur = build_lora_mm(model.output, cur);
- cb(cur, "result_output", -1);
- res->t_logits = cur;
- ggml_build_forward_expand(gf, cur);
- }
- };
- struct llm_build_bert : public llm_graph_context { // Builds the whole encoder graph in its constructor for the BERT-family architectures handled below; the result is an embedding tensor only (no logits are produced).
- llm_build_bert(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { // model: source of all weights and of the arch switch; params: forwarded to llm_graph_context; gf: graph that receives the final tensor
- const int64_t n_embd_head = hparams.n_embd_head_v;
- const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); // width of the K and V slices inside the fused wqkv output
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- ggml_tensor * cur;
- ggml_tensor * inpL;
- ggml_tensor * inp_pos = nullptr;
- if (model.arch != LLM_ARCH_JINA_BERT_V2) { // the JINA_BERT_V2 path below never reads inp_pos
- inp_pos = build_inp_pos();
- }
- // construct input embeddings (token, type, position)
- inpL = build_inp_embd(model.tok_embd);
- // token types are hardcoded to zero ("Sentence A")
- ggml_tensor * type_row0 = ggml_view_1d(ctx0, model.type_embd, n_embd, 0); // first n_embd values of type_embd
- inpL = ggml_add(ctx0, inpL, type_row0);
- if (model.arch == LLM_ARCH_BERT) { // only plain BERT adds rows of pos_embd looked up by position
- inpL = ggml_add(ctx0, ggml_get_rows(ctx0, model.pos_embd, inp_pos), inpL);
- }
- cb(inpL, "inp_embd", -1);
- // embed layer norm
- inpL = build_norm(inpL, model.tok_norm, model.tok_norm_b, LLM_NORM, -1);
- cb(inpL, "inp_norm", -1);
- auto * inp_attn = build_attn_inp_no_cache();
- // iterate layers
- for (int il = 0; il < n_layer; ++il) {
- cur = inpL; // assign the constructor-scope cur instead of declaring a shadowing local; attention reads the layer input directly (norms come after the residual adds)
- ggml_tensor * Qcur;
- ggml_tensor * Kcur;
- ggml_tensor * Vcur;
- // self-attention
- if (model.arch == LLM_ARCH_BERT || model.arch == LLM_ARCH_JINA_BERT_V2) {
- Qcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wq, cur), model.layers[il].bq); // separate projections; biases are added unconditionally on this path
- if (model.layers[il].attn_q_norm) { // optional per-layer norm of Q
- Qcur = build_norm(Qcur,
- model.layers[il].attn_q_norm,
- model.layers[il].attn_q_norm_b,
- LLM_NORM, il);
- }
- Kcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wk, cur), model.layers[il].bk);
- if (model.layers[il].attn_k_norm) { // optional per-layer norm of K
- Kcur = build_norm(Kcur,
- model.layers[il].attn_k_norm,
- model.layers[il].attn_k_norm_b,
- LLM_NORM, il);
- }
- Vcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wv, cur), model.layers[il].bv);
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
- } else {
- // compute Q and K and RoPE them
- cur = build_lora_mm(model.layers[il].wqkv, cur); // fused Q/K/V projection
- cb(cur, "wqkv", il);
- if (model.arch == LLM_ARCH_NOMIC_BERT_MOE) { // the fused bias is added only for this architecture
- cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
- cb(cur, "bqkv", il);
- }
- Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); // Q: n_embd values per row at offset 0; offsets are computed with sizeof(float)
- Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); // K: n_embd_gqa values starting right after Q
- Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); // V: n_embd_gqa values starting right after K
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
- Qcur = ggml_rope_ext(
- ctx0, Qcur, inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
- Kcur = ggml_rope_ext(
- ctx0, Kcur, inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
- }
- cb(Qcur, "Qcur", il);
- cb(Kcur, "Kcur", il);
- cb(Vcur, "Vcur", il);
- cur = build_attn(inp_attn, gf,
- model.layers[il].wo, model.layers[il].bo,
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
- cb(cur, "kqv_out", il);
- if (il == n_layer - 1 && pooling_type == LLAMA_POOLING_TYPE_NONE) { // rows are dropped only when no pooling is requested
- // skip computing output for unused tokens
- ggml_tensor * inp_out_ids = build_inp_out_ids();
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
- inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
- }
- // re-add the layer input
- cur = ggml_add(ctx0, cur, inpL);
- // attention layer norm
- cur = build_norm(cur, model.layers[il].attn_out_norm, model.layers[il].attn_out_norm_b, LLM_NORM, il);
- if (model.layers[il].attn_norm_2 != nullptr) { // optional second residual add + norm
- cur = ggml_add(ctx0, cur, inpL); // re-add the layer input
- cur = build_norm(cur, model.layers[il].attn_norm_2, model.layers[il].attn_norm_2_b, LLM_NORM, il);
- }
- ggml_tensor * ffn_inp = cur;
- cb(ffn_inp, "ffn_inp", il);
- // feed-forward network
- if (hparams.moe_every_n_layers > 0 && il % hparams.moe_every_n_layers == 1) { // expert FFN on layers whose index is 1 modulo moe_every_n_layers
- // MoE branch
- cur = build_moe_ffn(cur,
- model.layers[il].ffn_gate_inp,
- model.layers[il].ffn_up_exps,
- nullptr,
- model.layers[il].ffn_down_exps,
- nullptr,
- hparams.n_expert,
- hparams.n_expert_used,
- LLM_FFN_GELU,
- false, false,
- 0.0f,
- LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, il);
- cb(cur, "ffn_moe_out", il);
- } else if (model.arch == LLM_ARCH_BERT || model.arch == LLM_ARCH_NOMIC_BERT_MOE) { // ungated GELU FFN with up/down biases
- cur = build_ffn(cur,
- model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
- NULL, NULL, NULL,
- model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
- NULL,
- LLM_FFN_GELU, LLM_FFN_SEQ, il);
- cb(cur, "ffn_out", il);
- } else if (model.arch == LLM_ARCH_JINA_BERT_V2) { // gated GELU FFN; only the down projection has a bias
- cur = build_ffn(cur,
- model.layers[il].ffn_up, NULL, NULL,
- model.layers[il].ffn_gate, NULL, NULL,
- model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
- NULL,
- LLM_FFN_GELU, LLM_FFN_PAR, il);
- cb(cur, "ffn_out", il);
- } else { // every remaining architecture: gated SiLU FFN without biases
- cur = build_ffn(cur,
- model.layers[il].ffn_up, NULL, NULL,
- model.layers[il].ffn_gate, NULL, NULL,
- model.layers[il].ffn_down, NULL, NULL,
- NULL,
- LLM_FFN_SILU, LLM_FFN_PAR, il);
- cb(cur, "ffn_out", il);
- }
- // attentions bypass the intermediate layer
- cur = ggml_add(ctx0, cur, ffn_inp);
- // output layer norm
- cur = build_norm(cur, model.layers[il].layer_out_norm, model.layers[il].layer_out_norm_b, LLM_NORM, il);
- // input for next layer
- inpL = cur;
- }
- cur = inpL; // still required so cur is defined when the loop body never runs
- cb(cur, "result_embd", -1);
- res->t_embd = cur;
- ggml_build_forward_expand(gf, cur);
- }
- };
- struct llm_build_bloom : public llm_graph_context { // graph builder for the BLOOM architecture; the whole graph is assembled in the constructor
- llm_build_bloom(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { // model: source of the weights; gf: graph the result is expanded into; outputs are published via res->t_embd / res->t_logits
- const int64_t n_embd_head = hparams.n_embd_head_v;
- const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); // width used for both the K and the V slice of the fused QKV output
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); // one head size is used for the Q, K and V reshapes below
- ggml_tensor * cur;
- ggml_tensor * inpL; // input of the current layer; also the residual added back after attention
- inpL = build_inp_embd(model.tok_embd);
- auto * inp_attn = build_attn_inp_kv_unified();
- inpL = build_norm(inpL, // the embeddings themselves are normalized (weight + bias) before the first layer
- model.tok_norm,
- model.tok_norm_b,
- LLM_NORM, -1);
- cb(inpL, "inp_norm", -1); // cb receives the tensor, a label and the layer index (-1 outside the layer loop)
- for (int il = 0; il < n_layer; ++il) {
- cur = build_norm(inpL,
- model.layers[il].attn_norm,
- model.layers[il].attn_norm_b,
- LLM_NORM, il);
- cb(cur, "attn_norm", il);
- // self-attention: one fused QKV projection whose output is sliced into Q, K and V (no rotary op is applied in this builder)
- {
- cur = build_lora_mm(model.layers[il].wqkv, cur);
- cb(cur, "wqkv", il);
- cur = ggml_add(ctx0, cur, model.layers[il].bqkv); // bias is added unconditionally, so bqkv must be present
- cb(cur, "bqkv", il);
- ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); // Q: first n_embd values of each row; offsets are in bytes and assume float elements
- ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); // K: n_embd_gqa values starting right after Q
- ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); // V: n_embd_gqa values starting right after K
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); // split the flat rows into (head size, heads, tokens)
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
- cb(Qcur, "Qcur", il);
- cb(Kcur, "Kcur", il);
- cb(Vcur, "Vcur", il);
- cur = build_attn(inp_attn, gf,
- model.layers[il].wo, model.layers[il].bo,
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); // scaling argument is 1/sqrt(head size); wo/bo are handed to build_attn
- }
- if (il == n_layer - 1) {
- // skip computing output for unused tokens: on the last layer keep only the rows selected by inp_out_ids
- ggml_tensor * inp_out_ids = build_inp_out_ids();
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
- inpL = ggml_get_rows(ctx0, inpL, inp_out_ids); // the residual must be pruned the same way so the add below lines up
- }
- // Add the input (first residual connection: attention output + layer input)
- ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
- cb(ffn_inp, "ffn_inp", il);
- // FF: norm, then an ungated up -> GELU -> down network with biases
- {
- cur = build_norm(ffn_inp,
- model.layers[il].ffn_norm,
- model.layers[il].ffn_norm_b,
- LLM_NORM, il);
- cb(cur, "ffn_norm", il);
- cur = build_ffn(cur,
- model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
- NULL, NULL, NULL, // no gate projection
- model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
- NULL,
- LLM_FFN_GELU, LLM_FFN_SEQ, il);
- cb(cur, "ffn_out", il);
- }
- cur = ggml_add(ctx0, cur, ffn_inp); // second residual connection
- cur = build_cvec(cur, il); // NOTE(review): presumably applies a per-layer control vector - confirm against build_cvec
- cb(cur, "l_out", il);
- // input for next layer
- inpL = cur;
- }
- cur = build_norm(inpL, // final norm over the output of the last layer
- model.output_norm,
- model.output_norm_b,
- LLM_NORM, -1);
- cb(cur, "result_norm", -1);
- res->t_embd = cur; // normalized hidden state is exposed as the embedding output
- cur = build_lora_mm(model.output, cur);
- cb(cur, "result_output", -1);
- res->t_logits = cur; // projection through model.output is exposed as the logits
- ggml_build_forward_expand(gf, cur);
- }
- };
- struct llm_build_mpt : public llm_graph_context { // graph builder for the MPT architecture; the whole graph is assembled in the constructor
- llm_build_mpt(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { // model: source of the weights; gf: graph the result is expanded into; outputs are published via res->t_embd / res->t_logits
- const int64_t n_embd_head = hparams.n_embd_head_v;
- const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); // width used for both the K and the V slice of the fused QKV output
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); // one head size is used for the Q, K and V reshapes below
- ggml_tensor * cur;
- ggml_tensor * pos; // only assigned when the model ships a position-embedding table
- ggml_tensor * inpL; // input of the current layer; also the residual added back after attention
- inpL = build_inp_embd(model.tok_embd);
- auto * inp_attn = build_attn_inp_kv_unified();
- if (model.pos_embd) { // position embeddings are optional: added to the token embeddings only when the tensor exists
- // inp_pos - contains the positions
- ggml_tensor * inp_pos = build_inp_pos();
- pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos); // look up one embedding row per position
- cb(pos, "pos_embd", -1);
- inpL = ggml_add(ctx0, inpL, pos);
- cb(inpL, "inpL", -1);
- }
- for (int il = 0; il < n_layer; ++il) {
- ggml_tensor * attn_norm;
- attn_norm = build_norm(inpL,
- model.layers[il].attn_norm,
- model.layers[il].attn_norm_b,
- LLM_NORM, il);
- cb(attn_norm, "attn_norm", il);
- // self-attention: fused QKV projection, optional bias and clamp, optional Q/K norm, then attention
- {
- cur = attn_norm;
- cur = build_lora_mm(model.layers[il].wqkv, cur);
- cb(cur, "wqkv", il);
- if (model.layers[il].bqkv){ // the QKV bias is optional
- cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
- cb(cur, "bqkv", il);
- }
- if (hparams.f_clamp_kqv > 0.0f) { // a positive f_clamp_kqv enables clamping of the fused output to [-f_clamp_kqv, +f_clamp_kqv]
- cur = ggml_clamp(ctx0, cur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
- cb(cur, "wqkv_clamped", il);
- }
- ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); // Q: first n_embd values of each row; offsets are in bytes and assume float elements
- ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); // K: n_embd_gqa values starting right after Q
- ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); // V: n_embd_gqa values starting right after K
- cb(Qcur, "Qcur", il);
- cb(Kcur, "Kcur", il);
- cb(Vcur, "Vcur", il);
- // Q/K Layernorm: applied on the flat (not yet per-head) Q and K rows
- if (model.layers[il].attn_q_norm) { // NOTE(review): attn_k_norm is used below without its own check - assumes it exists whenever attn_q_norm does; confirm
- Qcur = build_norm(Qcur,
- model.layers[il].attn_q_norm,
- model.layers[il].attn_q_norm_b,
- LLM_NORM, il);
- cb(Qcur, "Qcur", il);
- Kcur = build_norm(Kcur,
- model.layers[il].attn_k_norm,
- model.layers[il].attn_k_norm_b,
- LLM_NORM, il);
- cb(Kcur, "Kcur", il);
- }
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); // split the flat rows into (head size, heads, tokens)
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
- cb(Qcur, "Qcur", il); // the same labels are reported again for the reshaped tensors
- cb(Kcur, "Kcur", il);
- cb(Vcur, "Vcur", il);
- cur = build_attn(inp_attn, gf,
- model.layers[il].wo, model.layers[il].bo,
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); // scaling argument is 1/sqrt(head size); wo/bo are handed to build_attn
- }
- if (il == n_layer - 1) {
- // skip computing output for unused tokens: on the last layer keep only the rows selected by inp_out_ids
- ggml_tensor * inp_out_ids = build_inp_out_ids();
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
- inpL = ggml_get_rows(ctx0, inpL, inp_out_ids); // the residual must be pruned the same way so the add below lines up
- }
- // Add the input (first residual connection: attention output + layer input)
- ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
- cb(ffn_inp, "ffn_inp", il);
- // feed forward: norm, then an ungated up -> GELU -> down network with biases
- {
- cur = build_norm(ffn_inp,
- model.layers[il].ffn_norm,
- model.layers[il].ffn_norm_b,
- LLM_NORM, il);
- cb(cur, "ffn_norm", il);
- cur = build_ffn(cur,
- model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
- NULL, NULL, NULL, // no gate projection
- model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
- model.layers[il].ffn_act, // extra per-layer tensor forwarded to build_ffn; NOTE(review): its exact use is defined by build_ffn - confirm
- LLM_FFN_GELU, LLM_FFN_SEQ, il);
- cb(cur, "ffn_out", il);
- }
- cur = ggml_add(ctx0, cur, ffn_inp); // second residual connection
- cur = build_cvec(cur, il); // NOTE(review): presumably applies a per-layer control vector - confirm against build_cvec
- cb(cur, "l_out", il);
- // input for next layer
- inpL = cur;
- }
- cur = inpL;
- cur = build_norm(cur, // final norm over the output of the last layer
- model.output_norm,
- model.output_norm_b,
- LLM_NORM, -1);
- cb(cur, "result_norm", -1);
- res->t_embd = cur; // normalized hidden state is exposed as the embedding output
- cur = build_lora_mm(model.output, cur);
- cb(cur, "result_output", -1);
- res->t_logits = cur; // projection through model.output is exposed as the logits
- ggml_build_forward_expand(gf, cur);
- }
- };
- struct llm_build_stablelm : public llm_graph_context { // graph builder for the StableLM architecture; the whole graph is assembled in the constructor
- llm_build_stablelm(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { // model: source of the weights; gf: graph the result is expanded into; outputs are published via res->t_embd / res->t_logits
- const int64_t n_embd_head = hparams.n_embd_head_v;
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); // one head size is used for the Q, K and V reshapes below
- ggml_tensor * cur;
- ggml_tensor * inpL; // input of the current layer; also the residual added back after attention
- inpL = build_inp_embd(model.tok_embd);
- // inp_pos - contains the positions
- ggml_tensor * inp_pos = build_inp_pos();
- auto * inp_attn = build_attn_inp_kv_unified();
- for (int il = 0; il < n_layer; ++il) {
- // norm
- cur = build_norm(inpL,
- model.layers[il].attn_norm,
- model.layers[il].attn_norm_b,
- LLM_NORM, il);
- cb(cur, "attn_norm", il);
- ggml_tensor * inpSA = cur; // note: this keeps the *normalized* input; it feeds the FFN directly when the layer has no ffn_norm
- // self-attention
- {
- // compute Q and K and RoPE them (each projection has an optional bias)
- ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
- cb(Qcur, "Qcur", il);
- if (model.layers[il].bq) {
- Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
- cb(Qcur, "Qcur", il);
- }
- ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
- cb(Kcur, "Kcur", il);
- if (model.layers[il].bk) {
- Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
- cb(Kcur, "Kcur", il);
- }
- ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
- cb(Vcur, "Vcur", il);
- if (model.layers[il].bv) {
- Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
- cb(Vcur, "Vcur", il);
- }
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); // split the flat rows into (head size, heads, tokens)
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
- if (model.layers[il].attn_q_norm) { // optional Q norm: applied after the per-head reshape and before RoPE, weight only (bias is NULL)
- Qcur = build_norm(Qcur,
- model.layers[il].attn_q_norm,
- NULL,
- LLM_NORM, il);
- cb(Qcur, "Qcur", il);
- }
- if (model.layers[il].attn_k_norm) { // optional K norm, checked independently of the Q norm
- Kcur = build_norm(Kcur,
- model.layers[il].attn_k_norm,
- NULL,
- LLM_NORM, il);
- cb(Kcur, "Kcur", il);
- }
- Qcur = ggml_rope_ext( // rotary position encoding on Q; V is left untouched
- ctx0, Qcur, inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
- Kcur = ggml_rope_ext( // same rotary parameters for K
- ctx0, Kcur, inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
- cb(Qcur, "Qcur", il);
- cb(Kcur, "Kcur", il);
- cb(Vcur, "Vcur", il);
- cur = build_attn(inp_attn, gf,
- model.layers[il].wo, NULL, // output projection without bias
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); // scaling argument is 1/sqrt(head size)
- }
- if (il == n_layer - 1) {
- // skip computing output for unused tokens: on the last layer keep only the rows selected by inp_out_ids
- ggml_tensor * inp_out_ids = build_inp_out_ids();
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
- inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
- inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); // pruned too because the parallel-residual path below may read it
- }
- ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL); // first residual connection: attention output + un-normalized layer input
- cb(ffn_inp, "ffn_inp", il);
- // feed-forward network
- {
- if (model.layers[il].ffn_norm) { // sequential form: the FFN reads a norm of the post-attention residual
- cur = build_norm(ffn_inp,
- model.layers[il].ffn_norm,
- model.layers[il].ffn_norm_b,
- LLM_NORM, il);
- cb(cur, "ffn_norm", il);
- } else {
- // parallel residual: without ffn_norm the FFN reads the same normalized input the attention used
- cur = inpSA;
- }
- cur = build_ffn(cur, // gated FFN (up and gate in parallel, SiLU activation), no biases
- model.layers[il].ffn_up, NULL, NULL,
- model.layers[il].ffn_gate, NULL, NULL,
- model.layers[il].ffn_down, NULL, NULL,
- NULL,
- LLM_FFN_SILU, LLM_FFN_PAR, il);
- cb(cur, "ffn_out", il);
- }
- cur = ggml_add(ctx0, cur, ffn_inp); // second residual connection
- cur = build_cvec(cur, il); // NOTE(review): presumably applies a per-layer control vector - confirm against build_cvec
- cb(cur, "l_out", il);
- // input for next layer
- inpL = cur;
- }
- cur = inpL;
- cur = build_norm(cur, // final norm over the output of the last layer
- model.output_norm,
- model.output_norm_b,
- LLM_NORM, -1);
- cb(cur, "result_norm", -1);
- res->t_embd = cur; // normalized hidden state is exposed as the embedding output
- // lm_head
- cur = build_lora_mm(model.output, cur);
- cb(cur, "result_output", -1);
- res->t_logits = cur; // projection through model.output is exposed as the logits
- ggml_build_forward_expand(gf, cur);
- }
- };
- struct llm_build_qwen : public llm_graph_context { // graph builder for the (first-generation) Qwen architecture; the whole graph is assembled in the constructor
- llm_build_qwen(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { // model: source of the weights; gf: graph the result is expanded into; outputs are published via res->t_embd / res->t_logits
- const int64_t n_embd_head = hparams.n_embd_head_v;
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); // one head size is used for the Q, K and V reshapes below
- ggml_tensor * cur;
- ggml_tensor * inpL; // input of the current layer
- inpL = build_inp_embd(model.tok_embd);
- // inp_pos - contains the positions
- ggml_tensor * inp_pos = build_inp_pos();
- auto * inp_attn = build_attn_inp_kv_unified();
- for (int il = 0; il < n_layer; ++il) {
- ggml_tensor * inpSA = inpL; // un-normalized layer input, kept for the residual after attention
- cur = build_norm(inpL,
- model.layers[il].attn_norm, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "attn_norm", il);
- // self-attention: one fused QKV projection whose output is sliced into three equally wide parts
- {
- cur = build_lora_mm(model.layers[il].wqkv, cur);
- cb(cur, "wqkv", il);
- cur = ggml_add(ctx0, cur, model.layers[il].bqkv); // bias is added unconditionally, so bqkv must be present
- cb(cur, "bqkv", il);
- ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); // Q: first n_embd values of each row; offsets are in bytes and assume float elements
- ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); // K: second n_embd-wide slice
- ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 2*sizeof(float)*(n_embd))); // V: third n_embd-wide slice
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); // split the flat rows into (head size, heads, tokens)
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); // NOTE(review): the K/V slices are n_embd wide, so this appears to require n_embd_head*n_head_kv == n_embd - confirm
- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
- // the rotary mode is whatever rope_type holds (an older note here mentioned a fixed "mode = 2 for neox", which the call no longer hard-codes)
- Qcur = ggml_rope_ext(
- ctx0, Qcur, inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
- Kcur = ggml_rope_ext( // same rotary parameters for K; V is left untouched
- ctx0, Kcur, inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
- cb(Qcur, "Qcur", il);
- cb(Kcur, "Kcur", il);
- cb(Vcur, "Vcur", il);
- cur = build_attn(inp_attn, gf,
- model.layers[il].wo, NULL, // output projection without bias
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); // scaling argument is 1/sqrt(head size)
- }
- if (il == n_layer - 1) {
- // skip computing output for unused tokens: on the last layer keep only the rows selected by inp_out_ids
- ggml_tensor * inp_out_ids = build_inp_out_ids();
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
- inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); // the residual must be pruned the same way so the add below lines up
- }
- ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); // first residual connection
- cb(ffn_inp, "ffn_inp", il);
- // feed-forward network
- {
- cur = build_norm(ffn_inp,
- model.layers[il].ffn_norm, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "ffn_norm", il);
- cur = build_ffn(cur, // gated FFN (up and gate in parallel, SiLU activation), no biases
- model.layers[il].ffn_up, NULL, NULL,
- model.layers[il].ffn_gate, NULL, NULL,
- model.layers[il].ffn_down, NULL, NULL,
- NULL,
- LLM_FFN_SILU, LLM_FFN_PAR, il);
- cb(cur, "ffn_out", il);
- }
- cur = ggml_add(ctx0, cur, ffn_inp); // second residual connection
- cur = build_cvec(cur, il); // NOTE(review): presumably applies a per-layer control vector - confirm against build_cvec
- cb(cur, "l_out", il);
- // input for next layer
- inpL = cur;
- }
- cur = inpL;
- cur = build_norm(cur, // final RMS norm over the output of the last layer
- model.output_norm, NULL,
- LLM_NORM_RMS, -1);
- cb(cur, "result_norm", -1);
- res->t_embd = cur; // normalized hidden state is exposed as the embedding output
- // lm_head
- cur = build_lora_mm(model.output, cur);
- cb(cur, "result_output", -1);
- res->t_logits = cur; // projection through model.output is exposed as the logits
- ggml_build_forward_expand(gf, cur);
- }
- };
- struct llm_build_qwen2 : public llm_graph_context { // graph builder for the Qwen2 architecture; the whole graph is assembled in the constructor
- llm_build_qwen2(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { // model: source of the weights; gf: graph the result is expanded into; outputs are published via res->t_embd / res->t_logits
- const int64_t n_embd_head = hparams.n_embd_head_v;
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); // one head size is used for the Q, K and V reshapes below
- GGML_ASSERT(n_embd_head == hparams.n_rot); // the rotary dimension must equal the head size
- ggml_tensor * cur;
- ggml_tensor * inpL; // input of the current layer
- inpL = build_inp_embd(model.tok_embd);
- // inp_pos - contains the positions
- ggml_tensor * inp_pos = build_inp_pos();
- auto * inp_attn = build_attn_inp_kv_unified();
- for (int il = 0; il < n_layer; ++il) {
- ggml_tensor * inpSA = inpL; // un-normalized layer input, kept for the residual after attention
- // norm
- cur = build_norm(inpL,
- model.layers[il].attn_norm, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "attn_norm", il);
- // self-attention
- {
- // compute Q and K and RoPE them; the bq/bk/bv biases are added without a null check, so all three must be present
- ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
- Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
- cb(Qcur, "Qcur", il);
- ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
- Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
- cb(Kcur, "Kcur", il);
- ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
- Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
- cb(Vcur, "Vcur", il);
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); // split the flat rows into (head size, heads, tokens)
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
- Qcur = ggml_rope_ext( // rotary position encoding on Q; V is left untouched
- ctx0, Qcur, inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
- Kcur = ggml_rope_ext( // same rotary parameters for K
- ctx0, Kcur, inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
- cb(Qcur, "Qcur", il);
- cb(Kcur, "Kcur", il);
- cb(Vcur, "Vcur", il);
- cur = build_attn(inp_attn, gf,
- model.layers[il].wo, model.layers[il].bo,
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); // scaling argument is 1/sqrt(head size); wo/bo are handed to build_attn
- }
- if (il == n_layer - 1) {
- // skip computing output for unused tokens: on the last layer keep only the rows selected by inp_out_ids
- ggml_tensor * inp_out_ids = build_inp_out_ids();
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
- inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); // the residual must be pruned the same way so the add below lines up
- }
- ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); // first residual connection
- cb(ffn_inp, "ffn_inp", il);
- // feed-forward network: RMS norm, then a gated FFN (up and gate in parallel, SiLU activation), no biases
- cur = build_norm(ffn_inp,
- model.layers[il].ffn_norm, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "ffn_norm", il);
- cur = build_ffn(cur,
- model.layers[il].ffn_up, NULL, NULL,
- model.layers[il].ffn_gate, NULL, NULL,
- model.layers[il].ffn_down, NULL, NULL,
- NULL,
- LLM_FFN_SILU, LLM_FFN_PAR, il);
- cb(cur, "ffn_out", il);
- cur = ggml_add(ctx0, cur, ffn_inp); // second residual connection
- cur = build_cvec(cur, il); // NOTE(review): presumably applies a per-layer control vector - confirm against build_cvec
- cb(cur, "l_out", il);
- // input for next layer
- inpL = cur;
- }
- cur = inpL;
- cur = build_norm(cur, // final RMS norm over the output of the last layer
- model.output_norm, NULL,
- LLM_NORM_RMS, -1);
- cb(cur, "result_norm", -1);
- res->t_embd = cur; // normalized hidden state is exposed as the embedding output
- // lm_head
- cur = build_lora_mm(model.output, cur);
- cb(cur, "result_output", -1);
- res->t_logits = cur; // projection through model.output is exposed as the logits
- ggml_build_forward_expand(gf, cur);
- }
- };
- struct llm_build_qwen2vl : public llm_graph_context { // graph builder for the Qwen2-VL architecture; the whole graph is assembled in the constructor
- llm_build_qwen2vl(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { // model: source of the weights; gf: graph the result is expanded into; outputs are published via res->t_embd / res->t_logits
- const int64_t n_embd_head = hparams.n_embd_head_v;
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); // one head size is used for the Q, K and V reshapes below
- GGML_ASSERT(n_embd_head == hparams.n_rot); // the rotary dimension must equal the head size
- ggml_tensor * cur;
- ggml_tensor * inpL; // input of the current layer
- inpL = build_inp_embd(model.tok_embd);
- // inp_pos - contains the positions
- ggml_tensor * inp_pos = build_inp_pos();
- auto * inp_attn = build_attn_inp_kv_unified();
- int sections[4]; // local copy of the first four rope_sections entries, in the int[4] form passed to ggml_rope_multi
- std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections);
- for (int il = 0; il < n_layer; ++il) {
- ggml_tensor * inpSA = inpL; // un-normalized layer input, kept for the residual after attention
- // norm
- cur = build_norm(inpL,
- model.layers[il].attn_norm, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "attn_norm", il);
- // self-attention
- {
- // compute Q and K and RoPE them; the bq/bk/bv biases are added without a null check, so all three must be present
- ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
- Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
- cb(Qcur, "Qcur", il);
- ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
- Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
- cb(Kcur, "Kcur", il);
- ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
- Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
- cb(Vcur, "Vcur", il);
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); // split the flat rows into (head size, heads, tokens)
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
- Qcur = ggml_rope_multi( // multi-section rotary op: takes the sections array in addition to the usual rope parameters
- ctx0, Qcur, inp_pos, nullptr,
- n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
- Kcur = ggml_rope_multi( // same rotary parameters for K; V is left untouched
- ctx0, Kcur, inp_pos, nullptr,
- n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
- cb(Qcur, "Qcur", il);
- cb(Kcur, "Kcur", il);
- cb(Vcur, "Vcur", il);
- cur = build_attn(inp_attn, gf,
- model.layers[il].wo, model.layers[il].bo,
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); // scaling argument is 1/sqrt(head size); wo/bo are handed to build_attn
- }
- if (il == n_layer - 1) {
- // skip computing output for unused tokens: on the last layer keep only the rows selected by inp_out_ids
- ggml_tensor * inp_out_ids = build_inp_out_ids();
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
- inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); // the residual must be pruned the same way so the add below lines up
- }
- ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); // first residual connection
- cb(ffn_inp, "ffn_inp", il);
- // feed-forward network: RMS norm, then a gated FFN (up and gate in parallel, SiLU activation), no biases
- cur = build_norm(ffn_inp,
- model.layers[il].ffn_norm, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "ffn_norm", il);
- cur = build_ffn(cur,
- model.layers[il].ffn_up, NULL, NULL,
- model.layers[il].ffn_gate, NULL, NULL,
- model.layers[il].ffn_down, NULL, NULL,
- NULL,
- LLM_FFN_SILU, LLM_FFN_PAR, il);
- cb(cur, "ffn_out", il);
- cur = ggml_add(ctx0, cur, ffn_inp); // second residual connection
- cur = build_cvec(cur, il); // NOTE(review): presumably applies a per-layer control vector - confirm against build_cvec
- cb(cur, "l_out", il);
- // input for next layer
- inpL = cur;
- }
- cur = inpL;
- cur = build_norm(cur, // final RMS norm over the output of the last layer
- model.output_norm, NULL,
- LLM_NORM_RMS, -1);
- cb(cur, "result_norm", -1);
- res->t_embd = cur; // normalized hidden state is exposed as the embedding output
- // lm_head
- cur = build_lora_mm(model.output, cur);
- cb(cur, "result_output", -1);
- res->t_logits = cur; // projection through model.output is exposed as the logits
- ggml_build_forward_expand(gf, cur);
- }
- };
- struct llm_build_qwen2moe : public llm_graph_context { // graph builder for the Qwen2-MoE architecture (routed experts plus a gated shared expert); the whole graph is assembled in the constructor
- llm_build_qwen2moe(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { // model: source of the weights; gf: graph the result is expanded into; outputs are published via res->t_embd / res->t_logits
- const int64_t n_embd_head = hparams.n_embd_head_v;
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); // one head size is used for the Q, K and V reshapes below
- GGML_ASSERT(n_embd_head == hparams.n_rot); // the rotary dimension must equal the head size
- ggml_tensor * cur;
- ggml_tensor * inpL; // input of the current layer
- inpL = build_inp_embd(model.tok_embd);
- // inp_pos - contains the positions
- ggml_tensor * inp_pos = build_inp_pos();
- auto * inp_attn = build_attn_inp_kv_unified();
- for (int il = 0; il < n_layer; ++il) {
- ggml_tensor * inpSA = inpL; // un-normalized layer input, kept for the residual after attention
- // norm
- cur = build_norm(inpL,
- model.layers[il].attn_norm, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "attn_norm", il);
- // self-attention
- {
- // compute Q and K and RoPE them (each projection bias is optional and only added when present)
- ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
- cb(Qcur, "Qcur", il);
- if (model.layers[il].bq) {
- Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
- cb(Qcur, "Qcur", il);
- }
- ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
- cb(Kcur, "Kcur", il);
- if (model.layers[il].bk) {
- Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
- cb(Kcur, "Kcur", il);
- }
- ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
- cb(Vcur, "Vcur", il);
- if (model.layers[il].bv) {
- Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
- cb(Vcur, "Vcur", il);
- }
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); // split the flat rows into (head size, heads, tokens)
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
- Qcur = ggml_rope_ext( // rotary position encoding on Q; V is left untouched
- ctx0, Qcur, inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
- Kcur = ggml_rope_ext( // same rotary parameters for K
- ctx0, Kcur, inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
- cb(Qcur, "Qcur", il);
- cb(Kcur, "Kcur", il);
- cb(Vcur, "Vcur", il);
- cur = build_attn(inp_attn, gf,
- model.layers[il].wo, model.layers[il].bo,
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); // scaling argument is 1/sqrt(head size); wo/bo are handed to build_attn
- }
- if (il == n_layer - 1) {
- // skip computing output for unused tokens: on the last layer keep only the rows selected by inp_out_ids
- ggml_tensor * inp_out_ids = build_inp_out_ids();
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
- inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); // the residual must be pruned the same way so the add below lines up
- }
- ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); // first residual connection
- cb(ffn_inp, "ffn_inp", il);
- // MoE branch: the routed experts and the shared expert below both read the same normalized input (cur)
- cur = build_norm(ffn_inp,
- model.layers[il].ffn_norm, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "ffn_norm", il);
- ggml_tensor * moe_out =
- build_moe_ffn(cur,
- model.layers[il].ffn_gate_inp,
- model.layers[il].ffn_up_exps,
- model.layers[il].ffn_gate_exps,
- model.layers[il].ffn_down_exps,
- nullptr,
- n_expert, n_expert_used,
- LLM_FFN_SILU, false, // NOTE(review): meaning of the boolean / scale arguments is defined by build_moe_ffn - confirm there
- false, 0.0,
- LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
- il);
- cb(moe_out, "ffn_moe_out", il);
- // FFN shared expert: a regular gated FFN whose output is scaled by a learned sigmoid gate and added to the routed output
- {
- ggml_tensor * cur_gate_inp = build_lora_mm(model.layers[il].ffn_gate_inp_shexp, cur);
- cb(cur_gate_inp, "ffn_shexp_gate_inp", il);
- // sigmoid, written as silu(x) / x; NOTE(review): presumably undefined (0/0) if a gate input is exactly 0 - confirm whether that matters
- ggml_tensor * cur_gate = ggml_div(ctx0, ggml_silu(ctx0, cur_gate_inp), cur_gate_inp);
- cb(cur_gate, "ffn_shexp_gate", il);
- ggml_tensor * cur_ffn = build_ffn(cur,
- model.layers[il].ffn_up_shexp, NULL, NULL,
- model.layers[il].ffn_gate_shexp, NULL, NULL,
- model.layers[il].ffn_down_shexp, NULL, NULL,
- NULL,
- LLM_FFN_SILU, LLM_FFN_PAR, il);
- cb(cur_ffn, "ffn_shexp", il);
- ggml_tensor * ffn_shexp_out = ggml_mul(ctx0, cur_ffn, cur_gate); // scale the shared-expert output by its gate
- cb(ffn_shexp_out, "ffn_shexp_out", il);
- moe_out = ggml_add(ctx0, moe_out, ffn_shexp_out); // routed experts + gated shared expert
- cb(moe_out, "ffn_out", il);
- cur = moe_out;
- }
- cur = ggml_add(ctx0, cur, ffn_inp); // second residual connection
- cur = build_cvec(cur, il); // NOTE(review): presumably applies a per-layer control vector - confirm against build_cvec
- cb(cur, "l_out", il);
- // input for next layer
- inpL = cur;
- }
- cur = inpL;
- cur = build_norm(cur, // final RMS norm over the output of the last layer
- model.output_norm, NULL,
- LLM_NORM_RMS, -1);
- cb(cur, "result_norm", -1);
- res->t_embd = cur; // normalized hidden state is exposed as the embedding output
- // lm_head
- cur = build_lora_mm(model.output, cur);
- cb(cur, "result_output", -1);
- res->t_logits = cur; // projection through model.output is exposed as the logits
- ggml_build_forward_expand(gf, cur);
- }
- };
- struct llm_build_qwen3 : public llm_graph_context { // graph builder for the Qwen3 architecture; the whole graph is assembled in the constructor
- llm_build_qwen3(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { // model: source of the weights; gf: graph the result is expanded into; outputs are published via res->t_embd / res->t_logits
- const int64_t n_embd_head = hparams.n_embd_head_v;
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); // one head size is used for the Q, K and V reshapes below
- GGML_ASSERT(n_embd_head == hparams.n_rot); // the rotary dimension must equal the head size
- ggml_tensor * cur;
- ggml_tensor * inpL; // input of the current layer
- inpL = build_inp_embd(model.tok_embd);
- // inp_pos - contains the positions
- ggml_tensor * inp_pos = build_inp_pos();
- auto * inp_attn = build_attn_inp_kv_unified();
- for (int il = 0; il < n_layer; ++il) {
- ggml_tensor * inpSA = inpL; // un-normalized layer input, kept for the residual after attention
- // norm
- cur = build_norm(inpL,
- model.layers[il].attn_norm, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "attn_norm", il);
- // self-attention
- {
- // compute Q and K and RoPE them (no projection biases are added in this builder)
- ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
- cb(Qcur, "Qcur", il);
- ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
- cb(Kcur, "Kcur", il);
- ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
- cb(Vcur, "Vcur", il);
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); // split the flat rows into (head size, heads, tokens)
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
- Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il); // RMS norm on Q after the per-head reshape and before RoPE; used unconditionally, so attn_q_norm must be present
- cb(Qcur, "Qcur_normed", il);
- Qcur = ggml_rope_ext(
- ctx0, Qcur, inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
- Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il); // same treatment for K; V gets neither norm nor RoPE
- cb(Kcur, "Kcur_normed", il);
- Kcur = ggml_rope_ext(
- ctx0, Kcur, inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
- cb(Qcur, "Qcur", il);
- cb(Kcur, "Kcur", il);
- cb(Vcur, "Vcur", il);
- cur = build_attn(inp_attn, gf,
- model.layers[il].wo, model.layers[il].bo,
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); // scaling argument is 1/sqrt(head size); wo/bo are handed to build_attn
- }
- if (il == n_layer - 1) {
- // skip computing output for unused tokens: on the last layer keep only the rows selected by inp_out_ids
- ggml_tensor * inp_out_ids = build_inp_out_ids();
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
- inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); // the residual must be pruned the same way so the add below lines up
- }
- ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); // first residual connection
- cb(ffn_inp, "ffn_inp", il);
- // feed-forward network: RMS norm, then a gated FFN (up and gate in parallel, SiLU activation), no biases
- cur = build_norm(ffn_inp,
- model.layers[il].ffn_norm, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "ffn_norm", il);
- cur = build_ffn(cur,
- model.layers[il].ffn_up, NULL, NULL,
- model.layers[il].ffn_gate, NULL, NULL,
- model.layers[il].ffn_down, NULL, NULL,
- NULL,
- LLM_FFN_SILU, LLM_FFN_PAR, il);
- cb(cur, "ffn_out", il);
- cur = ggml_add(ctx0, cur, ffn_inp); // second residual connection
- cur = build_cvec(cur, il); // NOTE(review): presumably applies a per-layer control vector - confirm against build_cvec
- cb(cur, "l_out", il);
- // input for next layer
- inpL = cur;
- }
- cur = inpL;
- cur = build_norm(cur, // final RMS norm over the output of the last layer
- model.output_norm, NULL,
- LLM_NORM_RMS, -1);
- cb(cur, "result_norm", -1);
- res->t_embd = cur; // normalized hidden state is exposed as the embedding output
- // lm_head
- cur = build_lora_mm(model.output, cur);
- cb(cur, "result_output", -1);
- res->t_logits = cur; // projection through model.output is exposed as the logits
- ggml_build_forward_expand(gf, cur);
- }
- };
- struct llm_build_qwen3moe : public llm_graph_context { // graph builder for the Qwen3-MoE architecture (routed experts only); the whole graph is assembled in the constructor
- llm_build_qwen3moe(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { // model: source of the weights; gf: graph the result is expanded into; outputs are published via res->t_embd / res->t_logits
- const int64_t n_embd_head = hparams.n_embd_head_v;
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); // one head size is used for the Q, K and V reshapes below
- GGML_ASSERT(n_embd_head == hparams.n_rot); // the rotary dimension must equal the head size
- ggml_tensor * cur;
- ggml_tensor * inpL; // input of the current layer
- inpL = build_inp_embd(model.tok_embd);
- // inp_pos - contains the positions
- ggml_tensor * inp_pos = build_inp_pos();
- auto * inp_attn = build_attn_inp_kv_unified();
- for (int il = 0; il < n_layer; ++il) {
- ggml_tensor * inpSA = inpL; // un-normalized layer input, kept for the residual after attention
- // norm
- cur = build_norm(inpL,
- model.layers[il].attn_norm, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "attn_norm", il);
- // self-attention
- {
- // compute Q and K and RoPE them (no projection biases are added in this builder)
- ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
- cb(Qcur, "Qcur", il);
- ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
- cb(Kcur, "Kcur", il);
- ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
- cb(Vcur, "Vcur", il);
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); // split the flat rows into (head size, heads, tokens)
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
- Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il); // RMS norm on Q after the per-head reshape and before RoPE; used unconditionally, so attn_q_norm must be present
- cb(Qcur, "Qcur_normed", il);
- Qcur = ggml_rope_ext(
- ctx0, Qcur, inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
- Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il); // same treatment for K; V gets neither norm nor RoPE
- cb(Kcur, "Kcur_normed", il);
- Kcur = ggml_rope_ext(
- ctx0, Kcur, inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
- cb(Qcur, "Qcur", il);
- cb(Kcur, "Kcur", il);
- cb(Vcur, "Vcur", il);
- cur = build_attn(inp_attn, gf,
- model.layers[il].wo, model.layers[il].bo,
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); // scaling argument is 1/sqrt(head size); wo/bo are handed to build_attn
- }
- if (il == n_layer - 1) {
- // skip computing output for unused tokens: on the last layer keep only the rows selected by inp_out_ids
- ggml_tensor * inp_out_ids = build_inp_out_ids();
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
- inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); // the residual must be pruned the same way so the add below lines up
- }
- ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); // first residual connection
- cb(ffn_inp, "ffn_inp", il);
- // MoE branch: every layer uses the routed-expert FFN; there is no dense or shared-expert path in this builder
- cur = build_norm(ffn_inp,
- model.layers[il].ffn_norm, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "ffn_norm", il);
- ggml_tensor * moe_out =
- build_moe_ffn(cur,
- model.layers[il].ffn_gate_inp,
- model.layers[il].ffn_up_exps,
- model.layers[il].ffn_gate_exps,
- model.layers[il].ffn_down_exps,
- nullptr,
- n_expert, n_expert_used,
- LLM_FFN_SILU, true, // NOTE(review): true is passed for the flag right after the activation type; its meaning is defined by build_moe_ffn - confirm there
- false, 0.0,
- LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
- il);
- cb(moe_out, "ffn_moe_out", il);
- cur = moe_out;
- cur = ggml_add(ctx0, cur, ffn_inp); // second residual connection
- cur = build_cvec(cur, il); // NOTE(review): presumably applies a per-layer control vector - confirm against build_cvec
- cb(cur, "l_out", il);
- // input for next layer
- inpL = cur;
- }
- cur = inpL;
- cur = build_norm(cur, // final RMS norm over the output of the last layer
- model.output_norm, NULL,
- LLM_NORM_RMS, -1);
- cb(cur, "result_norm", -1);
- res->t_embd = cur; // normalized hidden state is exposed as the embedding output
- // lm_head
- cur = build_lora_mm(model.output, cur);
- cb(cur, "result_output", -1);
- res->t_logits = cur; // projection through model.output is exposed as the logits
- ggml_build_forward_expand(gf, cur);
- }
- };
- struct llm_build_phi2 : public llm_graph_context { // graph builder for phi2; the whole graph is assembled in the constructor
- llm_build_phi2(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { // model: weight tensors; params: forwarded to the base context; gf: graph that receives the nodes
- const int64_t n_embd_head = hparams.n_embd_head_v;
- const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); // width of the K and V slices taken from the fused QKV output below
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); // K and V heads must have the same size
- ggml_tensor * cur;
- ggml_tensor * attn_norm_output; // normed layer input; feeds both the attention and the FFN branch
- ggml_tensor * ffn_output;
- ggml_tensor * inpL; // input of the current layer (also the residual term)
- inpL = build_inp_embd(model.tok_embd);
- // inp_pos - contains the positions (consumed by the RoPE calls below)
- ggml_tensor * inp_pos = build_inp_pos();
- auto * inp_attn = build_attn_inp_kv_unified();
- for (int il = 0; il < n_layer; ++il) {
- attn_norm_output = build_norm(inpL,
- model.layers[il].attn_norm,
- model.layers[il].attn_norm_b,
- LLM_NORM, il);
- cb(attn_norm_output, "attn_norm", il);
- // self-attention: project the normed input to Q/K/V, apply RoPE to Q and K, then attend
- {
- ggml_tensor * Qcur = nullptr;
- ggml_tensor * Kcur = nullptr;
- ggml_tensor * Vcur = nullptr;
- if (model.layers[il].wqkv) { // fused QKV weight present: one matmul + bias, then slice
- cur = build_lora_mm(model.layers[il].wqkv, attn_norm_output);
- cb(cur, "wqkv", il);
- cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
- cb(cur, "bqkv", il);
- Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); // Q: n_embd values per row, starting at offset 0
- Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); // K: n_embd_gqa values per row, starting after Q
- Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); // V: n_embd_gqa values per row, starting after K; offsets use sizeof(float) - assumes the fused output is F32, TODO confirm
- } else { // separate Q/K/V weights, each with its own bias
- Qcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wq, attn_norm_output), model.layers[il].bq);
- Kcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wk, attn_norm_output), model.layers[il].bk);
- Vcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wv, attn_norm_output), model.layers[il].bv);
- }
- cb(Qcur, "Qcur", il);
- cb(Kcur, "Kcur", il);
- cb(Vcur, "Vcur", il);
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); // split into heads: (n_embd_head, n_head, n_tokens)
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
- Qcur = ggml_rope_ext(
- ctx0, Qcur, inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
- Kcur = ggml_rope_ext(
- ctx0, Kcur, inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
- cb(Qcur, "Qcur", il); // the same names are reported again for the reshaped/rotated tensors
- cb(Kcur, "Kcur", il);
- cb(Vcur, "Vcur", il);
- // with phi2, we scale the Q to avoid precision issues
- // ref: https://github.com/ml-explore/mlx-examples/blob/08e862336ade809bc37d1035f94b359e7d1a5152/phi2/phi2.py#L64-L66
- Qcur = ggml_scale(ctx0, Qcur, 1.0f/sqrtf(float(n_embd_head)));
- cur = build_attn(inp_attn, gf,
- model.layers[il].wo, model.layers[il].bo,
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il); // 1.0f in the scale position because Q was already scaled above
- }
- if (il == n_layer - 1) {
- // last layer only: keep just the rows selected by inp_out_ids so the remaining ops skip unused tokens
- ggml_tensor * inp_out_ids = build_inp_out_ids();
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
- inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
- attn_norm_output = ggml_get_rows(ctx0, attn_norm_output, inp_out_ids);
- }
- // feed-forward: takes attn_norm_output (not the attention result), i.e. a branch parallel to attention
- {
- ffn_output = build_ffn(attn_norm_output,
- model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
- NULL, NULL, NULL,
- model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
- NULL,
- LLM_FFN_GELU, LLM_FFN_SEQ, il);
- cb(ffn_output, "ffn_out", il);
- }
- cur = ggml_add(ctx0, cur, ffn_output); // attention output + FFN output
- cur = ggml_add(ctx0, cur, inpL); // + layer input (residual)
- cur = build_cvec(cur, il);
- cb(cur, "l_out", il);
- // input for next layer
- inpL = cur;
- }
- cur = build_norm(inpL,
- model.output_norm,
- model.output_norm_b,
- LLM_NORM, -1);
- cb(cur, "result_norm", -1);
- res->t_embd = cur; // the final normed hidden state is exposed as the embedding result
- cur = build_lora_mm(model.output, cur);
- cb(cur, "result_output_no_bias", -1);
- cur = ggml_add(ctx0, cur, model.output_b); // NOTE(review): added without a null check (llm_build_phi3 guards output_b) - confirm output_b is always loaded for phi2
- cb(cur, "result_output", -1);
- res->t_logits = cur;
- ggml_build_forward_expand(gf, cur);
- }
- };
- struct llm_build_phi3 : public llm_graph_context { // graph builder for phi3; the whole graph is assembled in the constructor
- llm_build_phi3(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { // model: weight tensors; params: forwarded to the base context; gf: graph that receives the nodes
- const int64_t n_embd_head = hparams.n_embd_head_v;
- const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); // width of the K and V slices taken from the fused QKV output below
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); // K and V heads must have the same size
- ggml_tensor * cur;
- ggml_tensor * inpL; // input of the current layer
- inpL = build_inp_embd(model.tok_embd);
- // inp_pos - contains the positions (consumed by the RoPE calls below)
- ggml_tensor * inp_pos = build_inp_pos();
- auto * inp_attn = build_attn_inp_kv_unified();
- for (int il = 0; il < n_layer; ++il) {
- auto * residual = inpL; // residual term for the attention branch
- // self-attention: norm, Q/K/V projection, RoPE with per-layer frequency factors, then attend
- {
- // rope freq factors for 128k context
- ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
- ggml_tensor* attn_norm_output = build_norm(inpL,
- model.layers[il].attn_norm,
- model.layers[il].attn_norm_b,
- LLM_NORM_RMS, il);
- cb(attn_norm_output, "attn_norm", il);
- ggml_tensor * Qcur = nullptr;
- ggml_tensor * Kcur = nullptr;
- ggml_tensor * Vcur = nullptr;
- if (model.layers[il].wqkv) { // fused QKV weight present: one matmul (no bias here), then slice
- cur = build_lora_mm(model.layers[il].wqkv, attn_norm_output);
- cb(cur, "wqkv", il);
- Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0 * sizeof(float) * (n_embd))); // Q: n_embd values per row, starting at offset 0
- Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1 * sizeof(float) * (n_embd))); // K: n_embd_gqa values per row, starting after Q
- Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1 * sizeof(float) * (n_embd + n_embd_gqa))); // V: n_embd_gqa values per row, starting after K; offsets use sizeof(float) - assumes the fused output is F32, TODO confirm
- } else { // separate Q/K/V weights, each with its own bias
- Qcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wq, attn_norm_output), model.layers[il].bq);
- Kcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wk, attn_norm_output), model.layers[il].bk);
- Vcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wv, attn_norm_output), model.layers[il].bv);
- }
- cb(Qcur, "Qcur", il);
- cb(Kcur, "Kcur", il);
- cb(Vcur, "Vcur", il);
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); // split into heads: (n_embd_head, n_head, n_tokens)
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
- Qcur = ggml_rope_ext(
- ctx0, Qcur, inp_pos, rope_factors,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
- Kcur = ggml_rope_ext(
- ctx0, Kcur, inp_pos, rope_factors,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
- cb(Qcur, "Qcur", il); // the same names are reported again for the reshaped/rotated tensors
- cb(Kcur, "Kcur", il);
- cb(Vcur, "Vcur", il);
- Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head))); // Q is scaled here instead of inside build_attn
- cb(Qcur, "Qcur", il); // reported once more for the scaled Q
- cur = build_attn(inp_attn, gf,
- model.layers[il].wo, model.layers[il].bo,
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il); // 1.0f in the scale position because Q was already scaled above
- }
- if (il == n_layer - 1) {
- // last layer only: keep just the rows selected by inp_out_ids so the remaining ops skip unused tokens
- ggml_tensor* inp_out_ids = build_inp_out_ids();
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
- residual = ggml_get_rows(ctx0, residual, inp_out_ids);
- }
- cur = ggml_add(ctx0, cur, residual); // attention output + layer input
- residual = cur; // becomes the residual term for the FFN branch
- cur = build_norm(cur,
- model.layers[il].ffn_norm, model.layers[il].ffn_norm_b,
- LLM_NORM_RMS, il);
- cb(cur, "ffn_norm", il);
- // feed-forward network: dense path when the layer has no ffn_gate_inp tensor
- if (model.layers[il].ffn_gate_inp == nullptr) {
- cur = build_ffn(cur,
- model.layers[il].ffn_up, NULL, NULL,
- NULL, NULL, NULL,
- model.layers[il].ffn_down, NULL, NULL,
- NULL,
- LLM_FFN_SWIGLU, LLM_FFN_SEQ, il);
- cb(cur, "ffn_out", il);
- } else {
- // MoE branch: taken when ffn_gate_inp is present
- cur = build_moe_ffn(cur,
- model.layers[il].ffn_gate_inp,
- model.layers[il].ffn_up_exps,
- model.layers[il].ffn_gate_exps,
- model.layers[il].ffn_down_exps,
- nullptr,
- n_expert, n_expert_used,
- LLM_FFN_SILU, true,
- false, 0.0,
- LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
- il);
- cb(cur, "ffn_moe_out", il);
- }
- cur = ggml_add(ctx0, residual, cur); // FFN output + post-attention residual
- cur = build_cvec(cur, il);
- cb(cur, "l_out", il);
- // input for next layer
- inpL = cur;
- }
- cur = build_norm(inpL,
- model.output_norm,
- model.output_norm_b,
- LLM_NORM_RMS, -1);
- cb(cur, "result_norm", -1);
- res->t_embd = cur; // the final normed hidden state is exposed as the embedding result
- cur = build_lora_mm(model.output, cur);
- if (model.output_b != nullptr) { // the output bias is optional
- cb(cur, "result_output_no_bias", -1);
- cur = ggml_add(ctx0, cur, model.output_b);
- }
- cb(cur, "result_output", -1);
- res->t_logits = cur;
- ggml_build_forward_expand(gf, cur);
- }
- };
- struct llm_build_plamo : public llm_graph_context { // graph builder for plamo; the whole graph is assembled in the constructor
- llm_build_plamo(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { // model: weight tensors; params: forwarded to the base context; gf: graph that receives the nodes
- const int64_t n_embd_head = hparams.n_embd_head_v;
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); // K and V heads must have the same size
- GGML_ASSERT(n_embd_head == hparams.n_rot); // RoPE below is called with n_embd_head as the rotated dimension count
- ggml_tensor * cur;
- ggml_tensor * inpL; // input of the current layer (also the residual term)
- inpL = build_inp_embd(model.tok_embd);
- // inp_pos - contains the positions (consumed by the RoPE calls below)
- ggml_tensor * inp_pos = build_inp_pos();
- auto * inp_attn = build_attn_inp_kv_unified();
- for (int il = 0; il < n_layer; ++il) {
- // norm (a single RMS norm whose output feeds both attention and the FFN)
- cur = build_norm(inpL,
- model.layers[il].attn_norm, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "attn_norm", il);
- ggml_tensor * attention_norm = cur; // kept so the FFN can reuse the normed input after attention overwrites cur
- // self-attention
- {
- // compute Q, K and V (no biases), then RoPE Q and K
- ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
- cb(Qcur, "Qcur", il);
- ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
- cb(Kcur, "Kcur", il);
- ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
- cb(Vcur, "Vcur", il);
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); // split into heads: (n_embd_head, n_head, n_tokens)
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
- Qcur = ggml_rope_ext(
- ctx0, Qcur, inp_pos, nullptr,
- n_embd_head, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
- Kcur = ggml_rope_ext(
- ctx0, Kcur, inp_pos, nullptr,
- n_embd_head, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
- cb(Qcur, "Qcur", il); // the same names are reported again for the reshaped/rotated tensors
- cb(Kcur, "Kcur", il);
- cb(Vcur, "Vcur", il);
- cur = build_attn(inp_attn, gf,
- model.layers[il].wo, NULL,
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
- }
- ggml_tensor * sa_out = cur; // attention branch output
- cur = attention_norm; // the FFN starts from the normed layer input, not from the attention output
- if (il == n_layer - 1) {
- // last layer only: keep just the rows selected by inp_out_ids so the remaining ops skip unused tokens
- ggml_tensor * inp_out_ids = build_inp_out_ids();
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
- sa_out = ggml_get_rows(ctx0, sa_out, inp_out_ids);
- inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
- }
- // feed-forward network (gated, SILU)
- {
- cur = build_ffn(cur,
- model.layers[il].ffn_up, NULL, NULL,
- model.layers[il].ffn_gate, NULL, NULL,
- model.layers[il].ffn_down, NULL, NULL,
- NULL,
- LLM_FFN_SILU, LLM_FFN_PAR, il);
- cb(cur, "ffn_out", il);
- }
- cur = ggml_add(ctx0, cur, sa_out); // FFN output + attention output
- cur = ggml_add(ctx0, cur, inpL); // + layer input (residual)
- cur = build_cvec(cur, il);
- cb(cur, "l_out", il);
- // input for next layer
- inpL = cur;
- }
- cur = inpL;
- cur = build_norm(cur,
- model.output_norm, NULL,
- LLM_NORM_RMS, -1);
- cb(cur, "result_norm", -1);
- res->t_embd = cur; // the final normed hidden state is exposed as the embedding result
- // lm_head
- cur = build_lora_mm(model.output, cur);
- cb(cur, "result_output", -1);
- res->t_logits = cur;
- ggml_build_forward_expand(gf, cur);
- }
- };
- struct llm_build_gpt2 : public llm_graph_context { // graph builder for gpt2; the whole graph is assembled in the constructor
- llm_build_gpt2(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { // model: weight tensors; params: forwarded to the base context; gf: graph that receives the nodes
- const int64_t n_embd_head = hparams.n_embd_head_v;
- const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); // width of the K and V slices taken from the fused QKV output below
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); // K and V heads must have the same size
- ggml_tensor * cur;
- ggml_tensor * pos; // position embeddings gathered for the current tokens
- ggml_tensor * inpL; // input of the current layer
- inpL = build_inp_embd(model.tok_embd);
- // inp_pos - contains the positions (used as row indices into model.pos_embd)
- ggml_tensor * inp_pos = build_inp_pos();
- auto * inp_attn = build_attn_inp_kv_unified();
- pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos);
- cb(pos, "pos_embd", -1);
- inpL = ggml_add(ctx0, inpL, pos); // token embedding + position embedding; no RoPE is applied in this builder
- cb(inpL, "inpL", -1);
- for (int il = 0; il < n_layer; ++il) {
- cur = build_norm(inpL,
- model.layers[il].attn_norm,
- model.layers[il].attn_norm_b,
- LLM_NORM, il);
- cb(cur, "attn_norm", il);
- // self-attention: fused QKV projection with bias, sliced into Q, K and V
- {
- cur = build_lora_mm(model.layers[il].wqkv, cur);
- cb(cur, "wqkv", il);
- cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
- cb(cur, "bqkv", il);
- ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); // Q: n_embd values per row, starting at offset 0
- ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); // K: n_embd_gqa values per row, starting after Q
- ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); // V: n_embd_gqa values per row, starting after K; offsets use sizeof(float) - assumes the fused output is F32, TODO confirm
- cb(Qcur, "Qcur", il);
- cb(Kcur, "Kcur", il);
- cb(Vcur, "Vcur", il);
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); // split into heads: (n_embd_head, n_head, n_tokens)
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
- cur = build_attn(inp_attn, gf,
- model.layers[il].wo, model.layers[il].bo,
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
- }
- if (il == n_layer - 1) {
- // last layer only: keep just the rows selected by inp_out_ids so the remaining ops skip unused tokens
- ggml_tensor * inp_out_ids = build_inp_out_ids();
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
- inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
- }
- // add the input (first residual connection)
- ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
- cb(ffn_inp, "ffn_inp", il);
- // feed-forward: norm, then up -> GELU -> down with biases
- {
- cur = build_norm(ffn_inp,
- model.layers[il].ffn_norm,
- model.layers[il].ffn_norm_b,
- LLM_NORM, il);
- cb(cur, "ffn_norm", il);
- cur = build_ffn(cur,
- model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
- NULL, NULL, NULL,
- model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
- NULL,
- LLM_FFN_GELU, LLM_FFN_SEQ, il);
- cb(cur, "ffn_out", il);
- }
- cur = ggml_add(ctx0, cur, ffn_inp); // second residual connection
- cur = build_cvec(cur, il);
- cb(cur, "l_out", il);
- // input for next layer
- inpL = cur;
- }
- cur = build_norm(inpL,
- model.output_norm,
- model.output_norm_b,
- LLM_NORM, -1);
- cb(cur, "result_norm", -1);
- res->t_embd = cur; // the final normed hidden state is exposed as the embedding result
- cur = build_lora_mm(model.output, cur);
- cb(cur, "result_output", -1);
- res->t_logits = cur;
- ggml_build_forward_expand(gf, cur);
- }
- };
- struct llm_build_codeshell : public llm_graph_context { // graph builder for codeshell; the whole graph is assembled in the constructor
- llm_build_codeshell(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { // model: weight tensors; params: forwarded to the base context; gf: graph that receives the nodes
- const int64_t n_embd_head = hparams.n_embd_head_v;
- const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); // width of the K and V slices taken from the fused QKV output below
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); // K and V heads must have the same size
- GGML_ASSERT(n_embd_head == hparams.n_rot); // the head size must equal the rotated dimension count
- ggml_tensor * cur;
- ggml_tensor * inpL; // input of the current layer
- inpL = build_inp_embd(model.tok_embd);
- // inp_pos - contains the positions (consumed by the RoPE calls below)
- ggml_tensor * inp_pos = build_inp_pos();
- auto * inp_attn = build_attn_inp_kv_unified();
- for (int il = 0; il < n_layer; ++il) {
- cur = build_norm(inpL,
- model.layers[il].attn_norm,
- model.layers[il].attn_norm_b,
- LLM_NORM, il);
- cb(cur, "attn_norm", il);
- // self-attention: fused QKV projection with bias, sliced into Q, K and V, RoPE on Q and K
- {
- cur = build_lora_mm(model.layers[il].wqkv, cur);
- cb(cur, "wqkv", il);
- cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
- cb(cur, "bqkv", il);
- ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); // Q: n_embd values per row, starting at offset 0
- ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); // K: n_embd_gqa values per row, starting after Q
- ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); // V: n_embd_gqa values per row, starting after K; offsets use sizeof(float) - assumes the fused output is F32, TODO confirm
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); // split into heads: (n_embd_head, n_head, n_tokens)
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
- Qcur = ggml_rope_ext(
- ctx0, Qcur, inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
- Kcur = ggml_rope_ext(
- ctx0, Kcur, inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
- cb(Qcur, "Qcur", il); // Q/K/V are only reported after the reshape and RoPE in this builder
- cb(Kcur, "Kcur", il);
- cb(Vcur, "Vcur", il);
- cur = build_attn(inp_attn, gf,
- model.layers[il].wo, model.layers[il].bo,
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
- }
- if (il == n_layer - 1) {
- // last layer only: keep just the rows selected by inp_out_ids so the remaining ops skip unused tokens
- ggml_tensor * inp_out_ids = build_inp_out_ids();
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
- inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
- }
- // add the input (first residual connection)
- ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
- cb(ffn_inp, "ffn_inp", il);
- // feed-forward: norm, then up -> GELU -> down with biases
- {
- cur = build_norm(ffn_inp,
- model.layers[il].ffn_norm,
- model.layers[il].ffn_norm_b,
- LLM_NORM, il);
- cb(cur, "ffn_norm", il);
- cur = build_ffn(cur,
- model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
- NULL, NULL, NULL,
- model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
- NULL,
- LLM_FFN_GELU, LLM_FFN_SEQ, il);
- cb(cur, "ffn_out", il);
- }
- cur = ggml_add(ctx0, cur, ffn_inp); // second residual connection
- cur = build_cvec(cur, il);
- cb(cur, "l_out", il);
- // input for next layer
- inpL = cur;
- }
- cur = build_norm(inpL,
- model.output_norm,
- model.output_norm_b,
- LLM_NORM, -1);
- cb(cur, "result_norm", -1);
- res->t_embd = cur; // the final normed hidden state is exposed as the embedding result
- cur = build_lora_mm(model.output, cur);
- cb(cur, "result_output", -1);
- res->t_logits = cur;
- ggml_build_forward_expand(gf, cur);
- }
- };
- struct llm_build_orion : public llm_graph_context { // graph builder for orion; the whole graph is assembled in the constructor
- llm_build_orion(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { // model: weight tensors; params: forwarded to the base context; gf: graph that receives the nodes
- const int64_t n_embd_head = hparams.n_embd_head_v;
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); // K and V heads must have the same size
- GGML_ASSERT(n_embd_head == hparams.n_rot); // the head size must equal the rotated dimension count
- ggml_tensor * cur;
- ggml_tensor * inpL; // input of the current layer
- inpL = build_inp_embd(model.tok_embd);
- // inp_pos - contains the positions (consumed by the RoPE calls below)
- ggml_tensor * inp_pos = build_inp_pos();
- auto * inp_attn = build_attn_inp_kv_unified();
- for (int il = 0; il < n_layer; ++il) {
- ggml_tensor * inpSA = inpL; // residual term for the attention branch
- // norm (LLM_NORM with weight and bias)
- cur = build_norm(inpL,
- model.layers[il].attn_norm, model.layers[il].attn_norm_b,
- LLM_NORM, il);
- cb(cur, "attn_norm", il);
- // self-attention
- {
- // compute Q, K and V, then RoPE Q and K; the bias adds below are commented out, so no bias is applied
- ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
- cb(Qcur, "Qcur", il);
- // if (model.layers[il].bq) {
- // Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
- // cb(Qcur, "Qcur", il);
- // }
- ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
- cb(Kcur, "Kcur", il);
- // if (model.layers[il].bk) {
- // Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
- // cb(Kcur, "Kcur", il);
- // }
- ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
- cb(Vcur, "Vcur", il);
- // if (model.layers[il].bv) {
- // Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
- // cb(Vcur, "Vcur", il);
- // }
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); // split into heads: (n_embd_head, n_head, n_tokens)
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
- Qcur = ggml_rope_ext(
- ctx0, Qcur, inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
- Kcur = ggml_rope_ext(
- ctx0, Kcur, inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
- cb(Qcur, "Qcur", il); // the same names are reported again for the reshaped/rotated tensors
- cb(Kcur, "Kcur", il);
- cb(Vcur, "Vcur", il);
- cur = build_attn(inp_attn, gf,
- model.layers[il].wo, NULL,
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
- }
- if (il == n_layer - 1) {
- // last layer only: keep just the rows selected by inp_out_ids so the remaining ops skip unused tokens
- ggml_tensor * inp_out_ids = build_inp_out_ids();
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
- inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
- }
- ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); // first residual connection: attention output + layer input
- cb(ffn_inp, "ffn_inp", il);
- // feed-forward network: norm, then gated SILU FFN without biases
- cur = build_norm(ffn_inp,
- model.layers[il].ffn_norm, model.layers[il].ffn_norm_b,
- LLM_NORM, il);
- cb(cur, "ffn_norm", il);
- cur = build_ffn(cur,
- model.layers[il].ffn_up, NULL, NULL,
- model.layers[il].ffn_gate, NULL, NULL,
- model.layers[il].ffn_down, NULL, NULL,
- NULL,
- LLM_FFN_SILU, LLM_FFN_PAR, il);
- cb(cur, "ffn_out", il);
- cur = ggml_add(ctx0, cur, ffn_inp); // second residual connection
- cur = build_cvec(cur, il);
- cb(cur, "l_out", il);
- // input for next layer
- inpL = cur;
- }
- cur = inpL;
- cur = build_norm(cur,
- model.output_norm, model.output_norm_b,
- LLM_NORM, -1);
- cb(cur, "result_norm", -1);
- res->t_embd = cur; // the final normed hidden state is exposed as the embedding result
- // lm_head
- cur = build_lora_mm(model.output, cur);
- cb(cur, "result_output", -1);
- res->t_logits = cur;
- ggml_build_forward_expand(gf, cur);
- }
- };
- struct llm_build_internlm2 : public llm_graph_context { // graph builder for internlm2; the whole graph is assembled in the constructor
- llm_build_internlm2(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { // model: weight tensors; params: forwarded to the base context; gf: graph that receives the nodes
- const int64_t n_embd_head = hparams.n_embd_head_v;
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); // K and V heads must have the same size
- GGML_ASSERT(n_embd_head == hparams.n_rot); // the head size must equal the rotated dimension count
- ggml_tensor * cur;
- ggml_tensor * inpL; // input of the current layer
- inpL = build_inp_embd(model.tok_embd);
- // inp_pos - contains the positions (consumed by the RoPE calls below)
- ggml_tensor * inp_pos = build_inp_pos();
- auto * inp_attn = build_attn_inp_kv_unified();
- for (int il = 0; il < n_layer; ++il) {
- ggml_tensor * inpSA = inpL; // residual term for the attention branch
- // norm (RMS, weight only)
- cur = build_norm(inpL,
- model.layers[il].attn_norm, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "attn_norm", il);
- // self-attention
- {
- // compute Q, K and V (each bias is added only when its tensor is present), then RoPE Q and K
- ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
- cb(Qcur, "Qcur", il);
- if (model.layers[il].bq) {
- Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
- cb(Qcur, "Qcur", il);
- }
- ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
- cb(Kcur, "Kcur", il);
- if (model.layers[il].bk) {
- Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
- cb(Kcur, "Kcur", il);
- }
- ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
- cb(Vcur, "Vcur", il);
- if (model.layers[il].bv) {
- Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
- cb(Vcur, "Vcur", il);
- }
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); // split into heads: (n_embd_head, n_head, n_tokens)
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
- Qcur = ggml_rope_ext(
- ctx0, Qcur, inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
- Kcur = ggml_rope_ext(
- ctx0, Kcur, inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
- cb(Qcur, "Qcur", il); // the same names are reported again for the reshaped/rotated tensors
- cb(Kcur, "Kcur", il);
- cb(Vcur, "Vcur", il);
- cur = build_attn(inp_attn, gf,
- model.layers[il].wo, model.layers[il].bo,
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
- }
- if (il == n_layer - 1) {
- // last layer only: keep just the rows selected by inp_out_ids so the remaining ops skip unused tokens
- ggml_tensor * inp_out_ids = build_inp_out_ids();
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
- inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
- }
- ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); // first residual connection: attention output + layer input
- cb(ffn_inp, "ffn_inp", il);
- // feed-forward network: RMS norm, then gated SILU FFN without biases
- cur = build_norm(ffn_inp,
- model.layers[il].ffn_norm, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "ffn_norm", il);
- cur = build_ffn(cur,
- model.layers[il].ffn_up, NULL, NULL,
- model.layers[il].ffn_gate, NULL, NULL,
- model.layers[il].ffn_down, NULL, NULL,
- NULL,
- LLM_FFN_SILU, LLM_FFN_PAR, il);
- cb(cur, "ffn_out", il);
- cur = ggml_add(ctx0, cur, ffn_inp); // second residual connection
- cur = build_cvec(cur, il);
- cb(cur, "l_out", il);
- // input for next layer
- inpL = cur;
- }
- cur = inpL;
- cur = build_norm(cur,
- model.output_norm, NULL,
- LLM_NORM_RMS, -1);
- cb(cur, "result_norm", -1);
- res->t_embd = cur; // the final normed hidden state is exposed as the embedding result
- // lm_head
- cur = build_lora_mm(model.output, cur);
- cb(cur, "result_output", -1);
- res->t_logits = cur;
- ggml_build_forward_expand(gf, cur);
- }
- };
- struct llm_build_minicpm3 : public llm_graph_context { // graph builder for minicpm3; the whole graph is assembled in the constructor
- llm_build_minicpm3(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { // model: weight tensors; params: forwarded to the base context; gf: graph that receives the nodes
- // TODO: if the model varies, these parameters need to be read from the model (they are hard-coded here)
- const int64_t n_embd_base = 256; // numerator of the lm_head scaling factor (n_embd_base / n_embd)
- const float scale_embd = 12.0f; // multiplier applied to the input embeddings
- const float scale_depth = 1.4f; // residual branches are scaled by scale_depth / sqrt(n_layer)
- const float kq_scale = 1.0f / sqrtf(float(hparams.n_embd_head_k));
- const uint32_t n_embd_head_qk_rope = hparams.n_rot; // per-head Q/K dims that receive RoPE
- const uint32_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot; // per-head Q/K dims that are left unrotated
- const uint32_t kv_lora_rank = hparams.n_lora_kv; // width of the compressed KV projection
- ggml_tensor * cur;
- ggml_tensor * inpL; // input of the current layer
- inpL = build_inp_embd(model.tok_embd);
- // scale the input embeddings
- inpL = ggml_scale(ctx0, inpL, scale_embd);
- cb(inpL, "inp_scaled", -1);
- // inp_pos - contains the positions (consumed by the RoPE calls below)
- ggml_tensor * inp_pos = build_inp_pos();
- auto * inp_attn = build_attn_inp_kv_unified();
- for (int il = 0; il < n_layer; ++il) {
- ggml_tensor * inpSA = inpL; // residual term for the attention branch
- ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
- // norm (RMS, weight only)
- cur = build_norm(inpL,
- model.layers[il].attn_norm, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "attn_norm", il);
- // self-attention: Q goes through wq_a -> norm -> wq_b; K/V come from a compressed projection (wkv_a_mqa -> norm -> wkv_b)
- {
- ggml_tensor * q = NULL;
- // {n_embd, q_lora_rank} * {n_embd, n_tokens} -> {q_lora_rank, n_tokens}
- q = ggml_mul_mat(ctx0, model.layers[il].wq_a, cur); // NOTE(review): plain ggml_mul_mat here, whereas other builders in this file use build_lora_mm - confirm this is intentional
- cb(q, "q", il);
- q = build_norm(q,
- model.layers[il].attn_q_a_norm, NULL,
- LLM_NORM_RMS, il);
- cb(q, "q", il);
- // {q_lora_rank, n_head * hparams.n_embd_head_k} * {q_lora_rank, n_tokens} -> {n_head * hparams.n_embd_head_k, n_tokens}
- q = ggml_mul_mat(ctx0, model.layers[il].wq_b, q);
- cb(q, "q", il);
- // split into {n_head * n_embd_head_qk_nope, n_tokens}
- ggml_tensor * q_nope = ggml_view_3d(ctx0, q, n_embd_head_qk_nope, n_head, n_tokens,
- ggml_row_size(q->type, hparams.n_embd_head_k),
- ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
- 0);
- cb(q_nope, "q_nope", il);
- // and {n_head * n_embd_head_qk_rope, n_tokens}
- ggml_tensor * q_pe = ggml_view_3d(ctx0, q, n_embd_head_qk_rope, n_head, n_tokens,
- ggml_row_size(q->type, hparams.n_embd_head_k),
- ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
- ggml_row_size(q->type, n_embd_head_qk_nope)); // the view starts right after the unrotated part of each head
- cb(q_pe, "q_pe", il);
- // {n_embd, kv_lora_rank + n_embd_head_qk_rope} * {n_embd, n_tokens} -> {kv_lora_rank + n_embd_head_qk_rope, n_tokens}
- ggml_tensor * kv_pe_compresseed = ggml_mul_mat(ctx0, model.layers[il].wkv_a_mqa, cur); // the misspelling also appears in the callback name string below, so renaming would change the reported name
- cb(kv_pe_compresseed, "kv_pe_compresseed", il);
- // split into {kv_lora_rank, n_tokens}
- ggml_tensor * kv_compressed = ggml_view_2d(ctx0, kv_pe_compresseed, kv_lora_rank, n_tokens,
- kv_pe_compresseed->nb[1],
- 0);
- cb(kv_compressed, "kv_compressed", il);
- // and {n_embd_head_qk_rope, n_tokens}
- ggml_tensor * k_pe = ggml_view_3d(ctx0, kv_pe_compresseed, n_embd_head_qk_rope, 1, n_tokens,
- kv_pe_compresseed->nb[1],
- kv_pe_compresseed->nb[1],
- ggml_row_size(kv_pe_compresseed->type, kv_lora_rank)); // a single "head" (dim 1 == 1), starting after the compressed KV part of each row
- cb(k_pe, "k_pe", il);
- // TODO: the CUDA backend used to not support non-cont. (RMS) norm, investigate removing ggml_cont
- kv_compressed = ggml_cont(ctx0, kv_compressed);
- kv_compressed = build_norm(kv_compressed,
- model.layers[il].attn_kv_a_norm, NULL,
- LLM_NORM_RMS, il);
- cb(kv_compressed, "kv_compressed", il);
- // {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)} * {kv_lora_rank, n_tokens} -> {n_head * (n_embd_head_qk_nope + n_embd_head_v), n_tokens}
- ggml_tensor * kv = ggml_mul_mat(ctx0, model.layers[il].wkv_b, kv_compressed);
- cb(kv, "kv", il);
- // split into {n_head * n_embd_head_qk_nope, n_tokens}
- ggml_tensor * k_nope = ggml_view_3d(ctx0, kv, n_embd_head_qk_nope, n_head, n_tokens,
- ggml_row_size(kv->type, n_embd_head_qk_nope + hparams.n_embd_head_v),
- ggml_row_size(kv->type, n_head * (n_embd_head_qk_nope + hparams.n_embd_head_v)),
- 0);
- cb(k_nope, "k_nope", il);
- // and {n_head * n_embd_head_v, n_tokens}
- ggml_tensor * v_states = ggml_view_3d(ctx0, kv, hparams.n_embd_head_v, n_head, n_tokens,
- ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)),
- ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)*n_head),
- ggml_row_size(kv->type, (n_embd_head_qk_nope))); // V follows the unrotated K part within each head
- cb(v_states, "v_states", il);
- v_states = ggml_cont(ctx0, v_states); // made contiguous before it is re-viewed as 2D below
- cb(v_states, "v_states", il);
- v_states = ggml_view_2d(ctx0, v_states, hparams.n_embd_head_v * n_head, n_tokens,
- ggml_row_size(kv->type, hparams.n_embd_head_v * n_head),
- 0);
- cb(v_states, "v_states", il);
- q_pe = ggml_cont(ctx0, q_pe); // TODO: the CUDA backend used to not support non-cont. RoPE, investigate removing this
- q_pe = ggml_rope_ext(
- ctx0, q_pe, inp_pos, rope_factors,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
- cb(q_pe, "q_pe", il);
- // shared RoPE key (one k_pe per token, repeated to q_pe's shape further down)
- k_pe = ggml_cont(ctx0, k_pe); // TODO: the CUDA backend used to not support non-cont. RoPE, investigate removing this
- k_pe = ggml_rope_ext(
- ctx0, k_pe, inp_pos, rope_factors,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
- cb(k_pe, "k_pe", il);
- ggml_tensor * q_states = ggml_concat(ctx0, q_nope, q_pe, 0); // per head: [unrotated part | rotated part] along dim 0
- cb(q_states, "q_states", il);
- ggml_tensor * k_states = ggml_concat(ctx0, k_nope, ggml_repeat(ctx0, k_pe, q_pe), 0); // k_pe is repeated to the shape of q_pe before the concat
- cb(k_states, "k_states", il);
- cur = build_attn(inp_attn, gf,
- model.layers[il].wo, NULL,
- q_states, k_states, v_states, nullptr, nullptr, kq_scale, il);
- }
- if (il == n_layer - 1) {
- // last layer only: keep just the rows selected by inp_out_ids so the remaining ops skip unused tokens
- ggml_tensor * inp_out_ids = build_inp_out_ids();
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
- inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
- }
- // scale_res - scale the hidden states for residual connection
- const float scale_res = scale_depth/sqrtf(float(n_layer));
- cur = ggml_scale(ctx0, cur, scale_res);
- cb(cur, "hidden_scaled", il);
- ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); // first residual connection: scaled attention output + layer input
- cb(ffn_inp, "ffn_inp", il);
- // feed-forward network: RMS norm, then gated SILU FFN without biases
- {
- cur = build_norm(ffn_inp,
- model.layers[il].ffn_norm, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "ffn_norm", il);
- cur = build_ffn(cur,
- model.layers[il].ffn_up, NULL, NULL,
- model.layers[il].ffn_gate, NULL, NULL,
- model.layers[il].ffn_down, NULL, NULL,
- NULL,
- LLM_FFN_SILU, LLM_FFN_PAR, il);
- cb(cur, "ffn_out", il);
- }
- // scale the hidden states for residual connection (same factor as for the attention branch)
- cur = ggml_scale(ctx0, cur, scale_res);
- cb(cur, "hidden_scaled_ffn", il);
- cur = ggml_add(ctx0, cur, ffn_inp); // second residual connection
- cur = build_cvec(cur, il);
- cb(cur, "l_out", il);
- // input for next layer
- inpL = cur;
- }
- cur = inpL;
- cur = build_norm(cur,
- model.output_norm, NULL,
- LLM_NORM_RMS, -1);
- cb(cur, "result_norm", -1);
- res->t_embd = cur; // the final normed hidden state is exposed as the embedding result (before the lm_head scaling)
- // lm_head scaling
- const float scale_lmhead = float(n_embd_base)/float(n_embd);
- cur = ggml_scale(ctx0, cur, scale_lmhead);
- cb(cur, "lmhead_scaling", -1);
- // lm_head
- cur = build_lora_mm(model.output, cur);
- cb(cur, "result_output", -1);
- res->t_logits = cur;
- ggml_build_forward_expand(gf, cur);
- }
- };
- struct llm_build_gemma : public llm_graph_context { // graph builder for gemma; the whole graph is assembled in the constructor
- llm_build_gemma(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { // model: weight tensors; params: forwarded to the base context; gf: graph that receives the nodes
- const int64_t n_embd_head = hparams.n_embd_head_v;
- ggml_tensor * cur;
- ggml_tensor * inpL; // input of the current layer (also the residual term)
- inpL = build_inp_embd(model.tok_embd);
- inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd)); // input embeddings are scaled by sqrt(n_embd)
- cb(inpL, "inp_scaled", -1);
- // inp_pos - contains the positions (consumed by the RoPE calls below)
- ggml_tensor * inp_pos = build_inp_pos();
- auto * inp_attn = build_attn_inp_kv_unified();
- for (int il = 0; il < n_layer; ++il) {
- // norm (RMS, weight only)
- cur = build_norm(inpL,
- model.layers[il].attn_norm, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "attn_norm", il);
- // self-attention
- {
- // compute Q, K and V (no biases), then RoPE Q and K
- ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
- cb(Qcur, "Qcur", il);
- ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
- cb(Kcur, "Kcur", il);
- ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
- cb(Vcur, "Vcur", il);
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); // split into heads: (n_embd_head, n_head, n_tokens)
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
- Qcur = ggml_rope_ext(
- ctx0, Qcur, inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow);
- Kcur = ggml_rope_ext(
- ctx0, Kcur, inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow);
- cb(Qcur, "Qcur", il); // the same names are reported again for the reshaped/rotated tensors
- cb(Kcur, "Kcur", il);
- cb(Vcur, "Vcur", il);
- Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head))); // Q is scaled here instead of inside build_attn
- cb(Qcur, "Qcur_scaled", il);
- cur = build_attn(inp_attn, gf,
- model.layers[il].wo, NULL,
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il); // 1.0f in the scale position because Q was already scaled above
- }
- if (il == n_layer - 1) {
- // last layer only: keep just the rows selected by inp_out_ids so the remaining ops skip unused tokens
- ggml_tensor * inp_out_ids = build_inp_out_ids();
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
- inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
- }
- ggml_tensor * sa_out = ggml_add(ctx0, cur, inpL); // first residual connection: attention output + layer input
- cb(sa_out, "sa_out", il);
- cur = build_norm(sa_out,
- model.layers[il].ffn_norm, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "ffn_norm", il);
- // feed-forward network (gated, GELU, no biases)
- {
- cur = build_ffn(cur,
- model.layers[il].ffn_up, NULL, NULL,
- model.layers[il].ffn_gate, NULL, NULL,
- model.layers[il].ffn_down, NULL, NULL,
- NULL,
- LLM_FFN_GELU, LLM_FFN_PAR, il);
- cb(cur, "ffn_out", il);
- }
- cur = ggml_add(ctx0, cur, sa_out); // second residual connection
- cur = build_cvec(cur, il);
- cb(cur, "l_out", il);
- // input for next layer
- inpL = cur;
- }
- cur = inpL;
- cur = build_norm(cur,
- model.output_norm, NULL,
- LLM_NORM_RMS, -1);
- cb(cur, "result_norm", -1);
- res->t_embd = cur; // the final normed hidden state is exposed as the embedding result
- // lm_head
- cur = build_lora_mm(model.output, cur);
- cb(cur, "result_output", -1);
- res->t_logits = cur;
- ggml_build_forward_expand(gf, cur);
- }
- };
- struct llm_build_gemma2 : public llm_graph_context { // forward-graph builder for the Gemma 2 architecture
- llm_build_gemma2(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { // expands the graph into gf; outputs are published via res->t_embd and res->t_logits
- const int64_t n_embd_head = hparams.n_embd_head_k;
- ggml_tensor * cur;
- ggml_tensor * inpL;
- inpL = build_inp_embd(model.tok_embd);
- inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd)); // input embeddings are multiplied by sqrt(n_embd)
- cb(inpL, "inp_scaled", -1);
- // inp_pos - contains the positions
- ggml_tensor * inp_pos = build_inp_pos();
- auto * inp_attn = build_attn_inp_kv_unified();
- for (int il = 0; il < n_layer; ++il) {
- // norm: RMS norm of the layer input with the attn_norm weights
- cur = build_norm(inpL,
- model.layers[il].attn_norm, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "attn_norm", il);
- // self-attention
- {
- // compute Q and K and RoPE them
- ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
- cb(Qcur, "Qcur", il);
- ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
- cb(Kcur, "Kcur", il);
- ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
- cb(Vcur, "Vcur", il);
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); // split each projection into heads: {n_embd_head, heads, n_tokens}
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
- Qcur = ggml_rope_ext(
- ctx0, Qcur, inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow);
- Kcur = ggml_rope_ext(
- ctx0, Kcur, inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow);
- cb(Qcur, "Qcur", il);
- cb(Kcur, "Kcur", il);
- cb(Vcur, "Vcur", il);
- // ref: https://github.com/google/gemma_pytorch/commit/03e657582d17cb5a8617ebf333c1c16f3694670e
- switch (model.type) { // all three listed sizes share the same 1/sqrt(n_embd_head) scaling; any other model.type aborts
- case LLM_TYPE_2B:
- case LLM_TYPE_9B:
- case LLM_TYPE_27B: Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head))); break;
- default: GGML_ABORT("fatal error");
- };
- cb(Qcur, "Qcur_scaled", il);
- cur = build_attn(inp_attn, gf,
- model.layers[il].wo, NULL,
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il); // NOTE(review): the 1.0f is presumably the attention scale, neutral because Q was scaled above - confirm against build_attn
- }
- cur = build_norm(cur, // extra RMS norm applied to the attention output before the residual add
- model.layers[il].attn_post_norm, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "attn_post_norm", il);
- if (il == n_layer - 1) {
- // skip computing output for unused tokens: on the last layer keep only the rows selected by inp_out_ids
- ggml_tensor * inp_out_ids = build_inp_out_ids();
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
- inpL = ggml_get_rows(ctx0, inpL, inp_out_ids); // the residual input is filtered the same way so the add below still lines up
- }
- ggml_tensor * sa_out = ggml_add(ctx0, cur, inpL); // first residual: post-normed attention output + layer input
- cb(sa_out, "sa_out", il);
- cur = build_norm(sa_out,
- model.layers[il].ffn_norm, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "ffn_norm", il);
- // feed-forward network (up, gate and down weights; LLM_FFN_GELU with LLM_FFN_PAR)
- {
- cur = build_ffn(cur,
- model.layers[il].ffn_up, NULL, NULL,
- model.layers[il].ffn_gate, NULL, NULL,
- model.layers[il].ffn_down, NULL, NULL,
- NULL,
- LLM_FFN_GELU, LLM_FFN_PAR, il);
- cb(cur, "ffn_out", il);
- }
- cur = build_norm(cur, // extra RMS norm applied to the FFN output before the residual add
- model.layers[il].ffn_post_norm, NULL,
- LLM_NORM_RMS, -1); // NOTE(review): layer index is -1 here and in the cb below, unlike the other per-layer norms which pass il - confirm this is intended
- cb(cur, "ffn_post_norm", -1);
- cur = ggml_add(ctx0, cur, sa_out); // second residual: post-normed FFN output + sa_out
- cur = build_cvec(cur, il); // NOTE(review): presumably applies a per-layer control vector - confirm in build_cvec
- cb(cur, "l_out", il);
- // input for next layer
- inpL = cur;
- }
- cur = inpL;
- cur = build_norm(cur,
- model.output_norm, NULL,
- LLM_NORM_RMS, -1);
- cb(cur, "result_norm", -1);
- res->t_embd = cur; // the normalized final hidden state is exposed as the embeddings result
- // lm_head: project the normalized hidden state with model.output
- cur = build_lora_mm(model.output, cur);
- // final logit soft-capping: logits = cap * tanh(logits / cap), with cap = hparams.f_final_logit_softcapping
- cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_final_logit_softcapping);
- cur = ggml_tanh(ctx0, cur);
- cur = ggml_scale(ctx0, cur, hparams.f_final_logit_softcapping);
- cb(cur, "result_output", -1);
- res->t_logits = cur;
- ggml_build_forward_expand(gf, cur);
- }
- };
- struct llm_build_gemma3 : public llm_graph_context { // forward-graph builder for the Gemma 3 architecture
- llm_build_gemma3(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { // expands the graph into gf; outputs are published via res->t_embd and res->t_logits
- const int64_t n_embd_head = hparams.n_embd_head_k;
- ggml_tensor * cur;
- ggml_tensor * inpL;
- inpL = build_inp_embd(model.tok_embd);
- // important: do not normalize weights for raw embeddings input (i.e. encoded image embeddings)
- if (ubatch.token) { // the sqrt(n_embd) scaling is applied only when the batch carries token ids
- inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd));
- cb(inpL, "inp_scaled", -1);
- }
- // inp_pos - contains the positions
- ggml_tensor * inp_pos = build_inp_pos();
- // TODO: is causal == true correct? might need some changes
- auto * inp_attn = build_attn_inp_kv_unified();
- for (int il = 0; il < n_layer; ++il) {
- const bool is_swa = hparams.is_swa(il);
- const float freq_base_l = is_swa ? hparams.rope_freq_base_train_swa : cparams.rope_freq_base; // layers flagged by is_swa use the *_train_swa RoPE parameters, the rest use the context parameters
- const float freq_scale_l = is_swa ? hparams.rope_freq_scale_train_swa : cparams.rope_freq_scale;
- // norm: RMS norm of the layer input with the attn_norm weights
- cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
- cb(cur, "attn_norm", il);
- // self-attention
- {
- // compute Q and K and RoPE them
- ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
- cb(Qcur, "Qcur", il);
- ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
- cb(Kcur, "Kcur", il);
- ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
- cb(Vcur, "Vcur", il);
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); // split each projection into heads: {n_embd_head, heads, n_tokens}
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
- Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il); // Q is RMS-normed after the head split and before RoPE
- cb(Qcur, "Qcur_normed", il);
- Qcur = ggml_rope_ext(
- ctx0, Qcur, inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
- ext_factor, attn_factor, beta_fast, beta_slow);
- Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il); // K is RMS-normed after the head split and before RoPE
- cb(Kcur, "Kcur_normed", il);
- Kcur = ggml_rope_ext(
- ctx0, Kcur, inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
- ext_factor, attn_factor, beta_fast, beta_slow);
- cb(Qcur, "Qcur", il);
- cb(Kcur, "Kcur", il);
- cb(Vcur, "Vcur", il);
- cur = build_attn(inp_attn, gf,
- model.layers[il].wo, NULL,
- Qcur, Kcur, Vcur, nullptr, nullptr, hparams.f_attention_scale, il); // Q is not pre-scaled here; hparams.f_attention_scale is handed to build_attn instead
- }
- cur = build_norm(cur, // extra RMS norm applied to the attention output before the residual add
- model.layers[il].attn_post_norm, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "attn_post_norm", il);
- if (il == n_layer - 1) {
- // skip computing output for unused tokens: on the last layer keep only the rows selected by inp_out_ids
- ggml_tensor * inp_out_ids = build_inp_out_ids();
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
- inpL = ggml_get_rows(ctx0, inpL, inp_out_ids); // the residual input is filtered the same way so the add below still lines up
- }
- ggml_tensor * sa_out = ggml_add(ctx0, cur, inpL); // first residual: post-normed attention output + layer input
- cb(sa_out, "sa_out", il);
- cur = build_norm(sa_out,
- model.layers[il].ffn_norm, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "ffn_norm", il);
- // feed-forward network (up, gate and down weights; LLM_FFN_GELU with LLM_FFN_PAR)
- {
- cur = build_ffn(cur,
- model.layers[il].ffn_up, NULL, NULL,
- model.layers[il].ffn_gate, NULL, NULL,
- model.layers[il].ffn_down, NULL, NULL,
- NULL,
- LLM_FFN_GELU, LLM_FFN_PAR, il);
- cb(cur, "ffn_out", il);
- }
- cur = build_norm(cur, // extra RMS norm applied to the FFN output before the residual add
- model.layers[il].ffn_post_norm, NULL,
- LLM_NORM_RMS, -1); // NOTE(review): layer index is -1 here and in the cb below, unlike the other per-layer norms which pass il - confirm this is intended
- cb(cur, "ffn_post_norm", -1);
- cur = ggml_add(ctx0, cur, sa_out); // second residual: post-normed FFN output + sa_out
- cur = build_cvec(cur, il); // NOTE(review): presumably applies a per-layer control vector - confirm in build_cvec
- cb(cur, "l_out", il);
- // input for next layer
- inpL = cur;
- }
- cur = inpL;
- cur = build_norm(cur,
- model.output_norm, NULL,
- LLM_NORM_RMS, -1);
- cb(cur, "result_norm", -1);
- res->t_embd = cur; // the normalized final hidden state is exposed as the embeddings result
- // lm_head: project the normalized hidden state with model.output
- cur = build_lora_mm(model.output, cur);
- cb(cur, "result_output", -1);
- res->t_logits = cur;
- ggml_build_forward_expand(gf, cur);
- }
- };
- // TODO: move up next to build_starcoder
- struct llm_build_starcoder2 : public llm_graph_context { // forward-graph builder for the StarCoder2 architecture
- llm_build_starcoder2(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { // expands the graph into gf; outputs are published via res->t_embd and res->t_logits
- const int64_t n_embd_head = hparams.n_embd_head_v;
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); // this builder requires equal K and V head sizes
- GGML_ASSERT(n_embd_head == hparams.n_rot); // ... and RoPE spanning the whole head dimension
- ggml_tensor * cur;
- ggml_tensor * inpL;
- inpL = build_inp_embd(model.tok_embd);
- // inp_pos - contains the positions
- ggml_tensor * inp_pos = build_inp_pos();
- auto * inp_attn = build_attn_inp_kv_unified();
- for (int il = 0; il < n_layer; ++il) {
- ggml_tensor * inpSA = inpL; // keep the layer input for the residual add after attention
- // norm: LLM_NORM with both weight (attn_norm) and bias (attn_norm_b)
- cur = build_norm(inpL,
- model.layers[il].attn_norm, model.layers[il].attn_norm_b,
- LLM_NORM, il);
- cb(cur, "attn_norm", il);
- // self-attention
- {
- // compute Q and K and RoPE them; each projection bias is added only if the tensor is present
- ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
- cb(Qcur, "Qcur", il);
- if (model.layers[il].bq) {
- Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
- cb(Qcur, "Qcur", il);
- }
- ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
- cb(Kcur, "Kcur", il);
- if (model.layers[il].bk) {
- Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
- cb(Kcur, "Kcur", il);
- }
- ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
- cb(Vcur, "Vcur", il);
- if (model.layers[il].bv) {
- Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
- cb(Vcur, "Vcur", il);
- }
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); // split each projection into heads: {n_embd_head, heads, n_tokens}
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
- Qcur = ggml_rope_ext(
- ctx0, Qcur, inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
- Kcur = ggml_rope_ext(
- ctx0, Kcur, inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
- cb(Qcur, "Qcur", il);
- cb(Kcur, "Kcur", il);
- cb(Vcur, "Vcur", il);
- cur = build_attn(inp_attn, gf,
- model.layers[il].wo, model.layers[il].bo,
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); // 1/sqrt(n_embd_head) is handed to build_attn rather than pre-applied to Q
- }
- if (il == n_layer - 1) {
- // skip computing output for unused tokens: on the last layer keep only the rows selected by inp_out_ids
- ggml_tensor * inp_out_ids = build_inp_out_ids();
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
- inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); // the residual input is filtered the same way so the add below still lines up
- }
- ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); // first residual: attention output + layer input
- cb(ffn_inp, "ffn_inp", il);
- // feed-forward network: norm, then up and down projections with biases (no gate tensor), LLM_FFN_GELU with LLM_FFN_SEQ
- cur = build_norm(ffn_inp,
- model.layers[il].ffn_norm, model.layers[il].ffn_norm_b,
- LLM_NORM, il);
- cb(cur, "ffn_norm", il);
- cur = build_ffn(cur,
- model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
- NULL, NULL, NULL,
- model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
- NULL,
- LLM_FFN_GELU, LLM_FFN_SEQ, il);
- cb(cur, "ffn_out", il);
- cur = ggml_add(ctx0, cur, ffn_inp); // second residual: FFN output + ffn_inp
- cur = build_cvec(cur, il); // NOTE(review): presumably applies a per-layer control vector - confirm in build_cvec
- cb(cur, "l_out", il);
- // input for next layer
- inpL = cur;
- }
- cur = inpL;
- cur = build_norm(cur,
- model.output_norm, model.output_norm_b,
- LLM_NORM, -1);
- cb(cur, "result_norm", -1);
- res->t_embd = cur; // the normalized final hidden state is exposed as the embeddings result
- // lm_head: project the normalized hidden state with model.output
- cur = build_lora_mm(model.output, cur);
- cb(cur, "result_output", -1);
- res->t_logits = cur;
- ggml_build_forward_expand(gf, cur);
- }
- };
- struct llm_build_mamba : public llm_graph_context { // forward-graph builder for the Mamba architecture
- const llama_model & model; // kept as a member because build_mamba_layer() reads the layer weights from it
- llm_build_mamba(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params), model(model) { // expands the graph into gf; outputs are published via res->t_embd and res->t_logits
- ggml_tensor * cur;
- ggml_tensor * inpL;
- // {n_embd, n_tokens}
- inpL = build_inp_embd(model.tok_embd);
- ggml_tensor * state_copy = build_inp_s_copy();
- ggml_tensor * state_mask = build_inp_s_mask();
- for (int il = 0; il < n_layer; ++il) {
- // norm: RMS norm of the layer input with the attn_norm weights
- cur = build_norm(inpL,
- model.layers[il].attn_norm, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "attn_norm", il);
- //cur = build_mamba_layer(gf, cur, state_copy, state_mask, il);
- cur = build_mamba_layer(gf, cur, state_copy, state_mask, ubatch, il);
- if (il == n_layer - 1) {
- // skip computing output for unused tokens: on the last layer keep only the rows selected by inp_out_ids
- ggml_tensor * inp_out_ids = build_inp_out_ids();
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
- inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
- }
- // residual
- cur = ggml_add(ctx0, cur, inpL);
- cur = build_cvec(cur, il); // NOTE(review): presumably applies a per-layer control vector - confirm in build_cvec
- cb(cur, "l_out", il);
- // input for next layer
- inpL = cur;
- }
- // final rmsnorm
- cur = build_norm(inpL,
- model.output_norm, NULL,
- LLM_NORM_RMS, -1);
- cb(cur, "result_norm", -1);
- res->t_embd = cur; // the normalized final hidden state is exposed as the embeddings result
- // lm_head: project the normalized hidden state with model.output
- cur = build_lora_mm(model.output, cur);
- cb(cur, "result_output", -1);
- res->t_logits = cur;
- ggml_build_forward_expand(gf, cur);
- }
- // TODO: split
- ggml_tensor * build_mamba_layer( // builds one Mamba block (in-projection, conv, SSM scan, out-projection) and returns its {n_embd, n_tokens} output
- ggml_cgraph * gf,
- ggml_tensor * cur,
- ggml_tensor * state_copy,
- ggml_tensor * state_mask,
- const llama_ubatch & ubatch,
- int il) const {
- const llama_kv_cache_recurrent * kv_self = static_cast<const llama_kv_cache_recurrent *>(memory); // NOTE(review): unchecked downcast - assumes memory is a recurrent cache for this architecture
- const auto kv_head = kv_self->head;
- const int64_t d_conv = hparams.ssm_d_conv;
- const int64_t d_inner = hparams.ssm_d_inner;
- const int64_t d_state = hparams.ssm_d_state;
- const int64_t dt_rank = hparams.ssm_dt_rank;
- const int64_t n_seqs = ubatch.n_seqs;
- // Some variants of the Mamba arch (e.g. FalconMamba) apply RMS norm on the B, C and Dt layers
- const bool ssm_dt_b_c_rms = hparams.ssm_dt_b_c_rms;
- // Use the same RMS norm as the final layer norm
- const float norm_rms_eps = hparams.f_norm_rms_eps;
- const int64_t n_seq_tokens = ubatch.n_seq_tokens;
- GGML_ASSERT(n_seqs != 0);
- GGML_ASSERT(ubatch.equal_seqs); // the 3d reshapes below need every sequence in the ubatch to have the same length
- GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs);
- ggml_tensor * conv_states_all = kv_self->k_l[il]; // conv states live in the per-layer K tensor of the cache
- ggml_tensor * ssm_states_all = kv_self->v_l[il]; // SSM states live in the per-layer V tensor of the cache
- // (ab)using the KV cache to store the states
- ggml_tensor * conv = build_copy_mask_state(
- gf, conv_states_all, state_copy, state_mask,
- hparams.n_embd_k_s(), n_seqs);
- conv = ggml_reshape_3d(ctx0, conv, d_conv - 1, d_inner, n_seqs);
- ggml_tensor * ssm = build_copy_mask_state(
- gf, ssm_states_all, state_copy, state_mask,
- hparams.n_embd_v_s(), n_seqs);
- ssm = ggml_reshape_3d(ctx0, ssm, d_state, d_inner, n_seqs);
- // {n_embd, n_tokens} => {n_embd, n_seq_tokens, n_seqs}
- cur = ggml_reshape_3d(ctx0, cur, cur->ne[0], n_seq_tokens, n_seqs);
- // {n_embd, 2*d_inner} @ {n_embd, n_seq_tokens, n_seqs} => {2*d_inner, n_seq_tokens, n_seqs}
- ggml_tensor * xz = build_lora_mm(model.layers[il].ssm_in, cur);
- // split the above in two
- // => {d_inner, n_seq_tokens, n_seqs}
- ggml_tensor * x = ggml_view_3d(ctx0, xz, d_inner, xz->ne[1], xz->ne[2], xz->nb[1], xz->nb[2], 0); // first d_inner elements of every row
- ggml_tensor * z = ggml_view_3d(ctx0, xz, d_inner, xz->ne[1], xz->ne[2], xz->nb[1], xz->nb[2], d_inner*ggml_element_size(xz)); // second d_inner elements of every row (byte offset)
- // conv
- {
- // => {d_conv - 1 + n_seq_tokens, d_inner, n_seqs}
- ggml_tensor * conv_x = ggml_concat(ctx0, conv, ggml_transpose(ctx0, x), 0);
- // copy last (d_conv - 1) columns back into the state cache
- ggml_tensor * last_conv = ggml_view_3d(ctx0, conv_x, d_conv - 1, d_inner, n_seqs, conv_x->nb[1], conv_x->nb[2], n_seq_tokens*(conv_x->nb[0]));
- ggml_build_forward_expand(gf, // the cache write-back is added to the graph explicitly since nothing downstream reads it
- ggml_cpy(ctx0, last_conv,
- ggml_view_1d(ctx0, conv_states_all,
- (d_conv - 1)*(d_inner)*(n_seqs),
- kv_head*(d_conv - 1)*(d_inner)*ggml_element_size(conv_states_all))));
- // 1D convolution
- // The equivalent is to make a self-overlapping view of conv_x
- // over d_conv columns at each stride in the 3rd dimension,
- // then element-wise multiply that with the conv1d weight,
- // then sum the elements of each row,
- // (the last two steps are a dot product over rows (also doable with mul_mat))
- // then permute away the ne[0] dimension,
- // and then you're left with the resulting x tensor.
- // For simultaneous sequences, all sequences need to have the same length.
- x = ggml_ssm_conv(ctx0, conv_x, model.layers[il].ssm_conv1d);
- // bias
- x = ggml_add(ctx0, x, model.layers[il].ssm_conv1d_b);
- x = ggml_silu(ctx0, x);
- }
- // ssm
- {
- // {d_inner, dt_rank + 2*d_state} @ {d_inner, n_seq_tokens, n_seqs} => {dt_rank + 2*d_state, n_seq_tokens, n_seqs}
- ggml_tensor * x_db = build_lora_mm(model.layers[il].ssm_x, x);
- // split each row of x_db into dt (dt_rank), B (d_state) and C (d_state), in that order
- ggml_tensor * dt = ggml_view_3d(ctx0, x_db, dt_rank, n_seq_tokens, n_seqs, x_db->nb[1], x_db->nb[2], 0);
- ggml_tensor * B = ggml_view_3d(ctx0, x_db, d_state, n_seq_tokens, n_seqs, x_db->nb[1], x_db->nb[2], ggml_element_size(x_db)*dt_rank);
- ggml_tensor * C = ggml_view_3d(ctx0, x_db, d_state, n_seq_tokens, n_seqs, x_db->nb[1], x_db->nb[2], ggml_element_size(x_db)*(dt_rank+d_state));
- // Some Mamba variants (e.g. FalconMamba) apply RMS norm in B, C & Dt layers
- if (ssm_dt_b_c_rms) {
- dt = ggml_rms_norm(ctx0, dt, norm_rms_eps);
- B = ggml_rms_norm(ctx0, B, norm_rms_eps);
- C = ggml_rms_norm(ctx0, C, norm_rms_eps);
- }
- // {dt_rank, d_inner} @ {dt_rank, n_seq_tokens, n_seqs} => {d_inner, n_seq_tokens, n_seqs}
- dt = build_lora_mm(model.layers[il].ssm_dt, dt);
- dt = ggml_add(ctx0, dt, model.layers[il].ssm_dt_b);
- // Custom operator to optimize the parallel associative scan
- // as described in the Annex D of the Mamba paper.
- // => {d_inner, n_seq_tokens, n_seqs} and {d_state, d_inner, n_seqs}
- ggml_tensor * y_ssm = ggml_ssm_scan(ctx0, ssm, x, dt, model.layers[il].ssm_a, B, C);
- // store last states
- ggml_build_forward_expand(gf,
- ggml_cpy(ctx0,
- ggml_view_1d(ctx0, y_ssm, d_state*d_inner*n_seqs, x->nb[3]), // NOTE(review): offset x->nb[3] presumably skips the {d_inner, n_seq_tokens, n_seqs} output so the view lands on the updated states - confirm against ggml_ssm_scan
- ggml_view_1d(ctx0, ssm_states_all, d_state*d_inner*n_seqs, kv_head*d_state*d_inner*ggml_element_size(ssm_states_all))));
- ggml_tensor * y = ggml_view_3d(ctx0, y_ssm, d_inner, n_seq_tokens, n_seqs, x->nb[1], x->nb[2], 0); // the scan output proper, viewed with x's strides from offset 0
- // TODO: skip computing output earlier for unused tokens
- // {d_inner, n_seq_tokens, n_seqs} * {d_inner} => {d_inner, n_seq_tokens, n_seqs}
- y = ggml_add(ctx0, y, ggml_mul(ctx0, x, model.layers[il].ssm_d)); // y += x * ssm_d
- y = ggml_mul(ctx0, y, ggml_silu(ctx0, ggml_cont(ctx0, z))); // multiply by silu(z); z is a strided view, so it is made contiguous first
- // {d_inner, n_embd} @ {d_inner, n_seq_tokens, n_seqs} => {n_embd, n_seq_tokens, n_seqs}
- cur = build_lora_mm(model.layers[il].ssm_out, y);
- }
- // {n_embd, n_seq_tokens, n_seqs} => {n_embd, n_tokens}
- cur = ggml_reshape_2d(ctx0, cur, cur->ne[0], n_seq_tokens * n_seqs);
- //cb(cur, "mamba_out", il);
- return cur;
- }
- };
- struct llm_build_command_r : public llm_graph_context { // forward-graph builder for the Command-R architecture
- llm_build_command_r(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { // expands the graph into gf; outputs are published via res->t_embd and res->t_logits
- const int64_t n_embd_head = hparams.n_embd_head_v;
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); // this builder requires equal K and V head sizes
- const float f_logit_scale = hparams.f_logit_scale;
- ggml_tensor * cur;
- ggml_tensor * inpL;
- inpL = build_inp_embd(model.tok_embd);
- // inp_pos - contains the positions
- ggml_tensor * inp_pos = build_inp_pos();
- auto * inp_attn = build_attn_inp_kv_unified();
- for (int il = 0; il < n_layer; ++il) {
- // norm: a single LLM_NORM (attn_norm weights, no bias) feeds both the attention and the FFN branch
- cur = build_norm(inpL,
- model.layers[il].attn_norm, NULL,
- LLM_NORM, il);
- cb(cur, "attn_norm", il);
- ggml_tensor * ffn_inp = cur; // the FFN reads the same normed input as attention, not the attention output
- // self-attention
- {
- // compute Q and K and RoPE them; each projection bias is added only if the tensor is present
- ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
- cb(Qcur, "Qcur", il);
- if (model.layers[il].bq) {
- Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
- cb(Qcur, "Qcur", il);
- }
- ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
- cb(Kcur, "Kcur", il);
- if (model.layers[il].bk) {
- Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
- cb(Kcur, "Kcur", il);
- }
- ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
- cb(Vcur, "Vcur", il);
- if (model.layers[il].bv) {
- Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
- cb(Vcur, "Vcur", il);
- }
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); // split each projection into heads: {n_embd_head, heads, n_tokens}
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
- if (model.layers[il].attn_q_norm) { // optional Q norm, applied before RoPE only when the tensor is present
- Qcur = build_norm(Qcur,
- model.layers[il].attn_q_norm,
- NULL,
- LLM_NORM, il);
- cb(Qcur, "Qcur", il);
- }
- Qcur = ggml_rope_ext(
- ctx0, Qcur, inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
- if (model.layers[il].attn_k_norm) { // optional K norm, applied before RoPE only when the tensor is present
- Kcur = build_norm(Kcur,
- model.layers[il].attn_k_norm,
- NULL,
- LLM_NORM, il);
- cb(Kcur, "Kcur", il);
- }
- Kcur = ggml_rope_ext(
- ctx0, Kcur, inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
- cb(Qcur, "Qcur", il);
- cb(Kcur, "Kcur", il);
- cb(Vcur, "Vcur", il);
- cur = build_attn(inp_attn, gf,
- model.layers[il].wo, model.layers[il].bo,
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); // 1/sqrt(n_embd_head) is handed to build_attn rather than pre-applied to Q
- }
- if (il == n_layer - 1) {
- // skip computing output for unused tokens: on the last layer keep only the rows selected by inp_out_ids
- ggml_tensor * inp_out_ids = build_inp_out_ids();
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
- inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
- ffn_inp = ggml_get_rows(ctx0, ffn_inp, inp_out_ids); // all three operands of the final sum must be filtered identically
- }
- ggml_tensor * attn_out = cur;
- // feed-forward network (up, gate and down weights; LLM_FFN_SILU with LLM_FFN_PAR), fed from ffn_inp
- {
- cur = build_ffn(ffn_inp,
- model.layers[il].ffn_up, NULL, NULL,
- model.layers[il].ffn_gate, NULL, NULL,
- model.layers[il].ffn_down, NULL, NULL,
- NULL,
- LLM_FFN_SILU, LLM_FFN_PAR, il);
- cb(cur, "ffn_out", il);
- }
- // add together residual + FFN + self-attention
- cur = ggml_add(ctx0, cur, inpL);
- cur = ggml_add(ctx0, cur, attn_out);
- cur = build_cvec(cur, il); // NOTE(review): presumably applies a per-layer control vector - confirm in build_cvec
- cb(cur, "l_out", il);
- // input for next layer
- inpL = cur;
- }
- cur = inpL;
- cur = build_norm(cur,
- model.output_norm, NULL,
- LLM_NORM, -1);
- cb(cur, "result_norm", -1);
- res->t_embd = cur; // the normalized final hidden state is exposed as the embeddings result
- // lm_head: project the normalized hidden state with model.output
- cur = build_lora_mm(model.output, cur);
- if (f_logit_scale) { // logits are scaled only when hparams.f_logit_scale is non-zero
- cur = ggml_scale(ctx0, cur, f_logit_scale);
- }
- cb(cur, "result_output", -1);
- res->t_logits = cur;
- ggml_build_forward_expand(gf, cur);
- }
- };
- struct llm_build_cohere2 : public llm_graph_context { // forward-graph builder for the Cohere2 architecture
- llm_build_cohere2(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { // expands the graph into gf; outputs are published via res->t_embd and res->t_logits
- const int64_t n_embd_head = hparams.n_embd_head_v;
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); // this builder requires equal K and V head sizes
- const float f_logit_scale = hparams.f_logit_scale;
- ggml_tensor * cur;
- ggml_tensor * inpL;
- inpL = build_inp_embd(model.tok_embd);
- // inp_pos - contains the positions
- ggml_tensor * inp_pos = build_inp_pos();
- auto * inp_attn = build_attn_inp_kv_unified();
- for (int il = 0; il < n_layer; ++il) {
- const bool is_swa = hparams.is_swa(il);
- // norm: a single LLM_NORM (attn_norm weights, no bias) feeds both the attention and the FFN branch
- cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM, il);
- cb(cur, "attn_norm", il);
- ggml_tensor * ffn_inp = cur; // the FFN reads the same normed input as attention, not the attention output
- // self-attention
- {
- // rope freq factors for 128k context
- ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
- // compute Q and K and RoPE them; each projection bias is added only if the tensor is present
- ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
- cb(Qcur, "Qcur", il);
- if (model.layers[il].bq) {
- Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
- cb(Qcur, "Qcur", il);
- }
- ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
- cb(Kcur, "Kcur", il);
- if (model.layers[il].bk) {
- Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
- cb(Kcur, "Kcur", il);
- }
- ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
- cb(Vcur, "Vcur", il);
- if (model.layers[il].bv) {
- Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
- cb(Vcur, "Vcur", il);
- }
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); // split each projection into heads: {n_embd_head, heads, n_tokens}
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
- if (is_swa) { // RoPE is applied only on layers flagged by hparams.is_swa(il); on the other layers Q and K are left unrotated
- Qcur = ggml_rope_ext(
- ctx0, Qcur, inp_pos, rope_factors,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
- Kcur = ggml_rope_ext(
- ctx0, Kcur, inp_pos, rope_factors,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
- }
- cb(Qcur, "Qcur", il);
- cb(Kcur, "Kcur", il);
- cb(Vcur, "Vcur", il);
- cur = build_attn(inp_attn, gf,
- model.layers[il].wo, model.layers[il].bo,
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); // 1/sqrt(n_embd_head) is handed to build_attn rather than pre-applied to Q
- }
- if (il == n_layer - 1) {
- // skip computing output for unused tokens: on the last layer keep only the rows selected by inp_out_ids
- ggml_tensor * inp_out_ids = build_inp_out_ids();
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
- inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
- ffn_inp = ggml_get_rows(ctx0, ffn_inp, inp_out_ids); // all three operands of the final sum must be filtered identically
- }
- ggml_tensor * attn_out = cur;
- // feed-forward network (up, gate and down weights; LLM_FFN_SILU with LLM_FFN_PAR), fed from ffn_inp
- {
- cur = build_ffn(ffn_inp, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate,
- NULL, NULL, model.layers[il].ffn_down, NULL, NULL, NULL, LLM_FFN_SILU, LLM_FFN_PAR,
- il);
- cb(cur, "ffn_out", il);
- }
- // add together residual + FFN + self-attention
- cur = ggml_add(ctx0, cur, inpL);
- cur = ggml_add(ctx0, cur, attn_out);
- cur = build_cvec(cur, il); // NOTE(review): presumably applies a per-layer control vector - confirm in build_cvec
- cb(cur, "l_out", il);
- // input for next layer
- inpL = cur;
- }
- cur = inpL;
- cur = build_norm(cur, model.output_norm, NULL, LLM_NORM, -1);
- cb(cur, "result_norm", -1);
- res->t_embd = cur; // the normalized final hidden state is exposed as the embeddings result
- // lm_head: project the normalized hidden state with model.output
- cur = build_lora_mm(model.output, cur);
- if (f_logit_scale) { // logits are scaled only when hparams.f_logit_scale is non-zero
- cur = ggml_scale(ctx0, cur, f_logit_scale);
- }
- cb(cur, "result_output", -1);
- res->t_logits = cur;
- ggml_build_forward_expand(gf, cur);
- }
- };
- // ref: https://allenai.org/olmo
- // based on the original build_llama() function, changes:
- // * non-parametric layer norm
- // * clamp qkv
- // * removed bias
- // * removed MoE
- struct llm_build_olmo : public llm_graph_context { // forward-graph builder for the OLMo architecture
- llm_build_olmo(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { // expands the graph into gf; outputs are published via res->t_embd and res->t_logits
- const int64_t n_embd_head = hparams.n_embd_head_v;
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); // this builder requires equal K and V head sizes
- GGML_ASSERT(n_embd_head == hparams.n_rot); // ... and RoPE spanning the whole head dimension
- ggml_tensor * cur;
- ggml_tensor * inpL;
- inpL = build_inp_embd(model.tok_embd);
- // inp_pos - contains the positions
- ggml_tensor * inp_pos = build_inp_pos();
- auto * inp_attn = build_attn_inp_kv_unified();
- for (int il = 0; il < n_layer; ++il) {
- ggml_tensor * inpSA = inpL; // keep the layer input for the residual add after attention
- // norm: LLM_NORM called with no weight and no bias tensors (non-parametric)
- cur = build_norm(inpL,
- NULL, NULL,
- LLM_NORM, il);
- cb(cur, "attn_norm", il);
- // self-attention
- {
- // compute Q and K and RoPE them; Q, K and V are each clamped to [-f_clamp_kqv, f_clamp_kqv] when f_clamp_kqv > 0
- ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
- cb(Qcur, "Qcur", il);
- if (hparams.f_clamp_kqv > 0.0f) {
- Qcur = ggml_clamp(ctx0, Qcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
- cb(Qcur, "Qcur", il);
- }
- ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
- cb(Kcur, "Kcur", il);
- if (hparams.f_clamp_kqv > 0.0f) {
- Kcur = ggml_clamp(ctx0, Kcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
- cb(Kcur, "Kcur", il);
- }
- ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
- cb(Vcur, "Vcur", il);
- if (hparams.f_clamp_kqv > 0.0f) {
- Vcur = ggml_clamp(ctx0, Vcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
- cb(Vcur, "Vcur", il);
- }
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); // split each projection into heads: {n_embd_head, heads, n_tokens}
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
- Qcur = ggml_rope_ext(
- ctx0, Qcur, inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
- Kcur = ggml_rope_ext(
- ctx0, Kcur, inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
- cb(Qcur, "Qcur", il);
- cb(Kcur, "Kcur", il);
- cb(Vcur, "Vcur", il);
- cur = build_attn(inp_attn, gf,
- model.layers[il].wo, nullptr,
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); // 1/sqrt(n_embd_head) is handed to build_attn rather than pre-applied to Q
- }
- if (il == n_layer - 1) {
- // skip computing output for unused tokens: on the last layer keep only the rows selected by inp_out_ids
- ggml_tensor * inp_out_ids = build_inp_out_ids();
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
- inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); // the residual input is filtered the same way so the add below still lines up
- }
- ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); // first residual: attention output + layer input
- cb(ffn_inp, "ffn_inp", il);
- // feed-forward network: non-parametric norm, then up, gate and down weights (LLM_FFN_SILU with LLM_FFN_PAR)
- cur = build_norm(ffn_inp,
- NULL, NULL,
- LLM_NORM, il);
- cb(cur, "ffn_norm", il);
- cur = build_ffn(cur,
- model.layers[il].ffn_up, NULL, NULL,
- model.layers[il].ffn_gate, NULL, NULL,
- model.layers[il].ffn_down, NULL, NULL,
- NULL,
- LLM_FFN_SILU, LLM_FFN_PAR, il);
- cb(cur, "ffn_out", il);
- cur = ggml_add(ctx0, cur, ffn_inp); // second residual: FFN output + ffn_inp
- cb(cur, "ffn_out", il); // the residual sum is reported under the same "ffn_out" name as the raw FFN output above
- cur = build_cvec(cur, il); // NOTE(review): presumably applies a per-layer control vector - confirm in build_cvec
- cb(cur, "l_out", il);
- // input for next layer
- inpL = cur;
- }
- cur = inpL;
- cur = build_norm(cur,
- NULL, NULL,
- LLM_NORM, -1);
- cb(cur, "result_norm", -1);
- res->t_embd = cur; // the normalized final hidden state is exposed as the embeddings result
- // lm_head: project the normalized hidden state with model.output
- cur = build_lora_mm(model.output, cur);
- cb(cur, "result_output", -1);
- res->t_logits = cur;
- ggml_build_forward_expand(gf, cur);
- }
- };
- struct llm_build_olmo2 : public llm_graph_context { // forward-graph builder for the OLMo 2 architecture
- llm_build_olmo2(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { // expands the graph into gf; outputs are published via res->t_embd and res->t_logits
- const int64_t n_embd_head = hparams.n_embd_head_v;
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); // this builder requires equal K and V head sizes
- GGML_ASSERT(n_embd_head == hparams.n_rot); // ... and RoPE spanning the whole head dimension
- ggml_tensor * cur;
- ggml_tensor * inpL;
- inpL = build_inp_embd(model.tok_embd);
- // inp_pos - contains the positions
- ggml_tensor * inp_pos = build_inp_pos();
- auto * inp_attn = build_attn_inp_kv_unified();
- for (int il = 0; il < n_layer; ++il) {
- ggml_tensor * inpSA = inpL; // keep the layer input for the residual add after attention
- cur = inpL; // no norm before attention: the projections read the raw layer input
- // self_attention
- {
- // compute Q and K and RoPE them
- ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
- cb(Qcur, "Qcur", il);
- ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
- cb(Kcur, "Kcur", il);
- ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
- cb(Vcur, "Vcur", il);
- Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, // Q and K are RMS-normed before the head split below
- LLM_NORM_RMS, il);
- cb(Qcur, "Qcur_normed", il);
- Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL,
- LLM_NORM_RMS, il);
- cb(Kcur, "Kcur_normed", il);
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); // split each projection into heads: {n_embd_head, heads, n_tokens}
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
- Qcur = ggml_rope_ext(
- ctx0, Qcur, inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
- Kcur = ggml_rope_ext(
- ctx0, Kcur, inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
- cb(Qcur, "Qcur", il);
- cb(Kcur, "Kcur", il);
- cb(Vcur, "Vcur", il);
- cur = build_attn(inp_attn, gf,
- model.layers[il].wo, NULL,
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); // 1/sqrt(n_embd_head) is handed to build_attn rather than pre-applied to Q
- }
- cur = build_norm(cur, // RMS norm applied to the attention output before the residual add
- model.layers[il].attn_post_norm, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "attn_post_norm", il);
- if (il == n_layer - 1) {
- // skip computing output for unused tokens: on the last layer keep only the rows selected by inp_out_ids
- ggml_tensor * inp_out_ids = build_inp_out_ids();
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
- inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); // the residual input is filtered the same way so the add below still lines up
- }
- ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); // first residual: post-normed attention output + layer input
- cb(ffn_inp, "ffn_inp", il);
- // feed-forward network, fed directly from ffn_inp with no norm in front (up, gate and down weights; LLM_FFN_SILU with LLM_FFN_PAR)
- cur = build_ffn(ffn_inp,
- model.layers[il].ffn_up, NULL, NULL,
- model.layers[il].ffn_gate, NULL, NULL,
- model.layers[il].ffn_down, NULL, NULL,
- NULL,
- LLM_FFN_SILU, LLM_FFN_PAR, il);
- cb(cur, "ffn_out", il);
- cur = build_norm(cur, // RMS norm applied to the FFN output before the residual add
- model.layers[il].ffn_post_norm, NULL,
- LLM_NORM_RMS, -1); // NOTE(review): layer index is -1 here and in the cb below, unlike attn_post_norm which passes il - confirm this is intended
- cb(cur, "ffn_post_norm", -1);
- cur = ggml_add(ctx0, cur, ffn_inp); // second residual: post-normed FFN output + ffn_inp
- cb(cur, "ffn_out", il); // the residual sum is reported under the same "ffn_out" name as the raw FFN output above
- cur = build_cvec(cur, il); // NOTE(review): presumably applies a per-layer control vector - confirm in build_cvec
- cb(cur, "l_out", il);
- // input for next layer
- inpL = cur;
- }
- cur = inpL;
- cur = build_norm(cur,
- model.output_norm, NULL,
- LLM_NORM_RMS, -1);
- cb(cur, "result_norm", -1);
- res->t_embd = cur; // the normalized final hidden state is exposed as the embeddings result
- // lm_head: project the normalized hidden state with model.output
- cur = build_lora_mm(model.output, cur);
- cb(cur, "result_output", -1);
- res->t_logits = cur;
- ggml_build_forward_expand(gf, cur);
- }
- };
- // based on the build_qwen2moe() function, changes:
- // * removed shared experts
- // * removed bias
- // * added q, k norm
- struct llm_build_olmoe : public llm_graph_context { // graph builder for the OLMoE architecture; all work happens in the constructor
- llm_build_olmoe(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { // model: weights wired into the graph; params: forwarded to the base context; gf: graph expanded at the end
- const int64_t n_embd_head = hparams.n_embd_head_v; // per-head size, read from the V head size
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); // K and V heads must have the same size
- GGML_ASSERT(n_embd_head == hparams.n_rot); // the rotary dimension must equal the head size
- ggml_tensor * cur;
- ggml_tensor * inpL;
- inpL = build_inp_embd(model.tok_embd); // input of the first layer, built from the token embedding tensor
- // inp_pos - contains the positions
- ggml_tensor * inp_pos = build_inp_pos();
- auto * inp_attn = build_attn_inp_kv_unified(); // attention input object reused by every layer's build_attn call
- for (int il = 0; il < n_layer; ++il) {
- ggml_tensor * inpSA = inpL; // layer input, kept for the residual add after attention
- // norm
- cur = build_norm(inpL,
- model.layers[il].attn_norm, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "attn_norm", il);
- // self_attention
- {
- // compute Q and K and RoPE them
- ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
- cb(Qcur, "Qcur", il);
- ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
- cb(Kcur, "Kcur", il);
- ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
- cb(Vcur, "Vcur", il);
- Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, // RMS norm on Q, applied before the per-head reshape
- LLM_NORM_RMS, il);
- cb(Qcur, "Qcur_normed", il);
- Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, // RMS norm on K, applied before the per-head reshape
- LLM_NORM_RMS, il);
- cb(Kcur, "Kcur_normed", il);
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); // split into heads: {n_embd_head, n_head, n_tokens}
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); // {n_embd_head, n_head_kv, n_tokens}
- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); // {n_embd_head, n_head_kv, n_tokens}
- Qcur = ggml_rope_ext( // rotary embedding on Q (no frequency-factor tensor is passed)
- ctx0, Qcur, inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
- Kcur = ggml_rope_ext( // same rotary parameters for K
- ctx0, Kcur, inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
- cb(Qcur, "Qcur", il);
- cb(Kcur, "Kcur", il);
- cb(Vcur, "Vcur", il);
- cur = build_attn(inp_attn, gf,
- model.layers[il].wo, NULL,
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); // attention scale is 1/sqrt(head size)
- }
- if (il == n_layer - 1) { // last layer only
- // skip computing output for unused tokens
- ggml_tensor * inp_out_ids = build_inp_out_ids();
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
- inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); // filter the residual with the same row ids so the add below lines up
- }
- ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); // first residual connection
- cb(ffn_inp, "ffn_inp", il);
- // MoE branch
- cur = build_norm(ffn_inp,
- model.layers[il].ffn_norm, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "ffn_norm", il);
- cur = build_moe_ffn(cur, // NOTE(review): the trailing bool/float arguments are positional options of build_moe_ffn - confirm their meaning there
- model.layers[il].ffn_gate_inp,
- model.layers[il].ffn_up_exps,
- model.layers[il].ffn_gate_exps,
- model.layers[il].ffn_down_exps,
- nullptr,
- n_expert, n_expert_used,
- LLM_FFN_SILU, false,
- false, 0.0,
- LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
- il);
- cb(cur, "ffn_moe_out", il);
- cur = ggml_add(ctx0, cur, ffn_inp); // second residual connection
- cur = build_cvec(cur, il);
- cb(cur, "l_out", il);
- // input for next layer
- inpL = cur;
- }
- cur = inpL;
- cur = build_norm(cur, // final RMS norm with model.output_norm
- model.output_norm, NULL,
- LLM_NORM_RMS, -1);
- cb(cur, "result_norm", -1);
- res->t_embd = cur; // published as the embedding result (taken before the lm_head projection)
- // lm_head
- cur = build_lora_mm(model.output, cur);
- cb(cur, "result_output", -1);
- res->t_logits = cur; // published as the logits result
- ggml_build_forward_expand(gf, cur); // add the final tensor and everything it depends on to gf
- }
- };
- struct llm_build_openelm : public llm_graph_context { // graph builder for the OpenELM architecture; all work happens in the constructor
- llm_build_openelm(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { // model: weights wired into the graph; params: forwarded to the base context; gf: graph expanded at the end
- const int64_t n_embd_head = hparams.n_embd_head_v; // per-head size, read from the V head size
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); // K and V heads must have the same size
- ggml_tensor * cur;
- ggml_tensor * inpL;
- inpL = build_inp_embd(model.tok_embd); // input of the first layer, built from the token embedding tensor
- // inp_pos - contains the positions
- ggml_tensor * inp_pos = build_inp_pos();
- auto * inp_attn = build_attn_inp_kv_unified(); // attention input object reused by every layer's build_attn call
- for (int il = 0; il < n_layer; ++il) {
- const int64_t n_head = hparams.n_head(il); // head counts are queried per layer here
- const int64_t n_head_kv = hparams.n_head_kv(il);
- const int64_t n_head_qkv = 2*n_head_kv + n_head; // total number of heads in the fused QKV projection
- cur = inpL;
- ggml_tensor * residual = cur; // layer input, kept for the residual add after attention
- // norm
- cur = build_norm(inpL,
- model.layers[il].attn_norm, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "attn_norm", il);
- // self-attention
- {
- cur = build_lora_mm(model.layers[il].wqkv, cur); // single fused projection producing Q, K and V
- cb(cur, "wqkv", il);
- cur = ggml_reshape_3d(ctx0, cur, n_embd_head_k, n_head_qkv, n_tokens); // {head size, n_head_qkv, n_tokens}
- ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, cur->nb[1], cur->nb[2], 0)); // Q: the first n_head heads
- cb(Qcur, "Qcur", il);
- ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, cur->nb[1], cur->nb[2], cur->nb[1]*n_head)); // K: the next n_head_kv heads
- cb(Kcur, "Kcur", il);
- ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, cur->nb[1], cur->nb[2], cur->nb[1]*(n_head+n_head_kv))); // V: the last n_head_kv heads
- cb(Vcur, "Vcur", il);
- Qcur = build_norm(Qcur, // RMS norm on the already head-split Q
- model.layers[il].attn_q_norm, NULL,
- LLM_NORM_RMS, il);
- cb(Qcur, "Qcur", il);
- Kcur = build_norm(Kcur, // RMS norm on the already head-split K
- model.layers[il].attn_k_norm, NULL,
- LLM_NORM_RMS, il);
- cb(Kcur, "Kcur", il);
- Qcur = ggml_rope_ext( // rotary embedding on Q (no frequency-factor tensor is passed)
- ctx0, Qcur, inp_pos, NULL,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
- Kcur = ggml_rope_ext( // same rotary parameters for K
- ctx0, Kcur, inp_pos, NULL,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
- cb(Qcur, "Qcur", il);
- cb(Kcur, "Kcur", il);
- cb(Vcur, "Vcur", il); // V gets no norm or RoPE; it must be reported under its own name, not Qcur's
- cur = build_attn(inp_attn, gf,
- model.layers[il].wo, NULL,
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); // attention scale is 1/sqrt(head size)
- }
- if (il == n_layer - 1) { // last layer only
- // skip computing output for unused tokens
- ggml_tensor * inp_out_ids = build_inp_out_ids();
- residual = ggml_get_rows(ctx0, residual, inp_out_ids); // filter the residual with the same row ids so the add below lines up
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
- }
- ggml_tensor * ffn_inp = ggml_add(ctx0, residual, cur); // first residual connection
- cb(ffn_inp, "ffn_inp", il);
- // feed-forward network
- {
- cur = build_norm(ffn_inp,
- model.layers[il].ffn_norm, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "ffn_norm", il);
- cur = build_ffn(cur, // up / gate / down weights without biases, SiLU activation, LLM_FFN_PAR mode
- model.layers[il].ffn_up, NULL, NULL,
- model.layers[il].ffn_gate, NULL, NULL,
- model.layers[il].ffn_down, NULL, NULL,
- NULL,
- LLM_FFN_SILU, LLM_FFN_PAR, il);
- cb(cur, "ffn_out", il);
- }
- cur = ggml_add(ctx0, cur, ffn_inp); // second residual connection
- cur = build_cvec(cur, il);
- cb(cur, "l_out", il);
- inpL = cur; // input for the next layer
- }
- cur = inpL;
- // norm
- cur = build_norm(cur,
- model.output_norm, NULL,
- LLM_NORM_RMS, -1);
- cb(cur, "result_norm", -1);
- res->t_embd = cur; // published as the embedding result (taken before the output projection)
- cur = build_lora_mm(model.output, cur);
- cb(cur, "result_output", -1);
- res->t_logits = cur; // published as the logits result
- ggml_build_forward_expand(gf, cur); // add the final tensor and everything it depends on to gf
- }
- };
- struct llm_build_gptneox : public llm_graph_context { // graph builder for the GPT-NeoX architecture; all work happens in the constructor
- llm_build_gptneox(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { // model: weights wired into the graph; params: forwarded to the base context; gf: graph expanded at the end
- const int64_t n_embd_head = hparams.n_embd_head_v; // per-head size, read from the V head size
- const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); // used below as the width of the K and V slices of the fused QKV output
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); // K and V heads must have the same size
- ggml_tensor * cur;
- ggml_tensor * inpL;
- inpL = build_inp_embd(model.tok_embd); // input of the first layer, built from the token embedding tensor
- // inp_pos - contains the positions
- ggml_tensor * inp_pos = build_inp_pos();
- auto * inp_attn = build_attn_inp_kv_unified(); // attention input object reused by every layer's build_attn call
- for (int il = 0; il < n_layer; ++il) {
- cur = build_norm(inpL, // LLM_NORM with both weight and bias
- model.layers[il].attn_norm,
- model.layers[il].attn_norm_b,
- LLM_NORM, il);
- cb(cur, "attn_norm", il);
- // self-attention
- {
- cur = build_lora_mm(model.layers[il].wqkv, cur); // single fused projection producing Q, K and V
- cb(cur, "wqkv", il);
- cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
- cb(cur, "bqkv", il);
- ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); // Q: first n_embd columns; offsets use sizeof(float), so this assumes an F32 tensor - TODO confirm
- ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); // K: n_embd_gqa columns starting right after Q
- ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); // V: n_embd_gqa columns starting right after K
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); // split into heads: {n_embd_head, n_head, n_tokens}
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); // {n_embd_head, n_head_kv, n_tokens}
- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); // {n_embd_head, n_head_kv, n_tokens}
- Qcur = ggml_rope_ext( // rotary embedding on Q (no frequency-factor tensor is passed)
- ctx0, Qcur, inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
- Kcur = ggml_rope_ext( // same rotary parameters for K
- ctx0, Kcur, inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
- cb(Qcur, "Qcur", il);
- cb(Kcur, "Kcur", il);
- cb(Vcur, "Vcur", il);
- cur = build_attn(inp_attn, gf,
- model.layers[il].wo, model.layers[il].bo,
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); // attention scale is 1/sqrt(head size)
- }
- if (il == n_layer - 1) { // last layer only
- // skip computing output for unused tokens
- ggml_tensor * inp_out_ids = build_inp_out_ids();
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
- inpL = ggml_get_rows(ctx0, inpL, inp_out_ids); // inpL doubles as the residual here, so it is filtered with the same row ids
- }
- // ffn
- if (hparams.use_par_res) {
- // attention and ffn are computed in parallel
- // x = x + attn(ln1(x)) + ffn(ln2(x))
- ggml_tensor * attn_out = cur;
- cur = build_norm(inpL, // the FFN branch normalizes the layer input, not the attention output
- model.layers[il].ffn_norm,
- model.layers[il].ffn_norm_b,
- LLM_NORM, il);
- cb(cur, "ffn_norm", il);
- cur = build_ffn(cur, // up and down weights with biases, no gate, GELU activation, LLM_FFN_SEQ mode
- model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
- NULL, NULL, NULL,
- model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
- NULL,
- LLM_FFN_GELU, LLM_FFN_SEQ, il);
- cb(cur, "ffn_out", il);
- cur = ggml_add(ctx0, cur, inpL); // x + ffn(ln2(x))
- cb(cur, "ffn_out", il);
- cur = ggml_add(ctx0, cur, attn_out); // ... + attn(ln1(x))
- cur = build_cvec(cur, il);
- cb(cur, "l_out", il);
- // input for next layer
- inpL = cur;
- } else {
- // attention and ffn are computed sequentially
- // x = x + attn(ln1(x))
- // x = x + ffn(ln2(x))
- ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL); // first residual connection
- cb(ffn_inp, "ffn_inp", il);
- cur = build_norm(ffn_inp,
- model.layers[il].ffn_norm,
- model.layers[il].ffn_norm_b,
- LLM_NORM, il);
- cb(cur, "ffn_norm", il);
- cur = build_ffn(cur, // same FFN configuration as the parallel branch
- model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
- NULL, NULL, NULL,
- model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
- NULL,
- LLM_FFN_GELU, LLM_FFN_SEQ, il);
- cb(cur, "ffn_out", il);
- cur = ggml_add(ctx0, cur, ffn_inp); // second residual connection
- cur = build_cvec(cur, il);
- cb(cur, "l_out", il);
- // input for next layer
- inpL = cur;
- }
- }
- cur = build_norm(inpL, // final LLM_NORM with output_norm weight and bias
- model.output_norm,
- model.output_norm_b,
- LLM_NORM, -1);
- cb(cur, "result_norm", -1);
- res->t_embd = cur; // published as the embedding result (taken before the output projection)
- cur = build_lora_mm(model.output, cur);
- cb(cur, "result_output", -1);
- res->t_logits = cur; // published as the logits result
- ggml_build_forward_expand(gf, cur); // add the final tensor and everything it depends on to gf
- }
- };
- struct llm_build_arctic : public llm_graph_context { // graph builder for the Arctic architecture (dense FFN plus a MoE branch in every layer); all work happens in the constructor
- llm_build_arctic(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { // model: weights wired into the graph; params: forwarded to the base context; gf: graph expanded at the end
- const int64_t n_embd_head = hparams.n_embd_head_v; // per-head size, read from the V head size
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); // K and V heads must have the same size
- GGML_ASSERT(n_embd_head == hparams.n_rot); // the rotary dimension must equal the head size
- ggml_tensor * cur;
- ggml_tensor * inpL;
- inpL = build_inp_embd(model.tok_embd); // input of the first layer, built from the token embedding tensor
- // inp_pos - contains the positions
- ggml_tensor * inp_pos = build_inp_pos();
- auto * inp_attn = build_attn_inp_kv_unified(); // attention input object reused by every layer's build_attn call
- for (int il = 0; il < n_layer; ++il) {
- ggml_tensor * inpSA = inpL; // layer input; reused for the attention residual and as the MoE branch input
- // norm
- cur = build_norm(inpL,
- model.layers[il].attn_norm, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "attn_norm", il);
- // self-attention
- {
- // compute Q and K and RoPE them
- ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
- cb(Qcur, "Qcur", il);
- ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
- cb(Kcur, "Kcur", il);
- ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
- cb(Vcur, "Vcur", il);
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); // split into heads: {n_embd_head, n_head, n_tokens}
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); // {n_embd_head, n_head_kv, n_tokens}
- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); // {n_embd_head, n_head_kv, n_tokens}
- Qcur = ggml_rope_ext( // rotary embedding on Q (no frequency-factor tensor is passed)
- ctx0, Qcur, inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
- Kcur = ggml_rope_ext( // same rotary parameters for K
- ctx0, Kcur, inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
- cb(Qcur, "Qcur", il);
- cb(Kcur, "Kcur", il);
- cb(Vcur, "Vcur", il);
- cur = build_attn(inp_attn, gf,
- model.layers[il].wo, NULL,
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); // attention scale is 1/sqrt(head size)
- }
- if (il == n_layer - 1) { // last layer only
- // skip computing output for unused tokens
- ggml_tensor * inp_out_ids = build_inp_out_ids();
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
- inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); // filtered too, because it feeds both the residual add and the MoE branch below
- }
- ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); // residual connection around attention
- cb(ffn_inp, "ffn_inp", il);
- // feed-forward network
- cur = build_norm(ffn_inp,
- model.layers[il].ffn_norm, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "ffn_norm", il);
- cur = build_ffn(cur, // up / gate / down weights without biases, SiLU activation, LLM_FFN_PAR mode
- model.layers[il].ffn_up, NULL, NULL,
- model.layers[il].ffn_gate, NULL, NULL,
- model.layers[il].ffn_down, NULL, NULL,
- NULL,
- LLM_FFN_SILU, LLM_FFN_PAR, il);
- cb(cur, "ffn_out", il);
- ggml_tensor * ffn_out = ggml_add(ctx0, cur, ffn_inp); // dense FFN output plus its residual
- cb(ffn_out, "ffn_out", il);
- // MoE
- cur = build_norm(inpSA, // note: the MoE branch is normalized from the layer input (inpSA), not from ffn_inp
- model.layers[il].ffn_norm_exps, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "ffn_norm_exps", il);
- cur = build_moe_ffn(cur, // NOTE(review): the trailing bool/float arguments are positional options of build_moe_ffn - confirm their meaning there
- model.layers[il].ffn_gate_inp,
- model.layers[il].ffn_up_exps,
- model.layers[il].ffn_gate_exps,
- model.layers[il].ffn_down_exps,
- nullptr,
- n_expert, n_expert_used,
- LLM_FFN_SILU, true,
- false, 0.0,
- LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
- il);
- cb(cur, "ffn_moe_out", il);
- cur = ggml_add(ctx0, cur, ffn_out); // combine the MoE branch with the dense FFN result
- cb(cur, "ffn_out", il);
- cur = build_cvec(cur, il);
- cb(cur, "l_out", il);
- // input for next layer
- inpL = cur;
- }
- cur = inpL;
- cur = build_norm(cur, // final RMS norm with model.output_norm
- model.output_norm, NULL,
- LLM_NORM_RMS, -1);
- cb(cur, "result_norm", -1);
- res->t_embd = cur; // published as the embedding result (taken before the lm_head projection)
- // lm_head
- cur = build_lora_mm(model.output, cur);
- cb(cur, "result_output", -1);
- res->t_logits = cur; // published as the logits result
- ggml_build_forward_expand(gf, cur); // add the final tensor and everything it depends on to gf
- }
- };
- struct llm_build_deepseek : public llm_graph_context { // graph builder for the DeepSeek architecture (leading dense layers, then MoE layers with a shared expert); all work happens in the constructor
- llm_build_deepseek(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { // model: weights wired into the graph; params: forwarded to the base context; gf: graph expanded at the end
- const int64_t n_embd_head = hparams.n_embd_head_v; // per-head size, read from the V head size
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); // K and V heads must have the same size
- GGML_ASSERT(n_embd_head == hparams.n_rot); // the rotary dimension must equal the head size
- ggml_tensor * cur;
- ggml_tensor * inpL;
- inpL = build_inp_embd(model.tok_embd); // input of the first layer, built from the token embedding tensor
- // inp_pos - contains the positions
- ggml_tensor * inp_pos = build_inp_pos();
- auto * inp_attn = build_attn_inp_kv_unified(); // attention input object reused by every layer's build_attn call
- const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale; // a zero f_attention_scale means "use the default 1/sqrt(head size)"
- for (int il = 0; il < n_layer; ++il) {
- ggml_tensor * inpSA = inpL; // layer input, kept for the residual add after attention
- // norm
- cur = build_norm(inpL,
- model.layers[il].attn_norm, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "attn_norm", il);
- // self-attention
- {
- // rope freq factors for llama3; may return nullptr for llama2 and other models
- ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
- // compute Q and K and RoPE them
- ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
- cb(Qcur, "Qcur", il);
- if (model.layers[il].bq) { // Q bias is optional
- Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
- cb(Qcur, "Qcur", il);
- }
- ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
- cb(Kcur, "Kcur", il);
- if (model.layers[il].bk) { // K bias is optional
- Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
- cb(Kcur, "Kcur", il);
- }
- ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
- cb(Vcur, "Vcur", il);
- if (model.layers[il].bv) { // V bias is optional
- Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
- cb(Vcur, "Vcur", il);
- }
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); // split into heads: {n_embd_head, n_head, n_tokens}
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); // {n_embd_head, n_head_kv, n_tokens}
- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); // {n_embd_head, n_head_kv, n_tokens}
- Qcur = ggml_rope_ext( // rotary embedding on Q, passing the (possibly null) rope_factors tensor
- ctx0, Qcur, inp_pos, rope_factors,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
- Kcur = ggml_rope_ext( // same rotary parameters for K
- ctx0, Kcur, inp_pos, rope_factors,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
- cb(Qcur, "Qcur", il);
- cb(Kcur, "Kcur", il);
- cb(Vcur, "Vcur", il);
- cur = build_attn(inp_attn, gf,
- model.layers[il].wo, model.layers[il].bo,
- Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
- }
- if (il == n_layer - 1) { // last layer only
- // skip computing output for unused tokens
- ggml_tensor * inp_out_ids = build_inp_out_ids();
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
- inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); // filter the residual with the same row ids so the add below lines up
- }
- ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); // first residual connection
- cb(ffn_inp, "ffn_inp", il);
- cur = build_norm(ffn_inp,
- model.layers[il].ffn_norm, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "ffn_norm", il);
- if ((uint32_t) il < hparams.n_layer_dense_lead) { // the first n_layer_dense_lead layers use a plain dense FFN
- cur = build_ffn(cur,
- model.layers[il].ffn_up, NULL, NULL,
- model.layers[il].ffn_gate, NULL, NULL,
- model.layers[il].ffn_down, NULL, NULL,
- NULL,
- LLM_FFN_SILU, LLM_FFN_PAR, il);
- cb(cur, "ffn_out", il);
- } else {
- // MoE branch
- ggml_tensor * moe_out =
- build_moe_ffn(cur, // NOTE(review): the trailing bool/float arguments are positional options of build_moe_ffn - confirm their meaning there
- model.layers[il].ffn_gate_inp,
- model.layers[il].ffn_up_exps,
- model.layers[il].ffn_gate_exps,
- model.layers[il].ffn_down_exps,
- nullptr,
- n_expert, n_expert_used,
- LLM_FFN_SILU, false,
- false, hparams.expert_weights_scale,
- LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
- il);
- cb(moe_out, "ffn_moe_out", il);
- // FFN shared expert
- {
- ggml_tensor * ffn_shexp = build_ffn(cur, // shared expert reads the same normalized input as the routed experts
- model.layers[il].ffn_up_shexp, NULL, NULL,
- model.layers[il].ffn_gate_shexp, NULL, NULL,
- model.layers[il].ffn_down_shexp, NULL, NULL,
- NULL,
- LLM_FFN_SILU, LLM_FFN_PAR, il);
- cb(ffn_shexp, "ffn_shexp", il);
- cur = ggml_add(ctx0, moe_out, ffn_shexp); // routed experts + shared expert
- cb(cur, "ffn_out", il);
- }
- }
- cur = ggml_add(ctx0, cur, ffn_inp); // second residual connection
- cur = build_cvec(cur, il);
- cb(cur, "l_out", il);
- // input for next layer
- inpL = cur;
- }
- cur = inpL;
- cur = build_norm(cur, // final RMS norm with model.output_norm
- model.output_norm, NULL,
- LLM_NORM_RMS, -1);
- cb(cur, "result_norm", -1);
- res->t_embd = cur; // published as the embedding result (taken before the lm_head projection)
- // lm_head
- cur = build_lora_mm(model.output, cur);
- cb(cur, "result_output", -1);
- res->t_logits = cur; // published as the logits result
- ggml_build_forward_expand(gf, cur); // add the final tensor and everything it depends on to gf
- }
- };
- struct llm_build_deepseek2 : public llm_graph_context { // graph builder for the DeepSeek2 architecture (low-rank compressed KV, with or without the MLA absorption path); all work happens in the constructor
- llm_build_deepseek2(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { // model: weights wired into the graph; params: forwarded to the base context; gf: graph expanded at the end
- bool is_lite = (hparams.n_layer == 27); // NOTE(review): the "lite" variant is detected purely by the layer count being 27
- const bool is_mla = (hparams.n_embd_head_k_mla != 0 && hparams.n_embd_head_v_mla != 0); // MLA path is taken only when both MLA head sizes are set
- // note: these are the actual head sizes you get when treating as MHA or after "decompression" using wv_b for MLA
- const int64_t n_embd_head_k = is_mla ? hparams.n_embd_head_k_mla : hparams.n_embd_head_k;
- const int64_t n_embd_head_v = is_mla ? hparams.n_embd_head_v_mla : hparams.n_embd_head_v;
- const int64_t n_embd_head_qk_rope = hparams.n_rot; // part of each Q/K head that receives RoPE
- const int64_t n_embd_head_qk_nope = n_embd_head_k - n_embd_head_qk_rope; // remaining part of each Q/K head, left unrotated
- const uint32_t kv_lora_rank = hparams.n_lora_kv; // width of the compressed KV representation
- // We have to pre-scale kq_scale and attn_factor to make the YaRN RoPE work correctly.
- // See https://github.com/ggerganov/llama.cpp/discussions/7416 for detailed explanation.
- const float mscale = attn_factor * (1.0f + hparams.rope_yarn_log_mul * logf(1.0f / freq_scale)); // reads the attn_factor visible before the local below is declared (presumably the inherited one - confirm)
- const float kq_scale = 1.0f*mscale*mscale/sqrtf(float(n_embd_head_k));
- const float attn_factor = 1.0f / (1.0f + 0.1f * logf(1.0f / freq_scale)); // local that shadows the outer attn_factor from here on; the ggml_rope_ext calls below use this value
- ggml_tensor * cur;
- ggml_tensor * inpL;
- // {n_embd, n_tokens}
- inpL = build_inp_embd(model.tok_embd);
- // inp_pos - contains the positions
- ggml_tensor * inp_pos = build_inp_pos();
- auto * inp_attn = build_attn_inp_kv_unified(); // attention input object reused by every layer's build_attn call
- for (int il = 0; il < n_layer; ++il) {
- ggml_tensor * inpSA = inpL; // layer input, kept for the residual add after attention
- // norm
- cur = build_norm(inpL,
- model.layers[il].attn_norm, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "attn_norm", il);
- // self_attention
- {
- ggml_tensor * q = NULL;
- if (!is_lite) { // non-lite: Q is produced through a two-stage projection (wq_a -> RMS norm -> wq_b)
- q = ggml_mul_mat(ctx0, model.layers[il].wq_a, cur);
- cb(q, "q", il);
- q = build_norm(q,
- model.layers[il].attn_q_a_norm, nullptr,
- LLM_NORM_RMS, il);
- cb(q, "q", il);
- q = ggml_mul_mat(ctx0, model.layers[il].wq_b, q);
- cb(q, "q", il);
- } else { // lite: a single wq projection
- q = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
- cb(q, "q", il);
- }
- // split into {n_embd_head_qk_nope, n_head, n_tokens}
- ggml_tensor * q_nope = ggml_view_3d(ctx0, q,
- n_embd_head_qk_nope, n_head, n_tokens,
- ggml_row_size(q->type, n_embd_head_k),
- ggml_row_size(q->type, n_embd_head_k) * n_head,
- 0);
- cb(q_nope, "q_nope", il);
- // and {n_embd_head_qk_rope, n_head, n_tokens}
- ggml_tensor * q_pe = ggml_view_3d(ctx0, q,
- n_embd_head_qk_rope, n_head, n_tokens,
- ggml_row_size(q->type, n_embd_head_k),
- ggml_row_size(q->type, n_embd_head_k) * n_head,
- ggml_row_size(q->type, n_embd_head_qk_nope)); // the rope part starts right after the nope part inside each head
- cb(q_pe, "q_pe", il);
- ggml_tensor * kv_cmpr_pe = ggml_mul_mat(ctx0, model.layers[il].wkv_a_mqa, cur); // one projection yielding the compressed KV followed by the K rope part
- cb(kv_cmpr_pe, "kv_cmpr_pe", il);
- // split into {kv_lora_rank, n_tokens}
- ggml_tensor * kv_cmpr = ggml_view_2d(ctx0, kv_cmpr_pe,
- kv_lora_rank, n_tokens,
- ggml_row_size(kv_cmpr_pe->type, kv_lora_rank + n_embd_head_qk_rope),
- 0);
- cb(kv_cmpr, "kv_cmpr", il);
- // and {n_embd_head_qk_rope, 1, n_tokens}
- ggml_tensor * k_pe = ggml_view_3d(ctx0, kv_cmpr_pe,
- n_embd_head_qk_rope, 1, n_tokens,
- ggml_row_size(kv_cmpr_pe->type, kv_lora_rank + n_embd_head_qk_rope),
- ggml_row_size(kv_cmpr_pe->type, kv_lora_rank + n_embd_head_qk_rope),
- ggml_row_size(kv_cmpr_pe->type, kv_lora_rank)); // the K rope part starts right after the compressed KV
- cb(k_pe, "k_pe", il);
- q_pe = ggml_rope_ext(ctx0, q_pe, inp_pos, nullptr, // RoPE is applied only to the rope slices of Q and K
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
- cb(q_pe, "q_pe", il);
- k_pe = ggml_rope_ext(ctx0, k_pe, inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
- cb(k_pe, "k_pe", il);
- kv_cmpr = build_norm(kv_cmpr, // RMS norm on the compressed KV only
- model.layers[il].attn_kv_a_norm, nullptr,
- LLM_NORM_RMS, il);
- cb(kv_cmpr, "kv_cmpr", il);
- if (is_mla) {
- // {n_embd_head_qk_nope, n_tokens, n_head}
- q_nope = ggml_permute(ctx0, q_nope, 0, 2, 1, 3);
- cb(q_nope, "q_nope_perm", il);
- // {n_embd_head_qk_nope, kv_lora_rank, n_head} x {n_embd_head_qk_nope, n_tokens, n_head}
- ggml_tensor * q_nope_absorbed = ggml_mul_mat(ctx0, model.layers[il].wk_b, q_nope);
- cb(q_nope_absorbed, "q_nope_absorbed", il);
- // {kv_lora_rank, n_head, n_tokens}
- q_nope_absorbed = ggml_permute(ctx0, q_nope_absorbed, 0, 2, 1, 3);
- cb(q_nope_absorbed, "q_nope_absorbed_perm", il);
- // {n_embd_head_qk_rope + kv_lora_rank, n_head, n_tokens}
- // note: rope must go first for in-place context shifting in build_rope_shift()
- ggml_tensor * Qcur = ggml_concat(ctx0, q_pe, q_nope_absorbed, 0);
- cb(Qcur, "Qcur", il);
- kv_cmpr = ggml_reshape_3d(ctx0, kv_cmpr, kv_lora_rank, 1, n_tokens);
- cb(kv_cmpr, "kv_cmpr_reshape", il);
- // {n_embd_head_qk_rope + kv_lora_rank, 1, n_tokens}
- ggml_tensor * Kcur = ggml_concat(ctx0, k_pe, kv_cmpr, 0);
- cb(Kcur, "Kcur", il);
- // {kv_lora_rank, 1, n_tokens}
- ggml_tensor * Vcur = kv_cmpr;
- cb(Vcur, "Vcur", il);
- // note: MLA with the absorption optimization converts into MQA (ie: GQA with 1 group)
- cur = build_attn(inp_attn, gf,
- model.layers[il].wo, NULL,
- Qcur, Kcur, Vcur, nullptr, model.layers[il].wv_b, kq_scale, il); // wv_b is handed to build_attn in this path (the other path passes nullptr)
- } else {
- ggml_tensor * kv = ggml_mul_mat(ctx0, model.layers[il].wkv_b, kv_cmpr); // expand the compressed KV into per-head K(nope) and V
- cb(kv, "kv", il);
- // split into {n_embd_head_qk_nope, n_head, n_tokens}
- ggml_tensor * k_nope = ggml_view_3d(ctx0, kv,
- n_embd_head_qk_nope, n_head, n_tokens,
- ggml_row_size(kv->type, n_embd_head_qk_nope + n_embd_head_v),
- ggml_row_size(kv->type, n_embd_head_qk_nope + n_embd_head_v) * n_head,
- 0);
- cb(k_nope, "k_nope_view", il);
- // and {n_embd_head_v, n_head, n_tokens}
- ggml_tensor * Vcur = ggml_view_3d(ctx0, kv,
- n_embd_head_v, n_head, n_tokens,
- ggml_row_size(kv->type, n_embd_head_qk_nope + n_embd_head_v),
- ggml_row_size(kv->type, n_embd_head_qk_nope + n_embd_head_v) * n_head,
- ggml_row_size(kv->type, n_embd_head_qk_nope)); // V starts right after the K nope part inside each head
- cb(Vcur, "Vcur_view", il);
- Vcur = ggml_cont(ctx0, Vcur); // make the strided V view contiguous
- cb(Vcur, "Vcur_cont", il);
- // note: rope must go first for in-place context shifting in build_rope_shift()
- ggml_tensor * Qcur = ggml_concat(ctx0, q_pe, q_nope, 0);
- cb(Qcur, "Qcur", il);
- ggml_tensor * Kcur = ggml_concat(ctx0, ggml_repeat(ctx0, k_pe, q_pe), k_nope, 0); // the single-head k_pe is repeated to the shape of q_pe before concatenation
- cb(Kcur, "Kcur", il);
- // note: MLA without the absorption optimization converts into MHA (ie: GQA with full n_head groups)
- cur = build_attn(inp_attn, gf,
- model.layers[il].wo, NULL,
- Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
- }
- }
- if (il == n_layer - 1) { // last layer only
- // skip computing output for unused tokens
- ggml_tensor * inp_out_ids = build_inp_out_ids();
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
- inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); // filter the residual with the same row ids so the add below lines up
- }
- ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); // first residual connection
- cb(ffn_inp, "ffn_inp", il);
- cur = build_norm(ffn_inp,
- model.layers[il].ffn_norm, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "ffn_norm", il);
- if ((uint32_t) il < hparams.n_layer_dense_lead) { // the first n_layer_dense_lead layers use a plain dense FFN
- cur = build_ffn(cur,
- model.layers[il].ffn_up, NULL, NULL,
- model.layers[il].ffn_gate, NULL, NULL,
- model.layers[il].ffn_down, NULL, NULL,
- NULL,
- LLM_FFN_SILU, LLM_FFN_PAR, il);
- cb(cur, "ffn_out", il);
- } else {
- // MoE branch
- ggml_tensor * moe_out =
- build_moe_ffn(cur, // routing options (bias tensor, weight norm, scale, gating function) all come from the model / hparams here
- model.layers[il].ffn_gate_inp,
- model.layers[il].ffn_up_exps,
- model.layers[il].ffn_gate_exps,
- model.layers[il].ffn_down_exps,
- model.layers[il].ffn_exp_probs_b,
- n_expert, n_expert_used,
- LLM_FFN_SILU, hparams.expert_weights_norm,
- true, hparams.expert_weights_scale,
- (llama_expert_gating_func_type) hparams.expert_gating_func,
- il);
- cb(moe_out, "ffn_moe_out", il);
- // FFN shared expert
- {
- ggml_tensor * ffn_shexp = build_ffn(cur, // shared expert reads the same normalized input as the routed experts
- model.layers[il].ffn_up_shexp, NULL, NULL,
- model.layers[il].ffn_gate_shexp, NULL, NULL,
- model.layers[il].ffn_down_shexp, NULL, NULL,
- NULL,
- LLM_FFN_SILU, LLM_FFN_PAR, il);
- cb(ffn_shexp, "ffn_shexp", il);
- cur = ggml_add(ctx0, moe_out, ffn_shexp); // routed experts + shared expert
- cb(cur, "ffn_out", il);
- }
- }
- cur = ggml_add(ctx0, cur, ffn_inp); // second residual connection
- cur = build_cvec(cur, il);
- cb(cur, "l_out", il);
- // input for next layer
- inpL = cur;
- }
- cur = inpL;
- cur = build_norm(cur, // final RMS norm with model.output_norm
- model.output_norm, NULL,
- LLM_NORM_RMS, -1);
- cb(cur, "result_norm", -1);
- res->t_embd = cur; // published as the embedding result (taken before the lm_head projection)
- // lm_head
- cur = ggml_mul_mat(ctx0, model.output, cur); // NOTE(review): plain ggml_mul_mat here and for the projections above, whereas other builders in this file use build_lora_mm - confirm this is intended
- cb(cur, "result_output", -1);
- res->t_logits = cur; // published as the logits result
- ggml_build_forward_expand(gf, cur); // add the final tensor and everything it depends on to gf
- }
- };
- struct llm_build_bitnet : public llm_graph_context { // graph builder for the BitNet architecture (optional per-weight scale tensors and extra sub-norms before the output/down projections); all work happens in the constructor
- llm_build_bitnet(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { // model: weights wired into the graph; params: forwarded to the base context; gf: graph expanded at the end
- const int64_t n_embd_head = hparams.n_embd_head_v; // per-head size, read from the V head size
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); // K and V heads must have the same size
- ggml_tensor * cur;
- ggml_tensor * inpL;
- inpL = build_inp_embd(model.tok_embd); // input of the first layer, built from the token embedding tensor
- // inp_pos - contains the positions
- ggml_tensor * inp_pos = build_inp_pos();
- auto * inp_attn = build_attn_inp_kv_unified(); // attention input object reused by every layer's build_attn call
- for (int il = 0; il < n_layer; ++il) {
- ggml_tensor * inpSA = inpL; // layer input, kept for the residual add after attention
- cur = build_norm(inpL,
- model.layers[il].attn_norm, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "attn_norm", il);
- // self-attention
- {
- // compute Q and K and RoPE them
- ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
- if (model.layers[il].wq_scale) { // optional scale tensor multiplied onto the projection result
- Qcur = ggml_mul(ctx0, Qcur, model.layers[il].wq_scale);
- }
- cb(Qcur, "Qcur", il);
- if (model.layers[il].bq) { // optional bias
- Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
- cb(Qcur, "Qcur", il);
- }
- // B1.K
- ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
- if (model.layers[il].wk_scale) { // optional scale tensor
- Kcur = ggml_mul(ctx0, Kcur, model.layers[il].wk_scale);
- }
- cb(Kcur, "Kcur", il);
- if (model.layers[il].bk) { // optional bias
- Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
- cb(Kcur, "Kcur", il);
- }
- // B1.V
- ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
- if (model.layers[il].wv_scale) { // optional scale tensor
- Vcur = ggml_mul(ctx0, Vcur, model.layers[il].wv_scale);
- }
- cb(Vcur, "Vcur", il);
- if (model.layers[il].bv) { // optional bias
- Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
- cb(Vcur, "Vcur", il);
- }
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); // split into heads: {n_embd_head, n_head, n_tokens}
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); // {n_embd_head, n_head_kv, n_tokens}
- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); // {n_embd_head, n_head_kv, n_tokens}
- Qcur = ggml_rope_ext( // rotary embedding on Q (no frequency-factor tensor is passed)
- ctx0, Qcur, inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
- Kcur = ggml_rope_ext( // same rotary parameters for K
- ctx0, Kcur, inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
- cb(Qcur, "Qcur", il);
- cb(Kcur, "Kcur", il);
- cb(Vcur, "Vcur", il);
- cur = build_attn(inp_attn, gf,
- NULL, NULL, // no output weight/bias is passed: the wo projection is applied manually below, after attn_sub_norm
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
- cur = build_norm(cur,
- model.layers[il].attn_sub_norm, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "attn_sub_norm", il);
- cur = build_lora_mm(model.layers[il].wo, cur);
- if (model.layers[il].wo_scale) { // optional scale tensor
- cur = ggml_mul(ctx0, cur, model.layers[il].wo_scale);
- }
- if (model.layers[il].bo) { // optional bias
- cur = ggml_add(ctx0, cur, model.layers[il].bo);
- }
- cb(cur, "attn_o_out", il);
- }
- if (il == n_layer - 1) { // last layer only
- // skip computing output for unused tokens
- ggml_tensor * inp_out_ids = build_inp_out_ids();
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
- inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); // filter the residual with the same row ids so the add below lines up
- }
- ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); // first residual connection
- cb(ffn_inp, "ffn_inp", il);
- // feed-forward network
- cur = build_norm(ffn_inp,
- model.layers[il].ffn_norm, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "ffn_norm", il);
- cur = build_ffn(cur, // only up and gate (with their scale tensors) are passed; the down slot is NULL because ffn_down is applied manually below
- model.layers[il].ffn_up, NULL, model.layers[il].ffn_up_scale,
- model.layers[il].ffn_gate, NULL, model.layers[il].ffn_gate_scale,
- NULL, NULL, NULL,
- NULL,
- LLM_FFN_SILU, LLM_FFN_PAR, il);
- cb(cur, "ffn_sub_out", il);
- cur = build_norm(cur,
- model.layers[il].ffn_sub_norm, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "ffn_sub_norm", il);
- cur = build_lora_mm(model.layers[il].ffn_down, cur);
- if (model.layers[il].ffn_down_scale) { // optional scale tensor
- cur = ggml_mul(ctx0, cur, model.layers[il].ffn_down_scale);
- }
- cb(cur, "ffn_down", il);
- cur = ggml_add(ctx0, cur, ffn_inp); // second residual connection; NOTE(review): no build_cvec call here, unlike other builders in this file - confirm this is intended
- cb(cur, "l_out", il);
- // input for next layer
- inpL = cur;
- }
- cur = inpL;
- cur = build_norm(cur, // final RMS norm with model.output_norm
- model.output_norm, NULL,
- LLM_NORM_RMS, -1);
- cb(cur, "result_norm", -1);
- res->t_embd = cur; // published as the embedding result (taken before the lm_head projection)
- // lm_head
- // FIXME: do not use model.tok_embd directly, duplicate as model.output
- cur = build_lora_mm(model.tok_embd, cur); // the token embedding tensor is reused as the output projection
- cb(cur, "result_output", -1);
- res->t_logits = cur; // published as the logits result
- ggml_build_forward_expand(gf, cur); // add the final tensor and everything it depends on to gf
- }
- };
- struct llm_build_t5_enc : public llm_graph_context {
- llm_build_t5_enc(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- ggml_tensor * cur;
- ggml_tensor * inpL;
- inpL = build_inp_embd(model.tok_embd);
- ggml_tensor * pos_bucket_enc = build_inp_pos_bucket_enc();
- auto * inp_attn = build_attn_inp_no_cache();
- for (int il = 0; il < n_layer; ++il) {
- ggml_tensor * inpSA = inpL;
- // norm
- cur = build_norm(inpL,
- model.layers[il].attn_norm_enc, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "attn_norm", il);
- // self-attention
- {
- ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq_enc, cur);
- cb(Qcur, "Qcur", il);
- ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk_enc, cur);
- cb(Kcur, "Kcur", il);
- ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv_enc, cur);
- cb(Vcur, "Vcur", il);
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
- ggml_tensor * attn_rel_b = model.layers[il].attn_rel_b_enc ? model.layers[il].attn_rel_b_enc : model.layers[0].attn_rel_b_enc;
- ggml_tensor * kq_b = build_pos_bias(pos_bucket_enc, attn_rel_b);
- cur = build_attn(inp_attn, gf,
- model.layers[il].wo_enc, nullptr,
- Qcur, Kcur, Vcur, kq_b, nullptr, 1.0f, il);
- cb(cur, "kqv_out", il);
- }
- if (il == n_layer - 1) {
- // skip computing output for unused tokens
- ggml_tensor * inp_out_ids = build_inp_out_ids();
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
- inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
- }
- ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
- cb(ffn_inp, "ffn_inp", il);
- // feed-forward network
- {
- cur = build_norm(ffn_inp,
- model.layers[il].ffn_norm_enc, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "ffn_norm", il);
- // T5 uses relu, flan-T5 uses gelu-gated
- cur = build_ffn(cur,
- model.layers[il].ffn_up_enc, NULL, NULL,
- model.layers[il].ffn_gate_enc, NULL, NULL,
- model.layers[il].ffn_down_enc, NULL, NULL,
- NULL,
- model.layers[il].ffn_gate_enc ? LLM_FFN_GELU : LLM_FFN_RELU,
- model.layers[il].ffn_gate_enc ? LLM_FFN_PAR : LLM_FFN_SEQ,
- il);
- cb(cur, "ffn_out", il);
- }
- cur = ggml_add(ctx0, cur, ffn_inp);
- cb(cur, "ffn_out", il);
- cur = build_cvec(cur, il);
- cb(cur, "l_out", il);
- // input for next layer
- inpL = cur;
- }
- cur = inpL;
- cb(cur, "result_embd", -1);
- cur = build_norm(cur,
- model.output_norm_enc, NULL,
- LLM_NORM_RMS, -1);
- cb(cur, "result_norm", -1);
- res->t_embd = cur;
- ggml_build_forward_expand(gf, cur);
- }
- };
- struct llm_build_t5_dec : public llm_graph_context {
- llm_build_t5_dec(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
- //const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- ggml_tensor * cur;
- ggml_tensor * inpL;
- inpL = build_inp_embd(model.tok_embd);
- ggml_tensor * embd_enc = build_inp_cross_embd();
- ggml_tensor * pos_bucket_dec = build_inp_pos_bucket_dec();
- const int64_t n_outputs_enc = embd_enc->ne[1];
- auto * inp_attn_self = build_attn_inp_kv_unified();
- auto * inp_attn_cross = build_attn_inp_cross();
- for (int il = 0; il < n_layer; ++il) {
- ggml_tensor * inpSA = inpL;
- // norm
- cur = build_norm(inpL,
- model.layers[il].attn_norm, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "attn_norm", il);
- // self-attention
- {
- ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
- cb(Qcur, "Qcur", il);
- ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
- cb(Kcur, "Kcur", il);
- ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
- cb(Vcur, "Vcur", il);
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
- ggml_tensor * attn_rel_b = model.layers[il].attn_rel_b ? model.layers[il].attn_rel_b : model.layers[0].attn_rel_b;
- ggml_tensor * kq_b = build_pos_bias(pos_bucket_dec, attn_rel_b);
- cur = build_attn(inp_attn_self, gf,
- model.layers[il].wo, model.layers[il].bo,
- Qcur, Kcur, Vcur, kq_b, nullptr, 1.0f, il);
- cb(cur, "kqv_out", il);
- }
- cur = ggml_add(ctx0, cur, inpSA);
- cb(cur, "cross_inp", il);
- ggml_tensor * inpCA = cur;
- // norm
- cur = build_norm(cur,
- model.layers[il].attn_norm_cross, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "attn_norm_cross", il);
- // cross-attention
- {
- ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq_cross, cur);
- cb(Qcur, "Qcur", il);
- ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk_cross, embd_enc);
- cb(Kcur, "Kcur", il);
- ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv_cross, embd_enc);
- cb(Vcur, "Vcur", il);
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_outputs_enc);
- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_outputs_enc);
- cur = build_attn(inp_attn_cross, gf,
- model.layers[il].wo_cross, nullptr,
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
- cb(cur, "kqv_out", il);
- //ggml_tensor * q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
- //ggml_tensor * k = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 0, 2, 1, 3));
- //ggml_tensor * kq = ggml_mul_mat(ctx0, k, q);
- //cb(kq, "kq", il);
- //kq = ggml_soft_max_ext(ctx0, kq, KQ_mask_cross, 1.0f, hparams.f_max_alibi_bias);
- //cb(kq, "kq_soft_max_ext", il);
- //ggml_tensor * v = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd_gqa, n_outputs_enc)));
- //cb(v, "v", il);
- //ggml_tensor * kqv = ggml_mul_mat(ctx0, ggml_reshape_3d(ctx0, v, n_outputs_enc, n_embd_head, n_head_kv), kq);
- //cb(kqv, "kqv", il);
- //ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3);
- //cb(kqv_merged, "kqv_merged", il);
- //cur = ggml_cont_2d(ctx0, kqv_merged, n_embd_gqa, n_tokens);
- //cb(cur, "kqv_merged_cont", il);
- //ggml_build_forward_expand(gf, cur);
- //cur = build_lora_mm(model.layers[il].wo_cross, cur);
- //cb(cur, "kqv_out", il);
- }
- if (il == n_layer - 1) {
- // skip computing output for unused tokens
- ggml_tensor * inp_out_ids = build_inp_out_ids();
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
- inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
- inpCA = ggml_get_rows(ctx0, inpCA, inp_out_ids);
- }
- ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpCA);
- cb(ffn_inp, "ffn_inp", il);
- // feed-forward network
- {
- cur = build_norm(ffn_inp,
- model.layers[il].ffn_norm, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "ffn_norm", il);
- // T5 uses relu, flan-T5 uses gelu-gated
- cur = build_ffn(cur,
- model.layers[il].ffn_up, NULL, NULL,
- model.layers[il].ffn_gate, NULL, NULL,
- model.layers[il].ffn_down, NULL, NULL,
- NULL,
- model.layers[il].ffn_gate_enc ? LLM_FFN_GELU : LLM_FFN_RELU,
- model.layers[il].ffn_gate_enc ? LLM_FFN_PAR : LLM_FFN_SEQ,
- il);
- cb(cur, "ffn_out", il);
- }
- cur = ggml_add(ctx0, cur, ffn_inp);
- cb(cur, "ffn_out", il);
- cur = build_cvec(cur, il);
- cb(cur, "l_out", il);
- // input for next layer
- inpL = cur;
- }
- cur = inpL;
- cb(cur, "result_embd", -1);
- cur = build_norm(cur,
- model.output_norm, NULL,
- LLM_NORM_RMS, -1);
- cb(cur, "result_norm", -1);
- res->t_embd = cur;
- // lm_head
- cur = build_lora_mm(model.output, cur);
- cb(cur, "result_output", -1);
- res->t_logits = cur;
- ggml_build_forward_expand(gf, cur);
- }
- };
- struct llm_build_jais : public llm_graph_context {
- llm_build_jais(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
- const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- ggml_tensor * cur;
- ggml_tensor * inpL;
- inpL = build_inp_embd(model.tok_embd);
- auto * inp_attn = build_attn_inp_kv_unified();
- for (int il = 0; il < n_layer; ++il) {
- cur = build_norm(inpL,
- model.layers[il].attn_norm,
- model.layers[il].attn_norm_b,
- LLM_NORM, il);
- cb(cur, "attn_norm", il);
- // self-attention
- {
- cur = build_lora_mm(model.layers[il].wqkv, cur);
- cb(cur, "wqkv", il);
- cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
- cb(cur, "bqkv", il);
- ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*cur->nb[0]*(n_embd)));
- ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*cur->nb[0]*(n_embd)));
- ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*cur->nb[0]*(n_embd + n_embd_gqa)));
- cb(Qcur, "Qcur", il);
- cb(Kcur, "Kcur", il);
- cb(Vcur, "Vcur", il);
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
- cur = build_attn(inp_attn, gf,
- model.layers[il].wo, model.layers[il].bo,
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/float(n_embd_head), il);
- }
- if (il == n_layer - 1) {
- // skip computing output for unused tokens
- ggml_tensor * inp_out_ids = build_inp_out_ids();
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
- inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
- }
- // add the input
- ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
- cb(ffn_inp, "ffn_inp", il);
- // FF
- {
- cur = build_norm(ffn_inp,
- model.layers[il].ffn_norm,
- model.layers[il].ffn_norm_b,
- LLM_NORM, il);
- cb(cur, "ffn_norm", il);
- cur = build_ffn(cur,
- model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
- model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
- model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
- NULL,
- LLM_FFN_SILU, LLM_FFN_PAR, il);
- cb(cur, "ffn_out", il);
- }
- inpL = ggml_add(ctx0, cur, ffn_inp);
- cb(inpL, "l_out", il);
- }
- cur = build_norm(inpL,
- model.output_norm,
- model.output_norm_b,
- LLM_NORM, -1);
- cb(cur, "result_norm", -1);
- res->t_embd = cur;
- cur = build_lora_mm(model.output, cur);
- cb(cur, "result_output", -1);
- res->t_logits = cur;
- ggml_build_forward_expand(gf, cur);
- }
- };
- struct llm_build_chatglm : public llm_graph_context {
- llm_build_chatglm(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
- const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- ggml_tensor * cur;
- ggml_tensor * inpL;
- inpL = build_inp_embd(model.tok_embd);
- // inp_pos - contains the positions
- ggml_tensor * inp_pos = build_inp_pos();
- auto * inp_attn = build_attn_inp_kv_unified();
- for (int il = 0; il < n_layer; ++il) {
- ggml_tensor * inpSA = inpL;
- cur = build_norm(inpL,
- model.layers[il].attn_norm,
- NULL,
- LLM_NORM_RMS, il);
- cb(cur, "attn_norm", il);
- // self-attention
- {
- ggml_tensor * Qcur = nullptr;
- ggml_tensor * Kcur = nullptr;
- ggml_tensor * Vcur = nullptr;
- if (model.layers[il].wqkv == nullptr) {
- Qcur = build_lora_mm(model.layers[il].wq, cur);
- if (model.layers[il].bq) {
- Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
- }
- Kcur = build_lora_mm(model.layers[il].wk, cur);
- if (model.layers[il].bk) {
- Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
- }
- Vcur = build_lora_mm(model.layers[il].wv, cur);
- if (model.layers[il].bv) {
- Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
- }
- } else {
- cur = build_lora_mm(model.layers[il].wqkv, cur);
- cb(cur, "wqkv", il);
- if (model.layers[il].bqkv) {
- cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
- cb(cur, "bqkv", il);
- }
- Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
- Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
- Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
- }
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
- //printf("freq_base: %f freq_scale: %f ext_factor: %f attn_factor: %f\n", freq_base, freq_scale, ext_factor, attn_factor);
- Qcur = ggml_rope_ext(
- ctx0, Qcur, inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
- Kcur = ggml_rope_ext(
- ctx0, Kcur, inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
- cb(Qcur, "Qcur", il);
- cb(Kcur, "Kcur", il);
- cb(Vcur, "Vcur", il);
- cur = build_attn(inp_attn, gf,
- model.layers[il].wo, NULL,
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
- }
- if (il == n_layer - 1) {
- // skip computing output for unused tokens
- ggml_tensor * inp_out_ids = build_inp_out_ids();
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
- inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
- }
- // Add the input
- ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
- cb(ffn_inp, "ffn_inp", il);
- // FF
- {
- cur = build_norm(ffn_inp,
- model.layers[il].ffn_norm,
- NULL,
- LLM_NORM_RMS, il);
- cb(cur, "ffn_norm", il);
- cur = build_ffn(cur,
- model.layers[il].ffn_up, NULL, NULL,
- NULL, NULL, NULL,
- model.layers[il].ffn_down, NULL, NULL,
- NULL,
- LLM_FFN_SWIGLU, LLM_FFN_SEQ, il);
- cb(cur, "ffn_out", il);
- }
- inpL = ggml_add(ctx0, cur, ffn_inp);
- cb(inpL, "l_out", il);
- }
- cur = build_norm(inpL,
- model.output_norm,
- NULL,
- LLM_NORM_RMS, -1);
- cb(cur, "result_norm", -1);
- res->t_embd = cur;
- cur = build_lora_mm(model.output, cur);
- cb(cur, "result_output", -1);
- res->t_logits = cur;
- ggml_build_forward_expand(gf, cur);
- }
- };
- struct llm_build_glm4 : public llm_graph_context {
- llm_build_glm4(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
- const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- ggml_tensor * cur;
- ggml_tensor * inpL;
- inpL = build_inp_embd(model.tok_embd);
- // inp_pos - contains the positions
- ggml_tensor * inp_pos = build_inp_pos();
- auto * inp_attn = build_attn_inp_kv_unified();
- for (int il = 0; il < n_layer; ++il) {
- ggml_tensor * inpSA = inpL;
- // Pre-attention norm
- cur = build_norm(inpL,
- model.layers[il].attn_norm,
- NULL,
- LLM_NORM_RMS, il);
- cb(cur, "attn_norm", il);
- // self-attention
- {
- ggml_tensor * Qcur = nullptr;
- ggml_tensor * Kcur = nullptr;
- ggml_tensor * Vcur = nullptr;
- if (model.layers[il].wqkv == nullptr) {
- Qcur = build_lora_mm(model.layers[il].wq, cur);
- if (model.layers[il].bq) {
- Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
- }
- Kcur = build_lora_mm(model.layers[il].wk, cur);
- if (model.layers[il].bk) {
- Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
- }
- Vcur = build_lora_mm(model.layers[il].wv, cur);
- if (model.layers[il].bv) {
- Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
- }
- } else {
- cur = build_lora_mm(model.layers[il].wqkv, cur);
- cb(cur, "wqkv", il);
- if (model.layers[il].bqkv) {
- cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
- cb(cur, "bqkv", il);
- }
- Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
- Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
- Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
- }
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
- Qcur = ggml_rope_ext(
- ctx0, Qcur, inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
- Kcur = ggml_rope_ext(
- ctx0, Kcur, inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
- cb(Qcur, "Qcur", il);
- cb(Kcur, "Kcur", il);
- cb(Vcur, "Vcur", il);
- cur = build_attn(inp_attn, gf,
- model.layers[il].wo, NULL,
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
- }
- if (il == n_layer - 1) {
- // skip computing output for unused tokens
- ggml_tensor * inp_out_ids = build_inp_out_ids();
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
- inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
- }
- // Post-attention norm (new!)
- cur = build_norm(cur,
- model.layers[il].attn_post_norm,
- NULL,
- LLM_NORM_RMS, il);
- cb(cur, "post_attn_norm", il);
- // Add the input (residual connection after post-attention norm)
- ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
- cb(ffn_inp, "ffn_inp", il);
- // FF
- {
- // Pre-MLP norm
- cur = build_norm(ffn_inp,
- model.layers[il].ffn_norm,
- NULL,
- LLM_NORM_RMS, il);
- cb(cur, "ffn_norm", il);
- // MLP
- cur = build_ffn(cur,
- model.layers[il].ffn_up, NULL, NULL,
- NULL, NULL, NULL,
- model.layers[il].ffn_down, NULL, NULL,
- NULL,
- LLM_FFN_SWIGLU, LLM_FFN_SEQ, il);
- cb(cur, "ffn_out", il);
- // Post-MLP norm
- cur = build_norm(cur,
- model.layers[il].ffn_post_norm,
- NULL,
- LLM_NORM_RMS, il);
- cb(cur, "post_mlp_norm", il);
- }
- // Add residual connection after post-MLP norm
- inpL = ggml_add(ctx0, cur, ffn_inp);
- cb(inpL, "l_out", il);
- }
- // Final norm
- cur = build_norm(inpL,
- model.output_norm,
- NULL,
- LLM_NORM_RMS, -1);
- cb(cur, "result_norm", -1);
- res->t_embd = cur;
- // Output projection
- cur = build_lora_mm(model.output, cur);
- cb(cur, "result_output", -1);
- res->t_logits = cur;
- ggml_build_forward_expand(gf, cur);
- }
- };
- struct llm_build_nemotron : public llm_graph_context {
- llm_build_nemotron(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- //GGML_ASSERT(n_embd_head == hparams.n_rot);
- ggml_tensor * cur;
- ggml_tensor * inpL;
- inpL = build_inp_embd(model.tok_embd);
- // inp_pos - contains the positions
- ggml_tensor * inp_pos = build_inp_pos();
- auto * inp_attn = build_attn_inp_kv_unified();
- for (int il = 0; il < n_layer; ++il) {
- ggml_tensor * inpSA = inpL;
- // norm
- cur = build_norm(inpL,
- model.layers[il].attn_norm,
- model.layers[il].attn_norm_b,
- LLM_NORM, il);
- cb(cur, "attn_norm", il);
- // self-attention
- {
- // compute Q and K and RoPE them
- ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
- cb(Qcur, "Qcur", il);
- if (model.layers[il].bq) {
- Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
- cb(Qcur, "Qcur", il);
- }
- ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
- cb(Kcur, "Kcur", il);
- if (model.layers[il].bk) {
- Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
- cb(Kcur, "Kcur", il);
- }
- ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
- cb(Vcur, "Vcur", il);
- if (model.layers[il].bv) {
- Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
- cb(Vcur, "Vcur", il);
- }
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
- Qcur = ggml_rope_ext(
- ctx0, Qcur, inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
- Kcur = ggml_rope_ext(
- ctx0, Kcur, inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
- cb(Qcur, "Qcur", il);
- cb(Kcur, "Kcur", il);
- cb(Vcur, "Vcur", il);
- cur = build_attn(inp_attn, gf,
- model.layers[il].wo, model.layers[il].bo,
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
- }
- if (il == n_layer - 1) {
- // skip computing output for unused tokens
- ggml_tensor * inp_out_ids = build_inp_out_ids();
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
- inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
- }
- ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
- cb(ffn_inp, "ffn_inp", il);
- // feed-forward network
- cur = build_norm(ffn_inp,
- model.layers[il].ffn_norm,
- model.layers[il].ffn_norm_b,
- LLM_NORM, il);
- cb(cur, "ffn_norm", il);
- cur = build_ffn(cur,
- model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
- NULL, NULL, NULL,
- model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
- NULL,
- LLM_FFN_RELU_SQR, LLM_FFN_SEQ, il);
- cur = ggml_add(ctx0, cur, ffn_inp);
- cb(cur, "ffn_out", il);
- cur = build_cvec(cur, il);
- cb(cur, "l_out", il);
- // input for next layer
- inpL = cur;
- }
- cur = inpL;
- cur = build_norm(cur,
- model.output_norm, model.output_norm_b,
- LLM_NORM, -1);
- cb(cur, "result_norm", -1);
- res->t_embd = cur;
- // lm_head
- cur = build_lora_mm(model.output, cur);
- cb(cur, "result_output", -1);
- res->t_logits = cur;
- ggml_build_forward_expand(gf, cur);
- }
- };
- struct llm_build_exaone : public llm_graph_context {
- llm_build_exaone(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
- ggml_tensor * cur;
- ggml_tensor * inpL;
- inpL = build_inp_embd(model.tok_embd);
- // inp_pos - contains the positions
- ggml_tensor * inp_pos = build_inp_pos();
- auto * inp_attn = build_attn_inp_kv_unified();
- for (int il = 0; il < n_layer; ++il) {
- ggml_tensor * inpSA = inpL;
- // norm
- cur = build_norm(inpL,
- model.layers[il].attn_norm, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "attn_norm", il);
- // self-attention
- {
- // rope freq factors for llama3; may return nullptr for llama2 and other models
- ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
- // compute Q and K and RoPE them
- ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
- cb(Qcur, "Qcur", il);
- if (model.layers[il].bq) {
- Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
- cb(Qcur, "Qcur", il);
- }
- ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
- cb(Kcur, "Kcur", il);
- if (model.layers[il].bk) {
- Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
- cb(Kcur, "Kcur", il);
- }
- ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
- cb(Vcur, "Vcur", il);
- if (model.layers[il].bv) {
- Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
- cb(Vcur, "Vcur", il);
- }
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
- Qcur = ggml_rope_ext(
- ctx0, Qcur, inp_pos, rope_factors,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
- Kcur = ggml_rope_ext(
- ctx0, Kcur, inp_pos, rope_factors,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
- cb(Qcur, "Qcur", il);
- cb(Kcur, "Kcur", il);
- cb(Vcur, "Vcur", il);
- cur = build_attn(inp_attn, gf,
- model.layers[il].wo, model.layers[il].bo,
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
- }
- if (il == n_layer - 1) {
- // skip computing output for unused tokens
- ggml_tensor * inp_out_ids = build_inp_out_ids();
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
- inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
- }
- ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
- cb(ffn_inp, "ffn_inp", il);
- // feed-forward network
- cur = build_norm(ffn_inp,
- model.layers[il].ffn_norm, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "ffn_norm", il);
- cur = build_ffn(cur,
- model.layers[il].ffn_up, NULL, NULL,
- model.layers[il].ffn_gate, NULL, NULL,
- model.layers[il].ffn_down, NULL, NULL,
- NULL,
- LLM_FFN_SILU, LLM_FFN_PAR, il);
- cb(cur, "ffn_out", il);
- cur = ggml_add(ctx0, cur, ffn_inp);
- cb(cur, "ffn_out", il);
- cur = build_cvec(cur, il);
- cb(cur, "l_out", il);
- // input for next layer
- inpL = cur;
- }
- cur = inpL;
- cur = build_norm(cur,
- model.output_norm, NULL,
- LLM_NORM_RMS, -1);
- cb(cur, "result_norm", -1);
- res->t_embd = cur;
- // lm_head
- cur = build_lora_mm(model.output, cur);
- cb(cur, "result_output", -1);
- res->t_logits = cur;
- ggml_build_forward_expand(gf, cur);
- }
- };
- struct llm_build_rwkv6_base : public llm_graph_context {
- const llama_model & model;
- llm_build_rwkv6_base(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params), model(model) {
- }
- ggml_tensor * build_rwkv6_channel_mix(
- const llama_layer * layer,
- ggml_tensor * cur,
- ggml_tensor * x_prev,
- llm_arch arch) const {
- ggml_tensor * sx = ggml_sub(ctx0, x_prev, cur);
- switch (arch) {
- case LLM_ARCH_RWKV6:
- {
- ggml_tensor * xk = ggml_add(ctx0, ggml_mul(ctx0, sx, layer->channel_mix_lerp_k), cur);
- ggml_tensor * xr = ggml_add(ctx0, ggml_mul(ctx0, sx, layer->channel_mix_lerp_r), cur);
- ggml_tensor * r = ggml_sigmoid(ctx0, build_lora_mm(layer->channel_mix_receptance, xr));
- ggml_tensor * k = ggml_sqr(
- ctx0,
- ggml_relu(
- ctx0,
- build_lora_mm(layer->channel_mix_key, xk)
- )
- );
- cur = ggml_mul(ctx0, r, build_lora_mm(layer->channel_mix_value, k));
- } break;
- default:
- GGML_ABORT("fatal error");
- }
- return cur;
- }
- ggml_tensor * build_rwkv6_time_mix(
- ggml_cgraph * gf,
- ggml_tensor * cur,
- ggml_tensor * x_prev,
- ggml_tensor * state_copy,
- ggml_tensor * state_mask,
- const llama_ubatch & ubatch,
- int il) const {
- const llama_kv_cache_recurrent * kv_self = static_cast<const llama_kv_cache_recurrent *>(memory);
- const auto n_tokens = ubatch.n_tokens;
- const auto n_seqs = ubatch.n_seqs;
- const auto n_seq_tokens = ubatch.n_seq_tokens;
- const auto n_embd = hparams.n_embd;
- const auto head_size = hparams.wkv_head_size;
- const auto n_head = n_embd / head_size;
- const auto n_head_kv = hparams.n_head_kv(il);
- const auto kv_head = kv_self->head;
- const auto & layer = model.layers[il];
- bool is_qrwkv = layer.time_mix_first == nullptr;
- ggml_tensor * sx = ggml_sub(ctx0, x_prev, cur);
- sx = ggml_reshape_2d(ctx0, sx, n_embd, n_tokens);
- cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens);
- ggml_tensor * xxx = ggml_add(ctx0, ggml_mul(ctx0, sx, layer.time_mix_lerp_x), cur);
- xxx = ggml_reshape_4d(
- ctx0,
- ggml_tanh(
- ctx0,
- ggml_mul_mat(ctx0, layer.time_mix_w1, xxx)
- ),
- layer.time_mix_w1->ne[1] / 5, 1, 5, n_tokens
- );
- xxx = ggml_cont(ctx0, ggml_permute(ctx0, xxx, 0, 1, 3, 2));
- xxx = ggml_mul_mat(
- ctx0,
- ggml_reshape_4d(
- ctx0,
- layer.time_mix_w2,
- layer.time_mix_w2->ne[0], layer.time_mix_w2->ne[1], 1, 5
- ),
- xxx
- );
- ggml_tensor *xw, *xk, *xv, *xr, *xg;
- if (layer.time_mix_lerp_fused) {
- // fusing these weights makes some performance improvement
- sx = ggml_reshape_3d(ctx0, sx, n_embd, 1, n_tokens);
- cur = ggml_reshape_3d(ctx0, cur, n_embd, 1, n_tokens);
- xxx = ggml_add(ctx0, ggml_mul(ctx0, ggml_add(ctx0, xxx, layer.time_mix_lerp_fused), sx), cur);
- xw = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], 0);
- xk = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * sizeof(float));
- xv = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 2 * sizeof(float));
- xr = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 3 * sizeof(float));
- xg = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 4 * sizeof(float));
- } else {
- // for backward compatibility
- xw = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], 0);
- xk = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * sizeof(float));
- xv = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 2 * sizeof(float));
- xr = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 3 * sizeof(float));
- xg = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 4 * sizeof(float));
- xw = ggml_add(ctx0, ggml_mul(ctx0, ggml_add(ctx0, xw, layer.time_mix_lerp_w), sx), cur);
- xk = ggml_add(ctx0, ggml_mul(ctx0, ggml_add(ctx0, xk, layer.time_mix_lerp_k), sx), cur);
- xv = ggml_add(ctx0, ggml_mul(ctx0, ggml_add(ctx0, xv, layer.time_mix_lerp_v), sx), cur);
- xr = ggml_add(ctx0, ggml_mul(ctx0, ggml_add(ctx0, xr, layer.time_mix_lerp_r), sx), cur);
- xg = ggml_add(ctx0, ggml_mul(ctx0, ggml_add(ctx0, xg, layer.time_mix_lerp_g), sx), cur);
- }
- ggml_tensor * r = build_lora_mm(layer.time_mix_receptance, xr);
- ggml_tensor * k = build_lora_mm(layer.time_mix_key, xk);
- ggml_tensor * v = build_lora_mm(layer.time_mix_value, xv);
- if (layer.time_mix_receptance_b) {
- r = ggml_add(ctx0, r, layer.time_mix_receptance_b);
- }
- if (layer.time_mix_key_b) {
- k = ggml_add(ctx0, k, layer.time_mix_key_b);
- }
- if (layer.time_mix_value_b) {
- v = ggml_add(ctx0, v, layer.time_mix_value_b);
- }
- ggml_tensor * g = build_lora_mm(layer.time_mix_gate, xg);
- if (is_qrwkv) {
- g = ggml_sigmoid(ctx0, g);
- } else {
- g = ggml_silu(ctx0, g);
- }
- if (n_head_kv != 0 && n_head_kv != n_head) {
- GGML_ASSERT(n_head % n_head_kv == 0);
- k = ggml_reshape_4d(ctx0, k, head_size, 1, n_head_kv, n_tokens);
- v = ggml_reshape_4d(ctx0, v, head_size, 1, n_head_kv, n_tokens);
- ggml_tensor * tmp = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, head_size, n_head / n_head_kv, n_head_kv, n_tokens);
- k = ggml_repeat(ctx0, k, tmp);
- v = ggml_repeat(ctx0, v, tmp);
- }
- k = ggml_reshape_3d(ctx0, k, head_size, n_head, n_tokens);
- v = ggml_reshape_3d(ctx0, v, head_size, n_head, n_tokens);
- r = ggml_reshape_3d(ctx0, r, head_size, n_head, n_tokens);
- ggml_tensor * w = ggml_mul_mat(
- ctx0,
- layer.time_mix_decay_w2,
- ggml_tanh(
- ctx0,
- ggml_mul_mat(ctx0, layer.time_mix_decay_w1, xw)
- )
- );
- w = ggml_add(ctx0, w, layer.time_mix_decay);
- w = ggml_exp(ctx0, ggml_neg(ctx0, ggml_exp(ctx0, w)));
- w = ggml_reshape_3d(ctx0, w, head_size, n_head, n_tokens);
- if (is_qrwkv) {
- // k = k * (1 - w)
- k = ggml_sub(ctx0, k, ggml_mul(ctx0, k, w));
- }
- ggml_tensor * wkv_state = build_copy_mask_state(
- gf, kv_self->v_l[il], state_copy, state_mask,
- hparams.n_embd_v_s(), n_seqs);
- ggml_tensor * wkv_output;
- if (is_qrwkv) {
- wkv_output = ggml_gated_linear_attn(ctx0, k, v, r, w, wkv_state, pow(head_size, -0.5f));
- } else {
- wkv_output = ggml_rwkv_wkv6(ctx0, k, v, r, layer.time_mix_first, w, wkv_state);
- }
- cur = ggml_view_1d(ctx0, wkv_output, n_embd * n_tokens, 0);
- wkv_state = ggml_view_1d(ctx0, wkv_output, n_embd * head_size * n_seqs, n_embd * n_tokens * sizeof(float));
- ggml_build_forward_expand(
- gf,
- ggml_cpy(
- ctx0,
- wkv_state,
- ggml_view_1d(
- ctx0,
- kv_self->v_l[il],
- hparams.n_embd_v_s() * n_seqs,
- hparams.n_embd_v_s() * kv_head * ggml_element_size(kv_self->v_l[il])
- )
- )
- );
- if (!is_qrwkv) {
- // group norm with head_count groups
- cur = ggml_reshape_3d(ctx0, cur, n_embd / n_head, n_head, n_tokens);
- cur = ggml_norm(ctx0, cur, 64e-5f);
- // Convert back to regular vectors.
- cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens);
- cur = ggml_add(ctx0, ggml_mul(ctx0, cur, layer.time_mix_ln), layer.time_mix_ln_b);
- } else {
- cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens);
- }
- cur = ggml_mul(ctx0, cur, g);
- cur = build_lora_mm(layer.time_mix_output, cur);
- return ggml_reshape_3d(ctx0, cur, n_embd, n_seq_tokens, n_seqs);
- }
- };
- struct llm_build_rwkv6 : public llm_build_rwkv6_base { // Graph builder for RWKV6: the constructor assembles the whole compute graph into gf.
- llm_build_rwkv6(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_build_rwkv6_base(model, params) {
- GGML_ASSERT(hparams.token_shift_count == 2); // the loaded shift state is split into two slots below (att_shift, ffn_shift)
- ggml_tensor * cur;
- ggml_tensor * inpL;
- inpL = build_inp_embd(model.tok_embd);
- inpL = build_norm(inpL, model.tok_norm, model.tok_norm_b, LLM_NORM, -1); // norm applied to the embeddings before the first layer
- ggml_tensor * state_copy = build_inp_s_copy();
- ggml_tensor * state_mask = build_inp_s_mask();
- const auto n_embd = hparams.n_embd;
- const auto n_seq_tokens = ubatch.n_seq_tokens;
- const auto n_seqs = ubatch.n_seqs;
- for (int il = 0; il < n_layer; ++il) {
- const llama_layer * layer = &model.layers[il];
- inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs); // work on {n_embd, n_seq_tokens, n_seqs}
- ggml_tensor * token_shift = build_rwkv_token_shift_load(
- gf, state_copy, state_mask, ubatch, il
- );
- ggml_tensor * att_shift = ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], 0); // first n_embd-wide slot of the shift state
- ggml_tensor * ffn_shift = ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], n_embd * ggml_element_size(token_shift)); // second slot, n_embd elements further in
- ggml_tensor * att_norm = build_norm(inpL, layer->attn_norm, layer->attn_norm_b, LLM_NORM, il);
- cb(att_norm, "attn_norm", il);
- ggml_tensor * x_prev = ggml_concat( // x_prev = [att_shift, first n_seq_tokens-1 tokens of att_norm], joined along dim 1
- ctx0,
- att_shift,
- ggml_view_3d(ctx0, att_norm, n_embd, n_seq_tokens - 1, n_seqs, att_norm->nb[1], att_norm->nb[2], 0),
- 1
- );
- cur = build_rwkv6_time_mix(gf, att_norm, x_prev, state_copy, state_mask, ubatch, il);
- ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL); // residual: time-mix output + layer input
- cb(ffn_inp, "ffn_inp", il);
- ggml_tensor * ffn_norm = build_norm(ffn_inp, layer->attn_norm_2, layer->attn_norm_2_b, LLM_NORM, il);
- cb(ffn_norm, "ffn_norm", il);
- x_prev = ggml_concat( // same shifted-input construction, now for the channel-mix branch
- ctx0,
- ffn_shift,
- ggml_view_3d(ctx0, ffn_norm, n_embd, n_seq_tokens - 1, n_seqs, ffn_norm->nb[1], ffn_norm->nb[2], 0),
- 1
- );
- token_shift = ggml_concat(ctx0, // new shift state: the last token of att_norm and of ffn_norm for every sequence
- ggml_view_3d(ctx0, att_norm, n_embd, 1, n_seqs, att_norm->nb[1], att_norm->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(att_norm)),
- ggml_view_3d(ctx0, ffn_norm, n_embd, 1, n_seqs, ffn_norm->nb[1], ffn_norm->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(ffn_norm)),
- 1
- );
- ggml_build_forward_expand(gf, build_rwkv_token_shift_store(token_shift, ubatch, il)); // add the state store to the graph
- if (il == n_layer - 1) {
- // skip computing output for unused tokens
- struct ggml_tensor * inp_out_ids = build_inp_out_ids();
- ffn_inp = ggml_get_rows(ctx0, ggml_reshape_2d(ctx0, ffn_inp, n_embd, n_tokens), inp_out_ids);
- ffn_norm = ggml_get_rows(ctx0, ggml_reshape_2d(ctx0, ffn_norm, n_embd, n_tokens), inp_out_ids);
- x_prev = ggml_get_rows(ctx0, ggml_reshape_2d(ctx0, x_prev, n_embd, n_tokens), inp_out_ids);
- cur = ggml_get_rows(ctx0, ggml_reshape_2d(ctx0, cur, n_embd, n_tokens), inp_out_ids); // NOTE(review): cur is reassigned by the channel-mix call right below, so this value looks unused — confirm
- }
- cur = build_rwkv6_channel_mix(layer, ffn_norm, x_prev, LLM_ARCH_RWKV6);
- cur = ggml_add(ctx0, cur, ffn_inp); // residual: channel-mix output + ffn_inp
- if (hparams.rescale_every_n_layers != 0 && (il + 1) % hparams.rescale_every_n_layers == 0) { // 0 disables rescaling
- cur = ggml_scale(ctx0, cur, 0.5F); // halve the activations after every rescale_every_n_layers-th layer
- }
- cur = build_cvec(cur, il);
- cb(cur, "l_out", il);
- // input for next layer
- inpL = cur;
- }
- cur = inpL;
- cur = build_norm(cur, model.output_norm, model.output_norm_b, LLM_NORM, -1);
- cb(cur, "result_norm", -1);
- res->t_embd = cur; // the normalized hidden state is published as the embedding result
- cur = build_lora_mm(model.output, cur);
- cb(cur, "result_output", -1);
- res->t_logits = cur; // the output projection is published as the logits result
- ggml_build_forward_expand(gf, cur);
- }
- };
- // ref: https://huggingface.co/recursal/QRWKV6-32B-Instruct-Preview-v0.1/blob/main/modeling_rwkv6qwen2.py
- struct llm_build_rwkv6qwen2 : public llm_build_rwkv6_base { // Graph builder for RWKV6-Qwen2: RWKV6 time mixing, RMS norms and a SiLU FFN with up/gate/down weights.
- llm_build_rwkv6qwen2(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_build_rwkv6_base(model, params) {
- GGML_ASSERT(n_embd == hparams.n_embd_k_s()); // NOTE(review): n_embd is resolved here before the local n_embd declared below — presumably an inherited member, confirm
- ggml_tensor * cur;
- ggml_tensor * inpL;
- inpL = build_inp_embd(model.tok_embd);
- ggml_tensor * state_copy = build_inp_s_copy();
- ggml_tensor * state_mask = build_inp_s_mask();
- const auto n_embd = hparams.n_embd;
- const auto n_seq_tokens = ubatch.n_seq_tokens;
- const auto n_seqs = ubatch.n_seqs;
- for (int il = 0; il < n_layer; ++il) {
- const llama_layer * layer = &model.layers[il];
- inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs); // work on {n_embd, n_seq_tokens, n_seqs}
- ggml_tensor * token_shift = build_rwkv_token_shift_load(
- gf, state_copy, state_mask, ubatch, il
- );
- ggml_tensor * att_norm = build_norm(inpL, layer->attn_norm, layer->attn_norm_b, LLM_NORM_RMS, il);
- cb(att_norm, "attn_norm", il);
- ggml_tensor * x_prev = ggml_concat( // x_prev = [token_shift, first n_seq_tokens-1 tokens of att_norm], joined along dim 1
- ctx0,
- token_shift,
- ggml_view_3d(ctx0, att_norm, n_embd, n_seq_tokens - 1, n_seqs, att_norm->nb[1], att_norm->nb[2], 0),
- 1
- );
- cur = build_rwkv6_time_mix(gf, att_norm, x_prev, state_copy, state_mask, ubatch, il);
- token_shift = ggml_view_3d(ctx0, att_norm, n_embd, 1, n_seqs, att_norm->nb[1], att_norm->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(att_norm)); // new shift state: last att_norm token of every sequence
- ggml_build_forward_expand(gf, build_rwkv_token_shift_store(token_shift, ubatch, il)); // add the state store to the graph
- ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL); // residual: time-mix output + layer input
- cb(ffn_inp, "ffn_inp", il);
- if (il == n_layer - 1) {
- // skip computing output for unused tokens
- struct ggml_tensor * inp_out_ids = build_inp_out_ids();
- cur = ggml_get_rows(ctx0, ggml_reshape_2d(ctx0, cur, n_embd, n_tokens), inp_out_ids); // NOTE(review): cur is reassigned by build_norm(ffn_inp, ...) below, so this value looks unused — confirm
- ffn_inp = ggml_get_rows(ctx0, ggml_reshape_2d(ctx0, ffn_inp, n_embd, n_tokens), inp_out_ids);
- }
- // feed-forward network
- cur = build_norm(ffn_inp,
- model.layers[il].ffn_norm, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "ffn_norm", il);
- cur = build_ffn(cur,
- model.layers[il].ffn_up, NULL, NULL,
- model.layers[il].ffn_gate, NULL, NULL,
- model.layers[il].ffn_down, NULL, NULL,
- NULL,
- LLM_FFN_SILU, LLM_FFN_PAR, il);
- cb(cur, "ffn_out", il);
- cur = ggml_add(ctx0, cur, ffn_inp); // residual: FFN output + ffn_inp
- cur = build_cvec(cur, il);
- cb(cur, "l_out", il);
- // input for next layer
- inpL = cur;
- }
- cur = inpL;
- cur = build_norm(cur, model.output_norm, model.output_norm_b, LLM_NORM_RMS, -1);
- cb(cur, "result_norm", -1);
- res->t_embd = cur; // the normalized hidden state is published as the embedding result
- cur = build_lora_mm(model.output, cur);
- cb(cur, "result_output", -1);
- res->t_logits = cur; // the output projection is published as the logits result
- ggml_build_forward_expand(gf, cur);
- }
- };
- struct llm_build_rwkv7_base : public llm_graph_context { // Shared helpers (channel mix, time mix) for builders that derive from this struct.
- const llama_model & model;
- llm_build_rwkv7_base(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params), model(model) {
- }
- ggml_tensor * build_rwkv7_channel_mix( // Builds the channel-mix sub-graph for one layer and returns its output tensor.
- const llama_layer * layer,
- ggml_tensor * cur,
- ggml_tensor * x_prev,
- llm_arch arch) const {
- ggml_tensor * sx = ggml_sub(ctx0, x_prev, cur); // sx = x_prev - cur
- switch (arch) {
- case LLM_ARCH_RWKV7:
- {
- ggml_tensor * xk = ggml_add(ctx0, ggml_mul(ctx0, sx, layer->channel_mix_lerp_k), cur); // xk = cur + sx * channel_mix_lerp_k
- ggml_tensor * k = ggml_sqr( // k = relu(channel_mix_key * xk)^2
- ctx0,
- ggml_relu(
- ctx0,
- build_lora_mm(layer->channel_mix_key, xk)
- )
- );
- cur = build_lora_mm(layer->channel_mix_value, k);
- } break;
- default:
- GGML_ABORT("fatal error"); // only LLM_ARCH_RWKV7 is handled here
- }
- return cur;
- }
- ggml_tensor * build_rwkv7_time_mix( // Builds the time-mix sub-graph for layer il; returns a tensor reshaped to {n_embd, n_seq_tokens, n_seqs}.
- ggml_cgraph * gf,
- ggml_tensor * cur,
- ggml_tensor * x_prev,
- ggml_tensor * state_copy,
- ggml_tensor * state_mask,
- ggml_tensor *& first_layer_value, // in/out: set to this call's v when null, otherwise blended into v
- const llama_ubatch & ubatch,
- int il) const {
- const llama_kv_cache_recurrent * kv_self = static_cast<const llama_kv_cache_recurrent *>(memory);
- const auto n_tokens = ubatch.n_tokens;
- const auto n_seqs = ubatch.n_seqs;
- const auto n_embd = hparams.n_embd;
- const auto head_size = hparams.wkv_head_size;
- const auto head_count = n_embd / head_size;
- const auto n_seq_tokens = ubatch.n_seq_tokens;
- const auto kv_head = kv_self->head;
- const auto & layer = model.layers[il];
- bool has_gating = layer.time_mix_g1 && layer.time_mix_g2; // gating is used only when both gate tensors are present
- ggml_tensor * sx = ggml_sub(ctx0, x_prev, cur); // sx = x_prev - cur
- ggml_tensor * dummy = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_embd, n_seq_tokens, n_seqs, has_gating ? 6 : 5); // used only as the target shape for ggml_repeat: one slice per mixed input (r, w, k, v, a and optionally g)
- sx = ggml_repeat(ctx0, sx, dummy);
- ggml_tensor * xxx = ggml_add(ctx0, ggml_mul(ctx0, sx, layer.time_mix_lerp_fused), cur); // xxx = cur + sx * time_mix_lerp_fused
- ggml_tensor * xr = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], 0); // slices of xxx, each n_embd * n_tokens floats apart (byte offsets use sizeof(float))
- ggml_tensor * xw = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * sizeof(float));
- ggml_tensor * xk = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 2 * sizeof(float));
- ggml_tensor * xv = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 3 * sizeof(float));
- ggml_tensor * xa = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 4 * sizeof(float));
- ggml_tensor * xg = has_gating ? ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 5 * sizeof(float)) : nullptr; // sixth slice exists only with gating
- ggml_tensor * r = build_lora_mm(layer.time_mix_receptance, xr);
- ggml_tensor * w = ggml_add( // w = w2 * tanh(w1 * xw) + w0
- ctx0,
- ggml_mul_mat(ctx0, layer.time_mix_w2, ggml_tanh(ctx0, ggml_mul_mat(ctx0, layer.time_mix_w1, xw))),
- layer.time_mix_w0
- );
- w = ggml_exp(ctx0, ggml_scale(ctx0, ggml_sigmoid(ctx0, w), -0.606531)); // w = exp(-0.606531 * sigmoid(w))
- ggml_tensor * k = build_lora_mm(layer.time_mix_key, xk);
- ggml_tensor * v = build_lora_mm(layer.time_mix_value, xv);
- if (first_layer_value == nullptr) {
- first_layer_value = v; // first call: remember v for the calls that follow
- } else {
- // Add the first layer value as a residual connection.
- v = ggml_add(ctx0, v, // v = v + (first_layer_value - v) * sigmoid(v2 * (v1 * xv) + v0)
- ggml_mul(ctx0,
- ggml_sub(ctx0, first_layer_value, v),
- ggml_sigmoid(ctx0, ggml_add(ctx0,
- ggml_mul_mat(ctx0, layer.time_mix_v2, ggml_mul_mat(ctx0, layer.time_mix_v1, xv)),
- layer.time_mix_v0
- )
- )
- )
- );
- }
- ggml_tensor * g = nullptr;
- if (layer.time_mix_g1 && layer.time_mix_g2) { // same condition as has_gating above
- g = ggml_mul_mat(ctx0, layer.time_mix_g2, ggml_sigmoid(ctx0, ggml_mul_mat(ctx0, layer.time_mix_g1, xg))); // g = g2 * sigmoid(g1 * xg)
- }
- ggml_tensor * a = ggml_sigmoid(ctx0, // a = sigmoid(a2 * (a1 * xa) + a0)
- ggml_add(
- ctx0,
- ggml_mul_mat(ctx0, layer.time_mix_a2, ggml_mul_mat(ctx0, layer.time_mix_a1, xa)),
- layer.time_mix_a0
- )
- );
- ggml_tensor * kk = ggml_reshape_3d(ctx0, ggml_mul(ctx0, k, layer.time_mix_k_k), head_size, head_count, n_tokens); // kk = k * time_mix_k_k, viewed per head
- kk = ggml_l2_norm(ctx0, kk, 1e-12);
- ggml_tensor * ka = ggml_mul(ctx0, k, layer.time_mix_k_a);
- k = ggml_add(ctx0, k, ggml_sub(ctx0, ggml_mul(ctx0, a, ka), ka)); // k = k + (a * ka - ka)
- r = ggml_reshape_3d(ctx0, r, head_size, head_count, n_tokens); // all wkv inputs are reshaped to {head_size, head_count, n_tokens}
- w = ggml_reshape_3d(ctx0, w, head_size, head_count, n_tokens);
- k = ggml_reshape_3d(ctx0, k, head_size, head_count, n_tokens);
- v = ggml_reshape_3d(ctx0, v, head_size, head_count, n_tokens);
- a = ggml_reshape_3d(ctx0, a, head_size, head_count, n_tokens);
- ggml_tensor * wkv_state = build_copy_mask_state(
- gf, kv_self->v_l[il], state_copy, state_mask,
- hparams.n_embd_v_s(), n_seqs);
- ggml_tensor * wkv_output = ggml_rwkv_wkv7(ctx0, r, w, k, v, ggml_neg(ctx0, kk), ggml_mul(ctx0, kk, a), wkv_state);
- cur = ggml_view_1d(ctx0, wkv_output, n_embd * n_tokens, 0); // the first n_embd * n_tokens elements are taken as the per-token output
- wkv_state = ggml_view_1d(ctx0, wkv_output, n_embd * head_size * n_seqs, n_embd * n_tokens * sizeof(float)); // the elements after that are taken as the updated state
- ggml_build_forward_expand( // copy the updated state back into kv_self->v_l[il], starting at slot kv_head
- gf,
- ggml_cpy(
- ctx0,
- wkv_state,
- ggml_view_1d(
- ctx0,
- kv_self->v_l[il],
- hparams.n_embd_v_s() * n_seqs,
- hparams.n_embd_v_s() * kv_head * ggml_element_size(kv_self->v_l[il])
- )
- )
- );
- if (layer.time_mix_ln && layer.time_mix_ln_b) { // this norm is applied only when both its weight and bias are present
- // group norm with head_count groups
- cur = ggml_reshape_3d(ctx0, cur, n_embd / head_count, head_count, n_tokens);
- cur = ggml_norm(ctx0, cur, 64e-5f);
- // Convert back to regular vectors.
- cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens);
- cur = ggml_add(ctx0, ggml_mul(ctx0, cur, layer.time_mix_ln), layer.time_mix_ln_b);
- } else {
- cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens);
- }
- ggml_tensor * rk = ggml_sum_rows(ctx0, // rk = row-sum of k * r * time_mix_r_k (time_mix_r_k viewed as {head_size, head_count})
- ggml_mul(ctx0, ggml_mul(ctx0, k, r), ggml_reshape_2d(ctx0, layer.time_mix_r_k, head_size, head_count)));
- cur = ggml_add(ctx0, cur, ggml_reshape_2d(ctx0, ggml_mul(ctx0, v, rk), n_embd, n_tokens)); // cur += v * rk
- if (has_gating) {
- cur = ggml_mul(ctx0, cur, g);
- }
- cur = build_lora_mm(layer.time_mix_output, cur);
- return ggml_reshape_3d(ctx0, cur, n_embd, n_seq_tokens, n_seqs);
- }
- };
- struct llm_build_rwkv7 : public llm_build_rwkv7_base { // Graph builder for RWKV7: the constructor assembles the whole compute graph into gf.
- llm_build_rwkv7(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_build_rwkv7_base(model, params) {
- GGML_ASSERT(hparams.token_shift_count == 2); // the loaded shift state is split into two slots below (att_shift, ffn_shift)
- ggml_tensor * cur;
- ggml_tensor * inpL;
- ggml_tensor * v_first = nullptr; // passed by reference to every build_rwkv7_time_mix call; starts out null
- inpL = build_inp_embd(model.tok_embd);
- inpL = build_norm(inpL, model.tok_norm, model.tok_norm_b, LLM_NORM, -1); // norm applied to the embeddings before the first layer
- ggml_tensor * state_copy = build_inp_s_copy();
- ggml_tensor * state_mask = build_inp_s_mask();
- const auto n_embd = hparams.n_embd;
- const auto n_seq_tokens = ubatch.n_seq_tokens;
- const auto n_seqs = ubatch.n_seqs;
- for (int il = 0; il < n_layer; ++il) {
- const llama_layer * layer = &model.layers[il];
- inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs); // work on {n_embd, n_seq_tokens, n_seqs}
- ggml_tensor * token_shift = build_rwkv_token_shift_load(
- gf, state_copy, state_mask, ubatch, il
- );
- ggml_tensor * att_shift = ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], 0); // first n_embd-wide slot of the shift state
- ggml_tensor * ffn_shift = ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], n_embd * ggml_element_size(token_shift)); // second slot, n_embd elements further in
- ggml_tensor * att_norm = build_norm(inpL, layer->attn_norm, layer->attn_norm_b, LLM_NORM, il);
- cb(att_norm, "attn_norm", il);
- ggml_tensor * x_prev = ggml_concat( // x_prev = [att_shift, first n_seq_tokens-1 tokens of att_norm], joined along dim 1
- ctx0,
- att_shift,
- ggml_view_3d(ctx0, att_norm, n_embd, n_seq_tokens - 1, n_seqs, att_norm->nb[1], att_norm->nb[2], 0),
- 1
- );
- cur = build_rwkv7_time_mix(gf, att_norm, x_prev, state_copy, state_mask, v_first, ubatch, il);
- ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL); // residual: time-mix output + layer input
- cb(ffn_inp, "ffn_inp", il);
- ggml_tensor * ffn_norm = build_norm(ffn_inp, layer->attn_norm_2, layer->attn_norm_2_b, LLM_NORM, il);
- cb(ffn_norm, "ffn_norm", il);
- x_prev = ggml_concat( // same shifted-input construction, now for the channel-mix branch
- ctx0,
- ffn_shift,
- ggml_view_3d(ctx0, ffn_norm, n_embd, n_seq_tokens - 1, n_seqs, ffn_norm->nb[1], ffn_norm->nb[2], 0),
- 1
- );
- token_shift = ggml_concat(ctx0, // new shift state: the last token of att_norm and of ffn_norm for every sequence
- ggml_view_3d(ctx0, att_norm, n_embd, 1, n_seqs, att_norm->nb[1], att_norm->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(att_norm)),
- ggml_view_3d(ctx0, ffn_norm, n_embd, 1, n_seqs, ffn_norm->nb[1], ffn_norm->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(ffn_norm)),
- 1
- );
- ggml_build_forward_expand(gf, build_rwkv_token_shift_store(token_shift, ubatch, il)); // add the state store to the graph
- if (il == n_layer - 1) {
- // skip computing output for unused tokens
- struct ggml_tensor * inp_out_ids = build_inp_out_ids();
- ffn_inp = ggml_get_rows(ctx0, ggml_reshape_2d(ctx0, ffn_inp, n_embd, n_tokens), inp_out_ids);
- ffn_norm = ggml_get_rows(ctx0, ggml_reshape_2d(ctx0, ffn_norm, n_embd, n_tokens), inp_out_ids);
- x_prev = ggml_get_rows(ctx0, ggml_reshape_2d(ctx0, x_prev, n_embd, n_tokens), inp_out_ids);
- }
- cur = build_rwkv7_channel_mix(layer, ffn_norm, x_prev, LLM_ARCH_RWKV7);
- cur = ggml_add(ctx0, cur, ffn_inp); // residual: channel-mix output + ffn_inp
- cur = build_cvec(cur, il);
- cb(cur, "l_out", il);
- // input for next layer
- inpL = cur;
- }
- cur = inpL;
- cur = build_norm(cur, model.output_norm, model.output_norm_b, LLM_NORM, -1);
- cb(cur, "result_norm", -1);
- res->t_embd = cur; // the normalized hidden state is published as the embedding result
- cur = build_lora_mm(model.output, cur);
- cb(cur, "result_output", -1);
- res->t_logits = cur; // the output projection is published as the logits result
- ggml_build_forward_expand(gf, cur);
- }
- };
- struct llm_build_arwkv7 : public llm_build_rwkv7_base { // Graph builder for ARWKV7: RWKV7 time mixing, RMS norms and a SiLU FFN with up/gate/down weights.
- llm_build_arwkv7(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_build_rwkv7_base(model, params) {
- GGML_ASSERT(n_embd == hparams.n_embd_k_s()); // NOTE(review): n_embd is resolved here before the local n_embd declared below — presumably an inherited member, confirm
- ggml_tensor * cur;
- ggml_tensor * inpL;
- ggml_tensor * v_first = nullptr; // passed by reference to every build_rwkv7_time_mix call; starts out null
- inpL = build_inp_embd(model.tok_embd);
- ggml_tensor * state_copy = build_inp_s_copy();
- ggml_tensor * state_mask = build_inp_s_mask();
- const auto n_embd = hparams.n_embd;
- const auto n_seq_tokens = ubatch.n_seq_tokens;
- const auto n_seqs = ubatch.n_seqs;
- for (int il = 0; il < n_layer; ++il) {
- const llama_layer * layer = &model.layers[il];
- inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs); // work on {n_embd, n_seq_tokens, n_seqs}
- ggml_tensor * token_shift = build_rwkv_token_shift_load(
- gf, state_copy, state_mask, ubatch, il
- );
- ggml_tensor * att_norm = build_norm(inpL, layer->attn_norm, layer->attn_norm_b, LLM_NORM_RMS, il);
- cb(att_norm, "attn_norm", il);
- ggml_tensor * x_prev = ggml_concat( // x_prev = [token_shift, first n_seq_tokens-1 tokens of att_norm], joined along dim 1
- ctx0,
- token_shift,
- ggml_view_3d(ctx0, att_norm, n_embd, n_seq_tokens - 1, n_seqs, att_norm->nb[1], att_norm->nb[2], 0),
- 1
- );
- cur = build_rwkv7_time_mix(gf, att_norm, x_prev, state_copy, state_mask, v_first, ubatch, il);
- token_shift = ggml_view_3d(ctx0, att_norm, n_embd, 1, n_seqs, att_norm->nb[1], att_norm->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(att_norm)); // new shift state: last att_norm token of every sequence
- ggml_build_forward_expand(gf, build_rwkv_token_shift_store(token_shift, ubatch, il)); // add the state store to the graph
- ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL); // residual: time-mix output + layer input
- cb(ffn_inp, "ffn_inp", il);
- if (il == n_layer - 1) {
- // skip computing output for unused tokens
- struct ggml_tensor * inp_out_ids = build_inp_out_ids();
- cur = ggml_get_rows(ctx0, ggml_reshape_2d(ctx0, cur, n_embd, n_tokens), inp_out_ids); // NOTE(review): cur is reassigned by build_norm(ffn_inp, ...) below, so this value looks unused — confirm
- ffn_inp = ggml_get_rows(ctx0, ggml_reshape_2d(ctx0, ffn_inp, n_embd, n_tokens), inp_out_ids);
- }
- // feed-forward network
- cur = build_norm(ffn_inp,
- model.layers[il].ffn_norm, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "ffn_norm", il);
- cur = build_ffn(cur,
- model.layers[il].ffn_up, NULL, NULL,
- model.layers[il].ffn_gate, NULL, NULL,
- model.layers[il].ffn_down, NULL, NULL,
- NULL,
- LLM_FFN_SILU, LLM_FFN_PAR, il);
- cb(cur, "ffn_out", il);
- cur = ggml_add(ctx0, cur, ffn_inp); // residual: FFN output + ffn_inp
- cur = build_cvec(cur, il);
- cb(cur, "l_out", il);
- // input for next layer
- inpL = cur;
- }
- cur = inpL;
- cur = build_norm(cur, model.output_norm, model.output_norm_b, LLM_NORM_RMS, -1);
- cb(cur, "result_norm", -1);
- res->t_embd = cur; // the normalized hidden state is published as the embedding result
- cur = build_lora_mm(model.output, cur);
- cb(cur, "result_output", -1);
- res->t_logits = cur; // the output projection is published as the logits result
- ggml_build_forward_expand(gf, cur);
- }
- };
- // ref: https://github.com/facebookresearch/chameleon
- // based on the original build_llama() function, changes:
- // * qk-norm
- // * swin-norm
- // * removed bias
- // * removed MoE
- struct llm_build_chameleon : public llm_graph_context { // Graph builder for Chameleon: attention + SiLU FFN layers, then logits with the image-token range forced to -FLT_MAX.
- llm_build_chameleon(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); // K and V heads must have the same width
- GGML_ASSERT(n_embd_head == hparams.n_rot); // RoPE must cover the whole head
- ggml_tensor * cur;
- ggml_tensor * inpL;
- inpL = build_inp_embd(model.tok_embd);
- // inp_pos - contains the positions
- ggml_tensor * inp_pos = build_inp_pos();
- auto * inp_attn = build_attn_inp_kv_unified();
- for (int il = 0; il < n_layer; ++il) {
- ggml_tensor * inpSA = inpL; // kept for the residual connection around attention
- // norm
- if (hparams.swin_norm) {
- cur = inpL; // with swin_norm the attn_norm is applied after attention instead (see below)
- } else {
- cur = build_norm(inpL,
- model.layers[il].attn_norm, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "attn_norm", il);
- }
- // self-attention
- {
- // compute Q and K and RoPE them
- ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
- cb(Qcur, "Qcur", il);
- ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
- cb(Kcur, "Kcur", il);
- ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
- cb(Vcur, "Vcur", il);
- if (model.layers[il].attn_q_norm) { // optional Q norm, applied on a {n_embd_head, n_head, n_tokens} view
- Qcur = ggml_view_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens,
- ggml_element_size(Qcur) * n_embd_head,
- ggml_element_size(Qcur) * n_embd_head * n_head,
- 0);
- cb(Qcur, "Qcur", il);
- Qcur = build_norm(Qcur,
- model.layers[il].attn_q_norm,
- model.layers[il].attn_q_norm_b,
- LLM_NORM, il);
- cb(Qcur, "Qcur", il);
- }
- if (model.layers[il].attn_k_norm) { // optional K norm, applied on a {n_embd_head, n_head_kv, n_tokens} view
- Kcur = ggml_view_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens,
- ggml_element_size(Kcur) * n_embd_head,
- ggml_element_size(Kcur) * n_embd_head * n_head_kv,
- 0);
- cb(Kcur, "Kcur", il);
- Kcur = build_norm(Kcur,
- model.layers[il].attn_k_norm,
- model.layers[il].attn_k_norm_b,
- LLM_NORM, il);
- cb(Kcur, "Kcur", il);
- }
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
- Qcur = ggml_rope_ext(
- ctx0, Qcur, inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
- Kcur = ggml_rope_ext(
- ctx0, Kcur, inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
- cb(Qcur, "Qcur", il);
- cb(Kcur, "Kcur", il);
- cb(Vcur, "Vcur", il);
- cur = build_attn(inp_attn, gf,
- model.layers[il].wo, nullptr,
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
- if (hparams.swin_norm) { // swin_norm: normalize the attention output rather than its input
- cur = build_norm(cur,
- model.layers[il].attn_norm, NULL,
- LLM_NORM_RMS, il);
- }
- }
- if (il == n_layer - 1) {
- // skip computing output for unused tokens
- ggml_tensor * inp_out_ids = build_inp_out_ids();
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
- inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
- }
- ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); // residual: attention output + layer input
- cb(ffn_inp, "ffn_inp", il);
- // feed-forward network
- if (!hparams.swin_norm) { // without swin_norm the FFN input is normalized; with it, cur still holds the attention output
- cur = build_norm(ffn_inp,
- model.layers[il].ffn_norm, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "ffn_norm", il);
- }
- cur = build_ffn(cur,
- model.layers[il].ffn_up, NULL, NULL,
- model.layers[il].ffn_gate, NULL, NULL,
- model.layers[il].ffn_down, NULL, NULL,
- NULL,
- LLM_FFN_SILU, LLM_FFN_PAR, il);
- cb(cur, "ffn_out", il);
- if (hparams.swin_norm) { // swin_norm: normalize the FFN output rather than its input
- cur = build_norm(cur,
- model.layers[il].ffn_norm, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "ffn_norm", il);
- }
- cur = ggml_add(ctx0, cur, ffn_inp); // residual: FFN output + ffn_inp
- cb(cur, "ffn_out", il); // the name "ffn_out" is reported a second time, now for the residual sum
- cur = build_cvec(cur, il);
- cb(cur, "l_out", il);
- // input for next layer
- inpL = cur;
- }
- cur = inpL;
- cur = build_norm(cur,
- model.output_norm, NULL,
- LLM_NORM_RMS, -1);
- cb(cur, "result_norm", -1);
- res->t_embd = cur; // the normalized hidden state is published as the embedding result
- // lm_head
- cur = build_lora_mm(model.output, cur);
- cb(cur, "result_output_with_img_logits", -1);
- // TODO: this suppresses the output of image tokens, which is required to enable text-only outputs.
- // Needs to be removed once image outputs are supported.
- int img_token_end_idx = 8196; // hard-coded token range [4, 8196) that gets suppressed
- int img_token_start_idx = 4;
- int num_img_tokens = img_token_end_idx - img_token_start_idx;
- // creates 1d tensor of size num_img_tokens and values -FLT_MAX,
- // which ensures that text token values are always at least larger than image token values
- ggml_tensor * img_logits = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, num_img_tokens);
- img_logits = ggml_clamp(ctx0, img_logits, -FLT_MAX, -FLT_MAX); // clamping with min == max is what sets every element to -FLT_MAX
- cb(img_logits, "img_logits", -1);
- cur = ggml_set_1d(ctx0, cur, img_logits, ggml_element_size(cur) * img_token_start_idx); // write img_logits into cur at a byte offset of img_token_start_idx elements
- cb(cur, "result_output", -1);
- res->t_logits = cur; // the masked logits are published as the logits result
- ggml_build_forward_expand(gf, cur);
- }
- };
- struct llm_build_wavtokenizer_dec : public llm_graph_context { // Graph builder for the WavTokenizer decoder: input conv, posnet layers, convnext layers, output projection.
- llm_build_wavtokenizer_dec(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
- ggml_tensor * cur;
- ggml_tensor * inpL;
- inpL = build_inp_embd(model.tok_embd);
- cur = ggml_cont(ctx0, ggml_transpose(ctx0, inpL)); // transposed (and made contiguous) before the convolutions
- cur = ggml_conv_1d_ph(ctx0, model.conv1d, cur, 1, 1);
- cur = ggml_add(ctx0, cur, model.conv1d_b);
- // posnet
- for (uint32_t il = 0; il < hparams.posnet.n_layer; ++il) {
- const auto & layer = model.layers[il].posnet;
- inpL = cur; // kept for the residual additions inside the cases below
- switch (il) { // the kind of block is chosen by the layer index
- case 0:
- case 1:
- case 3:
- case 4:
- { // two rounds of norm -> x*sigmoid(x) -> conv + bias, then a residual add
- cur = build_norm(cur,
- layer.norm1,
- layer.norm1_b,
- LLM_NORM_GROUP, 0);
- cur = ggml_mul(ctx0, ggml_sigmoid(ctx0, cur), cur); // cur = cur * sigmoid(cur)
- cur = ggml_conv_1d_ph(ctx0, layer.conv1, cur, 1, 1);
- cur = ggml_add(ctx0, cur, layer.conv1_b);
- cur = build_norm(cur,
- layer.norm2,
- layer.norm2_b,
- LLM_NORM_GROUP, 0);
- cur = ggml_mul(ctx0, ggml_sigmoid(ctx0, cur), cur); // cur = cur * sigmoid(cur)
- cur = ggml_conv_1d_ph(ctx0, layer.conv2, cur, 1, 1);
- cur = ggml_add(ctx0, cur, layer.conv2_b);
- cur = ggml_add(ctx0, cur, inpL);
- } break;
- case 2:
- { // attention block whose q/k/v/o projections are conv1d calls with bias
- cur = build_norm(cur,
- layer.attn_norm,
- layer.attn_norm_b,
- LLM_NORM_GROUP, 0);
- ggml_tensor * q;
- ggml_tensor * k;
- ggml_tensor * v;
- q = ggml_conv_1d_ph(ctx0, layer.attn_q, cur, 1, 1);
- k = ggml_conv_1d_ph(ctx0, layer.attn_k, cur, 1, 1);
- v = ggml_conv_1d_ph(ctx0, layer.attn_v, cur, 1, 1);
- q = ggml_add(ctx0, q, layer.attn_q_b);
- k = ggml_add(ctx0, k, layer.attn_k_b);
- v = ggml_add(ctx0, v, layer.attn_v_b);
- q = ggml_cont(ctx0, ggml_transpose(ctx0, q)); // q and k are transposed for the matmul; v is used as-is
- k = ggml_cont(ctx0, ggml_transpose(ctx0, k));
- ggml_tensor * kq = ggml_mul_mat(ctx0, k, q);
- kq = ggml_soft_max_ext(ctx0, kq, nullptr, 1.0f/sqrtf(float(hparams.posnet.n_embd)), 0.0f); // softmax with scale 1/sqrt(posnet.n_embd), no mask
- cur = ggml_mul_mat(ctx0, kq, v);
- cur = ggml_conv_1d_ph(ctx0, layer.attn_o, cur, 1, 1);
- cur = ggml_add(ctx0, cur, layer.attn_o_b);
- cur = ggml_add(ctx0, cur, inpL);
- } break;
- case 5:
- { // norm only, no residual
- cur = build_norm(cur,
- layer.norm,
- layer.norm_b,
- LLM_NORM_GROUP, 0);
- } break;
- default: GGML_ABORT("unknown posnet layer"); // any index above 5 aborts
- };
- }
- cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur)); // transpose, apply tok_norm, transpose back
- cur = build_norm(cur,
- model.tok_norm,
- model.tok_norm_b,
- LLM_NORM, -1);
- cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur));
- inpL = cur;
- // convnext
- for (uint32_t il = 0; il < hparams.convnext.n_layer; ++il) {
- const auto & layer = model.layers[il].convnext; // indexes model.layers from 0 again, like the posnet loop above
- cur = inpL;
- cur = ggml_conv_1d_dw_ph(ctx0, layer.dw, cur, 1, 1);
- cur = ggml_add(ctx0, cur, layer.dw_b);
- cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur)); // norm and FFN run on the transposed tensor
- cur = build_norm(cur,
- layer.norm,
- layer.norm_b,
- LLM_NORM, -1);
- cur = build_ffn(cur,
- layer.pw1, layer.pw1_b, NULL,
- NULL, NULL, NULL,
- layer.pw2, layer.pw2_b, NULL,
- NULL,
- LLM_FFN_GELU, LLM_FFN_SEQ, il);
- cur = ggml_mul(ctx0, cur, layer.gamma);
- cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur)); // transposed back before the residual add
- inpL = ggml_add(ctx0, cur, inpL); // the residual is accumulated in inpL
- }
- cur = inpL;
- cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur));
- cur = build_norm(cur,
- model.output_norm,
- model.output_norm_b,
- LLM_NORM, -1);
- // lm_head
- cur = build_lora_mm(model.output, cur);
- cur = ggml_add(ctx0, cur, model.output_b);
- cb(cur, "result_embd", -1);
- res->t_embd = cur; // this constructor sets only t_embd; it does not assign t_logits
- ggml_build_forward_expand(gf, cur);
- }
- };
- struct llm_build_plm : public llm_graph_context { // Graph builder for PLM: attention with a low-rank compressed KV projection and a shared RoPE key, plus a squared-ReLU FFN.
- llm_build_plm(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
- const float kq_scale = 1.0f/sqrtf(float(hparams.n_embd_head_k)); // attention scale 1/sqrt(n_embd_head_k)
- const uint32_t n_embd_head_qk_rope = hparams.n_rot; // part of each Q/K head that receives RoPE
- const uint32_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot; // remaining part of each Q/K head
- const uint32_t kv_lora_rank = hparams.n_lora_kv;
- ggml_tensor * cur;
- ggml_tensor * inpL;
- // {n_embd, n_tokens}
- inpL = build_inp_embd(model.tok_embd);
- // inp_pos - contains the positions
- ggml_tensor * inp_pos = build_inp_pos();
- auto * inp_attn = build_attn_inp_kv_unified();
- for (int il = 0; il < n_layer; ++il) {
- ggml_tensor * inpSA = inpL; // kept for the residual connection around attention
- // norm
- cur = build_norm(inpL,
- model.layers[il].attn_norm, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "attn_norm", il);
- // self_attention
- {
- ggml_tensor * q = NULL;
- q = ggml_mul_mat(ctx0, model.layers[il].wq, cur); // calls ggml_mul_mat directly (not build_lora_mm)
- cb(q, "q", il);
- // split into {n_head * n_embd_head_qk_nope, n_tokens}
- ggml_tensor * q_nope = ggml_view_3d(ctx0, q, n_embd_head_qk_nope, n_head, n_tokens,
- ggml_row_size(q->type, hparams.n_embd_head_k),
- ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
- 0);
- cb(q_nope, "q_nope", il);
- // and {n_head * n_embd_head_qk_rope, n_tokens}
- ggml_tensor * q_pe = ggml_view_3d(ctx0, q, n_embd_head_qk_rope, n_head, n_tokens,
- ggml_row_size(q->type, hparams.n_embd_head_k),
- ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
- ggml_row_size(q->type, n_embd_head_qk_nope)); // starts right after the nope part of each head
- cb(q_pe, "q_pe", il);
- // {n_embd, kv_lora_rank + n_embd_head_qk_rope} * {n_embd, n_tokens} -> {kv_lora_rank + n_embd_head_qk_rope, n_tokens}
- ggml_tensor * kv_pe_compresseed = ggml_mul_mat(ctx0, model.layers[il].wkv_a_mqa, cur);
- cb(kv_pe_compresseed, "kv_pe_compresseed", il);
- // split into {kv_lora_rank, n_tokens}
- ggml_tensor * kv_compressed = ggml_view_2d(ctx0, kv_pe_compresseed, kv_lora_rank, n_tokens,
- kv_pe_compresseed->nb[1],
- 0);
- cb(kv_compressed, "kv_compressed", il);
- // and {n_embd_head_qk_rope, n_tokens}
- ggml_tensor * k_pe = ggml_view_3d(ctx0, kv_pe_compresseed, n_embd_head_qk_rope, 1, n_tokens, // a single head (dim 1 == 1)
- kv_pe_compresseed->nb[1],
- kv_pe_compresseed->nb[1],
- ggml_row_size(kv_pe_compresseed->type, kv_lora_rank)); // starts right after the kv_lora_rank compressed values
- cb(k_pe, "k_pe", il);
- kv_compressed = build_norm(kv_compressed,
- model.layers[il].attn_kv_a_norm, NULL,
- LLM_NORM_RMS, il);
- cb(kv_compressed, "kv_compressed", il);
- // {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)} * {kv_lora_rank, n_tokens} -> {n_head * (n_embd_head_qk_nope + n_embd_head_v), n_tokens}
- ggml_tensor * kv = ggml_mul_mat(ctx0, model.layers[il].wkv_b, kv_compressed);
- cb(kv, "kv", il);
- // split into {n_head * n_embd_head_qk_nope, n_tokens}
- ggml_tensor * k_nope = ggml_view_3d(ctx0, kv, n_embd_head_qk_nope, n_head, n_tokens,
- ggml_row_size(kv->type, n_embd_head_qk_nope + hparams.n_embd_head_v),
- ggml_row_size(kv->type, n_head * (n_embd_head_qk_nope + hparams.n_embd_head_v)),
- 0);
- cb(k_nope, "k_nope", il);
- // and {n_head * n_embd_head_v, n_tokens}
- ggml_tensor * v_states = ggml_view_3d(ctx0, kv, hparams.n_embd_head_v, n_head, n_tokens,
- ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)),
- ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)*n_head),
- ggml_row_size(kv->type, (n_embd_head_qk_nope))); // starts right after the k_nope part of each head
- cb(v_states, "v_states", il);
- v_states = ggml_cont(ctx0, v_states); // contiguous copy of the strided view, taken before the 2D view below
- cb(v_states, "v_states", il);
- v_states = ggml_view_2d(ctx0, v_states, hparams.n_embd_head_v * n_head, n_tokens,
- ggml_row_size(kv->type, hparams.n_embd_head_v * n_head),
- 0);
- cb(v_states, "v_states", il);
- q_pe = ggml_rope_ext(
- ctx0, q_pe, inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
- cb(q_pe, "q_pe", il);
- // shared RoPE key
- k_pe = ggml_rope_ext(
- ctx0, k_pe, inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
- cb(k_pe, "k_pe", il);
- ggml_tensor * q_states = ggml_concat(ctx0, q_nope, q_pe, 0); // q = [q_nope, q_pe] along dim 0
- cb(q_states, "q_states", il);
- ggml_tensor * k_states = ggml_concat(ctx0, k_nope, ggml_repeat(ctx0, k_pe, q_pe), 0); // k_pe is repeated to q_pe's shape, then k = [k_nope, k_pe] along dim 0
- cb(k_states, "k_states", il);
- cur = build_attn(inp_attn, gf,
- model.layers[il].wo, NULL,
- q_states, k_states, v_states, nullptr, nullptr, kq_scale, il);
- }
- if (il == n_layer - 1) {
- // skip computing output for unused tokens
- ggml_tensor * inp_out_ids = build_inp_out_ids();
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
- inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
- }
- ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); // residual: attention output + layer input
- cb(ffn_inp, "ffn_inp", il);
- cur = build_norm(ffn_inp,
- model.layers[il].ffn_norm, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "ffn_norm", il);
- cur = build_ffn(cur,
- model.layers[il].ffn_up, NULL, NULL,
- NULL, NULL, NULL, // no gate weights are passed
- model.layers[il].ffn_down, NULL, NULL,
- NULL,
- LLM_FFN_RELU_SQR, LLM_FFN_SEQ, il);
- cb(cur, "ffn_out", il);
- cur = ggml_add(ctx0, cur, ffn_inp); // residual: FFN output + ffn_inp
- cur = build_cvec(cur, il);
- cb(cur, "l_out", il);
- // input for next layer
- inpL = cur;
- }
- cur = inpL;
- cur = build_norm(cur,
- model.output_norm, NULL,
- LLM_NORM_RMS, -1);
- cb(cur, "result_norm", -1);
- res->t_embd = cur; // the normalized hidden state is published as the embedding result
- cur = build_lora_mm(model.output, cur);
- cb(cur, "result_output", -1);
- res->t_logits = cur; // the output projection is published as the logits result
- ggml_build_forward_expand(gf, cur);
- }
- };
- struct llm_build_bailingmoe : public llm_graph_context {
- llm_build_bailingmoe(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
- ggml_tensor * cur;
- ggml_tensor * inpL;
- inpL = build_inp_embd(model.tok_embd);
- // inp_pos - contains the positions
- ggml_tensor * inp_pos = build_inp_pos();
- auto * inp_attn = build_attn_inp_kv_unified();
- for (int il = 0; il < n_layer; ++il) {
- ggml_tensor * inpSA = inpL;
- // norm
- cur = build_norm(inpL,
- model.layers[il].attn_norm, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "attn_norm", il);
- // self-attention
- {
- // rope freq factors for llama3; may return nullptr for llama2 and other models
- ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
- // compute Q and K and RoPE them
- ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
- cb(Qcur, "Qcur", il);
- if (model.layers[il].bq) {
- Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
- cb(Qcur, "Qcur", il);
- }
- ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
- cb(Kcur, "Kcur", il);
- if (model.layers[il].bk) {
- Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
- cb(Kcur, "Kcur", il);
- }
- ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
- cb(Vcur, "Vcur", il);
- if (model.layers[il].bv) {
- Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
- cb(Vcur, "Vcur", il);
- }
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_rot, n_head, n_tokens);
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_rot, n_head_kv, n_tokens);
- Vcur = ggml_reshape_3d(ctx0, Vcur, n_rot, n_head_kv, n_tokens);
- Qcur = ggml_rope_ext(
- ctx0, Qcur, inp_pos, rope_factors,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
- Kcur = ggml_rope_ext(
- ctx0, Kcur, inp_pos, rope_factors,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
- cb(Qcur, "Qcur", il);
- cb(Kcur, "Kcur", il);
- cb(Vcur, "Vcur", il);
- cur = build_attn(inp_attn, gf,
- model.layers[il].wo, model.layers[il].bo,
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_rot)), il);
- }
- if (il == n_layer - 1) {
- // skip computing output for unused tokens
- ggml_tensor * inp_out_ids = build_inp_out_ids();
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
- inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
- }
- ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
- cb(ffn_inp, "ffn_inp", il);
- cur = build_norm(ffn_inp,
- model.layers[il].ffn_norm, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "ffn_norm", il);
- ggml_tensor * moe_out =
- build_moe_ffn(cur,
- model.layers[il].ffn_gate_inp,
- model.layers[il].ffn_up_exps,
- model.layers[il].ffn_gate_exps,
- model.layers[il].ffn_down_exps,
- nullptr,
- n_expert, n_expert_used,
- LLM_FFN_SILU, hparams.expert_weights_norm,
- false, hparams.expert_weights_scale,
- LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
- il);
- cb(moe_out, "ffn_moe_out", il);
- // FFN shared expert
- {
- ggml_tensor * ffn_shexp = build_ffn(cur,
- model.layers[il].ffn_up_shexp, NULL, NULL,
- model.layers[il].ffn_gate_shexp, NULL, NULL,
- model.layers[il].ffn_down_shexp, NULL, NULL,
- NULL,
- LLM_FFN_SILU, LLM_FFN_PAR, il);
- cb(ffn_shexp, "ffn_shexp", il);
- cur = ggml_add(ctx0, moe_out, ffn_shexp);
- cb(cur, "ffn_out", il);
- }
- cur = ggml_add(ctx0, cur, ffn_inp);
- cur = build_cvec(cur, il);
- cb(cur, "l_out", il);
- // input for next layer
- inpL = cur;
- }
- cur = inpL;
- cur = build_norm(cur,
- model.output_norm, NULL,
- LLM_NORM_RMS, -1);
- cb(cur, "result_norm", -1);
- res->t_embd = cur;
- // lm_head
- cur = build_lora_mm(model.output, cur);
- cb(cur, "result_output", -1);
- res->t_logits = cur;
- ggml_build_forward_expand(gf, cur);
- }
- };
- llama_memory_i * llama_model::create_memory(const llama_memory_params & params, llama_cparams & cparams) const {
- llama_memory_i * res;
- switch (arch) {
- case LLM_ARCH_MAMBA:
- case LLM_ARCH_RWKV6:
- case LLM_ARCH_RWKV6QWEN2:
- case LLM_ARCH_RWKV7:
- case LLM_ARCH_ARWKV7:
- {
- res = new llama_kv_cache_recurrent(
- *this,
- GGML_TYPE_F32,
- GGML_TYPE_F32,
- cparams.offload_kqv,
- std::max((uint32_t) 1, cparams.n_seq_max));
- } break;
- default:
- {
- const auto padding = llama_kv_cache_unified::get_padding(cparams);
- cparams.n_ctx = GGML_PAD(cparams.n_ctx, padding);
- LLAMA_LOG_DEBUG("%s: n_ctx = %u (padded)\n", __func__, cparams.n_ctx);
- res = new llama_kv_cache_unified(
- *this,
- params.type_k,
- params.type_v,
- !cparams.flash_attn,
- cparams.offload_kqv,
- cparams.n_ctx,
- padding);
- }
- }
- return res;
- }
- llm_graph_result_ptr llama_model::build_graph(
- const llm_graph_params & params,
- ggml_cgraph * gf,
- llm_graph_type type) const {
- std::unique_ptr<llm_graph_context> llm;
- switch (arch) {
- case LLM_ARCH_LLAMA:
- case LLM_ARCH_LLAMA4:
- case LLM_ARCH_MINICPM:
- case LLM_ARCH_GRANITE:
- case LLM_ARCH_GRANITE_MOE:
- {
- llm = std::make_unique<llm_build_llama>(*this, params, gf);
- } break;
- case LLM_ARCH_DECI:
- {
- llm = std::make_unique<llm_build_deci>(*this, params, gf);
- } break;
- case LLM_ARCH_BAICHUAN:
- {
- llm = std::make_unique<llm_build_baichuan>(*this, params, gf);
- } break;
- case LLM_ARCH_FALCON:
- {
- llm = std::make_unique<llm_build_falcon>(*this, params, gf);
- } break;
- case LLM_ARCH_GROK:
- {
- llm = std::make_unique<llm_build_grok>(*this, params, gf);
- } break;
- case LLM_ARCH_STARCODER:
- {
- llm = std::make_unique<llm_build_starcoder>(*this, params, gf);
- } break;
- case LLM_ARCH_REFACT:
- {
- llm = std::make_unique<llm_build_refact>(*this, params, gf);
- } break;
- case LLM_ARCH_BERT:
- case LLM_ARCH_JINA_BERT_V2:
- case LLM_ARCH_NOMIC_BERT:
- case LLM_ARCH_NOMIC_BERT_MOE:
- {
- llm = std::make_unique<llm_build_bert>(*this, params, gf);
- } break;
- case LLM_ARCH_BLOOM:
- {
- llm = std::make_unique<llm_build_bloom>(*this, params, gf);
- } break;
- case LLM_ARCH_MPT:
- {
- llm = std::make_unique<llm_build_mpt>(*this, params, gf);
- } break;
- case LLM_ARCH_STABLELM:
- {
- llm = std::make_unique<llm_build_stablelm>(*this, params, gf);
- } break;
- case LLM_ARCH_QWEN:
- {
- llm = std::make_unique<llm_build_qwen>(*this, params, gf);
- } break;
- case LLM_ARCH_QWEN2:
- {
- llm = std::make_unique<llm_build_qwen2>(*this, params, gf);
- } break;
- case LLM_ARCH_QWEN2VL:
- {
- llm = std::make_unique<llm_build_qwen2vl>(*this, params, gf);
- } break;
- case LLM_ARCH_QWEN2MOE:
- {
- llm = std::make_unique<llm_build_qwen2moe>(*this, params, gf);
- } break;
- case LLM_ARCH_QWEN3:
- {
- llm = std::make_unique<llm_build_qwen3>(*this, params, gf);
- } break;
- case LLM_ARCH_QWEN3MOE:
- {
- llm = std::make_unique<llm_build_qwen3moe>(*this, params, gf);
- } break;
- case LLM_ARCH_PHI2:
- {
- llm = std::make_unique<llm_build_phi2>(*this, params, gf);
- } break;
- case LLM_ARCH_PHI3:
- case LLM_ARCH_PHIMOE:
- {
- llm = std::make_unique<llm_build_phi3>(*this, params, gf);
- } break;
- case LLM_ARCH_PLAMO:
- {
- llm = std::make_unique<llm_build_plamo>(*this, params, gf);
- } break;
- case LLM_ARCH_GPT2:
- {
- llm = std::make_unique<llm_build_gpt2>(*this, params, gf);
- } break;
- case LLM_ARCH_CODESHELL:
- {
- llm = std::make_unique<llm_build_codeshell>(*this, params, gf);
- } break;
- case LLM_ARCH_ORION:
- {
- llm = std::make_unique<llm_build_orion>(*this, params, gf);
- } break;
- case LLM_ARCH_INTERNLM2:
- {
- llm = std::make_unique<llm_build_internlm2>(*this, params, gf);
- } break;
- case LLM_ARCH_MINICPM3:
- {
- llm = std::make_unique<llm_build_minicpm3>(*this, params, gf);
- } break;
- case LLM_ARCH_GEMMA:
- {
- llm = std::make_unique<llm_build_gemma>(*this, params, gf);
- } break;
- case LLM_ARCH_GEMMA2:
- {
- llm = std::make_unique<llm_build_gemma2>(*this, params, gf);
- } break;
- case LLM_ARCH_GEMMA3:
- {
- llm = std::make_unique<llm_build_gemma3>(*this, params, gf);
- } break;
- case LLM_ARCH_STARCODER2:
- {
- llm = std::make_unique<llm_build_starcoder2>(*this, params, gf);
- } break;
- case LLM_ARCH_MAMBA:
- {
- llm = std::make_unique<llm_build_mamba>(*this, params, gf);
- } break;
- case LLM_ARCH_XVERSE:
- {
- llm = std::make_unique<llm_build_xverse>(*this, params, gf);
- } break;
- case LLM_ARCH_COMMAND_R:
- {
- llm = std::make_unique<llm_build_command_r>(*this, params, gf);
- } break;
- case LLM_ARCH_COHERE2:
- {
- llm = std::make_unique<llm_build_cohere2>(*this, params, gf);
- } break;
- case LLM_ARCH_DBRX:
- {
- llm = std::make_unique<llm_build_dbrx>(*this, params, gf);
- } break;
- case LLM_ARCH_OLMO:
- {
- llm = std::make_unique<llm_build_olmo>(*this, params, gf);
- } break;
- case LLM_ARCH_OLMO2:
- {
- llm = std::make_unique<llm_build_olmo2>(*this, params, gf);
- } break;
- case LLM_ARCH_OLMOE:
- {
- llm = std::make_unique<llm_build_olmoe>(*this, params, gf);
- } break;
- case LLM_ARCH_OPENELM:
- {
- llm = std::make_unique<llm_build_openelm>(*this, params, gf);
- } break;
- case LLM_ARCH_GPTNEOX:
- {
- llm = std::make_unique<llm_build_gptneox>(*this, params, gf);
- } break;
- case LLM_ARCH_ARCTIC:
- {
- llm = std::make_unique<llm_build_arctic>(*this, params, gf);
- } break;
- case LLM_ARCH_DEEPSEEK:
- {
- llm = std::make_unique<llm_build_deepseek>(*this, params, gf);
- } break;
- case LLM_ARCH_DEEPSEEK2:
- {
- llm = std::make_unique<llm_build_deepseek2>(*this, params, gf);
- } break;
- case LLM_ARCH_CHATGLM:
- {
- llm = std::make_unique<llm_build_chatglm>(*this, params, gf);
- } break;
- case LLM_ARCH_GLM4:
- {
- llm = std::make_unique<llm_build_glm4>(*this, params, gf);
- } break;
- case LLM_ARCH_BITNET:
- {
- llm = std::make_unique<llm_build_bitnet>(*this, params, gf);
- } break;
- case LLM_ARCH_T5:
- {
- switch (type) {
- case LLM_GRAPH_TYPE_ENCODER:
- llm = std::make_unique<llm_build_t5_enc>(*this, params, gf);
- break;
- case LLM_GRAPH_TYPE_DEFAULT:
- case LLM_GRAPH_TYPE_DECODER:
- llm = std::make_unique<llm_build_t5_dec>(*this, params, gf);
- break;
- default:
- GGML_ABORT("invalid graph type");
- };
- } break;
- case LLM_ARCH_T5ENCODER:
- {
- llm = std::make_unique<llm_build_t5_enc>(*this, params, gf);
- }
- break;
- case LLM_ARCH_JAIS:
- {
- llm = std::make_unique<llm_build_jais>(*this, params, gf);
- } break;
- case LLM_ARCH_NEMOTRON:
- {
- llm = std::make_unique<llm_build_nemotron>(*this, params, gf);
- } break;
- case LLM_ARCH_EXAONE:
- {
- llm = std::make_unique<llm_build_exaone>(*this, params, gf);
- } break;
- case LLM_ARCH_RWKV6:
- {
- llm = std::make_unique<llm_build_rwkv6>(*this, params, gf);
- } break;
- case LLM_ARCH_RWKV6QWEN2:
- {
- llm = std::make_unique<llm_build_rwkv6qwen2>(*this, params, gf);
- } break;
- case LLM_ARCH_RWKV7:
- {
- llm = std::make_unique<llm_build_rwkv7>(*this, params, gf);
- } break;
- case LLM_ARCH_ARWKV7:
- {
- llm = std::make_unique<llm_build_arwkv7>(*this, params, gf);
- } break;
- case LLM_ARCH_CHAMELEON:
- {
- llm = std::make_unique<llm_build_chameleon>(*this, params, gf);
- } break;
- case LLM_ARCH_WAVTOKENIZER_DEC:
- {
- llm = std::make_unique<llm_build_wavtokenizer_dec>(*this, params, gf);
- } break;
- case LLM_ARCH_PLM:
- {
- llm = std::make_unique<llm_build_plm>(*this, params, gf);
- } break;
- case LLM_ARCH_BAILINGMOE:
- {
- llm = std::make_unique<llm_build_bailingmoe>(*this, params, gf);
- } break;
- default:
- GGML_ABORT("fatal error");
- }
- // add on pooling layer
- llm->build_pooling(gf, cls, cls_b, cls_out, cls_out_b);
- return std::move(llm->res);
- }
- //
- // interface implementation
- //
- llama_model_params llama_model_default_params() {
- llama_model_params result = {
- /*.devices =*/ nullptr,
- /*.tensor_buft_overrides =*/ nullptr,
- /*.n_gpu_layers =*/ 0,
- /*.split_mode =*/ LLAMA_SPLIT_MODE_LAYER,
- /*.main_gpu =*/ 0,
- /*.tensor_split =*/ nullptr,
- /*.progress_callback =*/ nullptr,
- /*.progress_callback_user_data =*/ nullptr,
- /*.kv_overrides =*/ nullptr,
- /*.vocab_only =*/ false,
- /*.use_mmap =*/ true,
- /*.use_mlock =*/ false,
- /*.check_tensors =*/ false,
- };
- #ifdef GGML_USE_METAL
- // note: we usually have plenty of VRAM, so by default offload all layers to the GPU
- result.n_gpu_layers = 999;
- #endif
- return result;
- }
- const llama_vocab * llama_model_get_vocab(const llama_model * model) {
- return &model->vocab;
- }
- void llama_free_model(llama_model * model) {
- llama_model_free(model);
- }
- void llama_model_free(llama_model * model) {
- delete model;
- }
- int32_t llama_model_n_ctx_train(const llama_model * model) {
- return model->hparams.n_ctx_train;
- }
- int32_t llama_model_n_embd(const llama_model * model) {
- return model->hparams.n_embd;
- }
- int32_t llama_model_n_layer(const llama_model * model) {
- return model->hparams.n_layer;
- }
- int32_t llama_model_n_head(const llama_model * model) {
- return model->hparams.n_head();
- }
- int32_t llama_model_n_head_kv(const llama_model * model) {
- return model->hparams.n_head_kv();
- }
- // deprecated
- int32_t llama_n_ctx_train(const llama_model * model) {
- return llama_model_n_ctx_train(model);
- }
- // deprecated
- int32_t llama_n_embd(const llama_model * model) {
- return llama_model_n_embd(model);
- }
- // deprecated
- int32_t llama_n_layer(const llama_model * model) {
- return llama_model_n_layer(model);
- }
- // deprecated
- int32_t llama_n_head(const llama_model * model) {
- return llama_model_n_head(model);
- }
- llama_rope_type llama_model_rope_type(const llama_model * model) {
- switch (model->arch) {
- // these models do not use RoPE
- case LLM_ARCH_GPT2:
- case LLM_ARCH_GPTJ:
- case LLM_ARCH_MPT:
- case LLM_ARCH_REFACT:
- case LLM_ARCH_BLOOM:
- case LLM_ARCH_MAMBA:
- case LLM_ARCH_JINA_BERT_V2:
- case LLM_ARCH_T5:
- case LLM_ARCH_T5ENCODER:
- case LLM_ARCH_JAIS:
- case LLM_ARCH_RWKV6:
- case LLM_ARCH_RWKV6QWEN2:
- case LLM_ARCH_RWKV7:
- case LLM_ARCH_ARWKV7:
- case LLM_ARCH_WAVTOKENIZER_DEC:
- return LLAMA_ROPE_TYPE_NONE;
- // use what we call a normal RoPE, operating on pairs of consecutive head values
- case LLM_ARCH_LLAMA:
- case LLM_ARCH_LLAMA4:
- case LLM_ARCH_DECI:
- case LLM_ARCH_BAICHUAN:
- case LLM_ARCH_STARCODER:
- case LLM_ARCH_INTERNLM2:
- case LLM_ARCH_MINICPM:
- case LLM_ARCH_XVERSE:
- case LLM_ARCH_COMMAND_R:
- case LLM_ARCH_COHERE2:
- case LLM_ARCH_OLMO:
- case LLM_ARCH_ARCTIC:
- case LLM_ARCH_DEEPSEEK:
- case LLM_ARCH_DEEPSEEK2:
- case LLM_ARCH_PLM:
- case LLM_ARCH_CHATGLM:
- case LLM_ARCH_GLM4:
- case LLM_ARCH_GRANITE:
- case LLM_ARCH_GRANITE_MOE:
- case LLM_ARCH_CHAMELEON:
- case LLM_ARCH_BAILINGMOE:
- return LLAMA_ROPE_TYPE_NORM;
- // the pairs of head values are offset by n_rot/2
- case LLM_ARCH_FALCON:
- case LLM_ARCH_GROK:
- case LLM_ARCH_DBRX:
- case LLM_ARCH_BERT:
- case LLM_ARCH_NOMIC_BERT:
- case LLM_ARCH_NOMIC_BERT_MOE:
- case LLM_ARCH_STABLELM:
- case LLM_ARCH_BITNET:
- case LLM_ARCH_QWEN:
- case LLM_ARCH_QWEN2:
- case LLM_ARCH_QWEN2MOE:
- case LLM_ARCH_QWEN3:
- case LLM_ARCH_QWEN3MOE:
- case LLM_ARCH_OLMO2:
- case LLM_ARCH_OLMOE:
- case LLM_ARCH_PHI2:
- case LLM_ARCH_PHI3:
- case LLM_ARCH_PHIMOE:
- case LLM_ARCH_PLAMO:
- case LLM_ARCH_GEMMA:
- case LLM_ARCH_GEMMA2:
- case LLM_ARCH_GEMMA3:
- case LLM_ARCH_STARCODER2:
- case LLM_ARCH_OPENELM:
- case LLM_ARCH_GPTNEOX:
- case LLM_ARCH_CODESHELL:
- case LLM_ARCH_ORION:
- case LLM_ARCH_NEMOTRON:
- case LLM_ARCH_EXAONE:
- case LLM_ARCH_MINICPM3:
- return LLAMA_ROPE_TYPE_NEOX;
- case LLM_ARCH_QWEN2VL:
- return LLAMA_ROPE_TYPE_MROPE;
- // all model arches should be listed explicitly here
- case LLM_ARCH_UNKNOWN:
- GGML_ABORT("unknown architecture");
- }
- return LLAMA_ROPE_TYPE_NONE;
- }
- float llama_model_rope_freq_scale_train(const llama_model * model) {
- return model->hparams.rope_freq_scale_train;
- }
- int32_t llama_model_meta_val_str(const llama_model * model, const char * key, char * buf, size_t buf_size) {
- const auto & it = model->gguf_kv.find(key);
- if (it == model->gguf_kv.end()) {
- if (buf_size > 0) {
- buf[0] = '\0';
- }
- return -1;
- }
- return snprintf(buf, buf_size, "%s", it->second.c_str());
- }
- int32_t llama_model_meta_count(const llama_model * model) {
- return (int)model->gguf_kv.size();
- }
- int32_t llama_model_meta_key_by_index(const llama_model * model, int i, char * buf, size_t buf_size) {
- if (i < 0 || i >= (int)model->gguf_kv.size()) {
- if (buf_size > 0) {
- buf[0] = '\0';
- }
- return -1;
- }
- auto it = model->gguf_kv.begin();
- std::advance(it, i);
- return snprintf(buf, buf_size, "%s", it->first.c_str());
- }
- int32_t llama_model_meta_val_str_by_index(const llama_model * model, int32_t i, char * buf, size_t buf_size) {
- if (i < 0 || i >= (int)model->gguf_kv.size()) {
- if (buf_size > 0) {
- buf[0] = '\0';
- }
- return -1;
- }
- auto it = model->gguf_kv.begin();
- std::advance(it, i);
- return snprintf(buf, buf_size, "%s", it->second.c_str());
- }
- int32_t llama_model_desc(const llama_model * model, char * buf, size_t buf_size) {
- return snprintf(buf, buf_size, "%s", model->desc().c_str());
- }
- uint64_t llama_model_size(const llama_model * model) {
- return model->size();
- }
- const char * llama_model_chat_template(const llama_model * model, const char * name) {
- const auto key = name ? LLM_KV(model->arch, name)(LLM_KV_TOKENIZER_CHAT_TEMPLATE_N)
- : LLM_KV(model->arch)(LLM_KV_TOKENIZER_CHAT_TEMPLATE);
- const auto & it = model->gguf_kv.find(key);
- if (it == model->gguf_kv.end()) {
- return nullptr;
- }
- return it->second.c_str();
- }
- uint64_t llama_model_n_params(const llama_model * model) {
- return model->n_elements();
- }
- bool llama_model_has_encoder(const llama_model * model) {
- switch (model->arch) {
- case LLM_ARCH_T5: return true;
- case LLM_ARCH_T5ENCODER: return true;
- default: return false;
- }
- }
- bool llama_model_has_decoder(const llama_model * model) {
- switch (model->arch) {
- case LLM_ARCH_T5ENCODER: return false;
- default: return true;
- }
- }
- llama_token llama_model_decoder_start_token(const llama_model * model) {
- return model->hparams.dec_start_token_id;
- }
- bool llama_model_is_recurrent(const llama_model * model) {
- switch (model->arch) {
- case LLM_ARCH_MAMBA: return true;
- case LLM_ARCH_RWKV6: return true;
- case LLM_ARCH_RWKV6QWEN2: return true;
- case LLM_ARCH_RWKV7: return true;
- case LLM_ARCH_ARWKV7: return true;
- default: return false;
- }
- }
- const std::vector<std::pair<std::string, ggml_tensor *>> & llama_internal_get_tensor_map(const llama_model * model) {
- return model->tensors_by_name;
- }
|