// llama.cpp

#define LLAMA_API_INTERNAL
//#define LLAMA_GGML_BACKEND_CUDA_TEST // for testing only - enables ggml-cuda through ggml-backend, disables partial offloading

#include "llama.h"

#include "unicode.h"

#include "ggml.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"

#ifdef GGML_USE_CUBLAS
#  include "ggml-cuda.h"
#elif defined(GGML_USE_CLBLAST)
#  include "ggml-opencl.h"
#endif

#ifdef GGML_USE_METAL
#  include "ggml-metal.h"
#endif
#ifdef GGML_USE_MPI
#  include "ggml-mpi.h"
#endif

#ifndef QK_K
#  ifdef GGML_QKK_64
#    define QK_K 64
#  else
#    define QK_K 256
#  endif
#endif

#ifdef __has_include
    #if __has_include(<unistd.h>)
        #include <unistd.h>
        #if defined(_POSIX_MAPPED_FILES)
            #include <sys/mman.h>
            #include <fcntl.h>
        #endif
        #if defined(_POSIX_MEMLOCK_RANGE)
            #include <sys/resource.h>
        #endif
    #endif
#endif

#if defined(_WIN32)
    #define WIN32_LEAN_AND_MEAN
    #ifndef NOMINMAX
        #define NOMINMAX
    #endif
    #include <windows.h>
    #include <io.h>
#endif

#include <algorithm>
#include <array>
#include <cassert>
#include <cinttypes>
#include <climits>
#include <cmath>
#include <cstdarg>
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <ctime>
#include <forward_list>
#include <fstream>
#include <functional>
#include <initializer_list>
#include <map>
#include <memory>
#include <mutex>
#include <numeric>
#include <queue>
#include <random>
#include <regex>
#include <set>
#include <sstream>
#include <thread>
#include <type_traits>
#include <unordered_map>

#if defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data
#endif

#ifdef __GNUC__
#ifdef __MINGW32__
#define LLAMA_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
#else
#define LLAMA_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
#endif
#else
#define LLAMA_ATTRIBUTE_FORMAT(...)
#endif

#define LLAMA_MAX_NODES   8192
#define LLAMA_MAX_EXPERTS 8

//
// logging
//

LLAMA_ATTRIBUTE_FORMAT(2, 3)
static void llama_log_internal        (ggml_log_level level, const char * format, ...);
static void llama_log_callback_default(ggml_log_level level, const char * text, void * user_data);

#define LLAMA_LOG_INFO(...)  llama_log_internal(GGML_LOG_LEVEL_INFO , __VA_ARGS__)
#define LLAMA_LOG_WARN(...)  llama_log_internal(GGML_LOG_LEVEL_WARN , __VA_ARGS__)
#define LLAMA_LOG_ERROR(...) llama_log_internal(GGML_LOG_LEVEL_ERROR, __VA_ARGS__)

//
// helpers
//

static size_t utf8_len(char src) {
    const size_t lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 };
    uint8_t highbits = static_cast<uint8_t>(src) >> 4;
    return lookup[highbits];
}
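// Illustrative note (a sketch, not from the original source): the table is
// indexed by the high nibble of the lead byte, so utf8_len('a') == 1, while a
// 3-byte lead such as 0xE2 (the first byte of U+20AC) yields 3; continuation
// bytes (0x80..0xBF) fall back to 1.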
static void replace_all(std::string & s, const std::string & search, const std::string & replace) {
    std::string result;
    for (size_t pos = 0; ; pos += search.length()) {
        auto new_pos = s.find(search, pos);
        if (new_pos == std::string::npos) {
            result += s.substr(pos, s.size() - pos);
            break;
        }
        result += s.substr(pos, new_pos - pos) + replace;
        pos = new_pos;
    }
    s = std::move(result);
}
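// Usage sketch (illustrative): given std::string s = "a.b.c",
// replace_all(s, ".", "::") leaves s == "a::b::c"; if the search string is
// absent, s is rebuilt unchanged.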
static bool is_float_close(float a, float b, float abs_tol) {
    // Check for non-negative tolerance
    if (abs_tol < 0.0) {
        throw std::invalid_argument("Tolerance must be non-negative");
    }

    // Exact equality check
    if (a == b) {
        return true;
    }

    // Check for infinities
    if (std::isinf(a) || std::isinf(b)) {
        return false;
    }

    // Regular comparison using the provided absolute tolerance
    return std::fabs(b - a) <= abs_tol;
}
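// Behavior sketch (illustrative): is_float_close(1.0f, 1.0f + 1e-7f, 1e-5f)
// is true; a comparison against INFINITY is false unless both sides are the
// same infinity (caught by the a == b check); a negative tolerance throws.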
#ifdef GGML_USE_CPU_HBM
#include <hbwmalloc.h>
#endif

static void zeros(std::ofstream & file, size_t n) {
    char zero = 0;
    for (size_t i = 0; i < n; ++i) {
        file.write(&zero, 1);
    }
}

LLAMA_ATTRIBUTE_FORMAT(1, 2)
static std::string format(const char * fmt, ...) {
    va_list ap;
    va_list ap2;
    va_start(ap, fmt);
    va_copy(ap2, ap);
    int size = vsnprintf(NULL, 0, fmt, ap);
    GGML_ASSERT(size >= 0 && size < INT_MAX); // NOLINT
    std::vector<char> buf(size + 1);
    int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2);
    GGML_ASSERT(size2 == size);
    va_end(ap2);
    va_end(ap);
    return std::string(buf.data(), size);
}
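// Usage sketch (illustrative): format("%s: n_ctx = %d", "llama", 4096) returns
// the std::string "llama: n_ctx = 4096"; the first vsnprintf pass sizes the
// buffer and the second pass writes into it.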
//
// gguf constants (sync with gguf.py)
//

enum llm_arch {
    LLM_ARCH_LLAMA,
    LLM_ARCH_FALCON,
    LLM_ARCH_BAICHUAN,
    LLM_ARCH_GPT2,
    LLM_ARCH_GPTJ,
    LLM_ARCH_GPTNEOX,
    LLM_ARCH_MPT,
    LLM_ARCH_STARCODER,
    LLM_ARCH_PERSIMMON,
    LLM_ARCH_REFACT,
    LLM_ARCH_BLOOM,
    LLM_ARCH_STABLELM,
    LLM_ARCH_QWEN,
    LLM_ARCH_PHI2,
    LLM_ARCH_PLAMO,
    LLM_ARCH_UNKNOWN,
};

static std::map<llm_arch, std::string> LLM_ARCH_NAMES = {
    { LLM_ARCH_LLAMA,     "llama"     },
    { LLM_ARCH_FALCON,    "falcon"    },
    { LLM_ARCH_GPT2,      "gpt2"      },
    { LLM_ARCH_GPTJ,      "gptj"      },
    { LLM_ARCH_GPTNEOX,   "gptneox"   },
    { LLM_ARCH_MPT,       "mpt"       },
    { LLM_ARCH_BAICHUAN,  "baichuan"  },
    { LLM_ARCH_STARCODER, "starcoder" },
    { LLM_ARCH_PERSIMMON, "persimmon" },
    { LLM_ARCH_REFACT,    "refact"    },
    { LLM_ARCH_BLOOM,     "bloom"     },
    { LLM_ARCH_STABLELM,  "stablelm"  },
    { LLM_ARCH_QWEN,      "qwen"      },
    { LLM_ARCH_PHI2,      "phi2"      },
    { LLM_ARCH_PLAMO,     "plamo"     },
};

enum llm_kv {
    LLM_KV_GENERAL_ARCHITECTURE,
    LLM_KV_GENERAL_QUANTIZATION_VERSION,
    LLM_KV_GENERAL_ALIGNMENT,
    LLM_KV_GENERAL_NAME,
    LLM_KV_GENERAL_AUTHOR,
    LLM_KV_GENERAL_URL,
    LLM_KV_GENERAL_DESCRIPTION,
    LLM_KV_GENERAL_LICENSE,
    LLM_KV_GENERAL_SOURCE_URL,
    LLM_KV_GENERAL_SOURCE_HF_REPO,

    LLM_KV_CONTEXT_LENGTH,
    LLM_KV_EMBEDDING_LENGTH,
    LLM_KV_BLOCK_COUNT,
    LLM_KV_FEED_FORWARD_LENGTH,
    LLM_KV_USE_PARALLEL_RESIDUAL,
    LLM_KV_TENSOR_DATA_LAYOUT,
    LLM_KV_EXPERT_COUNT,
    LLM_KV_EXPERT_USED_COUNT,

    LLM_KV_ATTENTION_HEAD_COUNT,
    LLM_KV_ATTENTION_HEAD_COUNT_KV,
    LLM_KV_ATTENTION_MAX_ALIBI_BIAS,
    LLM_KV_ATTENTION_CLAMP_KQV,
    LLM_KV_ATTENTION_KEY_LENGTH,
    LLM_KV_ATTENTION_VALUE_LENGTH,
    LLM_KV_ATTENTION_LAYERNORM_EPS,
    LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,

    LLM_KV_ROPE_DIMENSION_COUNT,
    LLM_KV_ROPE_FREQ_BASE,
    LLM_KV_ROPE_SCALE_LINEAR,
    LLM_KV_ROPE_SCALING_TYPE,
    LLM_KV_ROPE_SCALING_FACTOR,
    LLM_KV_ROPE_SCALING_ORIG_CTX_LEN,
    LLM_KV_ROPE_SCALING_FINETUNED,

    LLM_KV_TOKENIZER_MODEL,
    LLM_KV_TOKENIZER_LIST,
    LLM_KV_TOKENIZER_TOKEN_TYPE,
    LLM_KV_TOKENIZER_SCORES,
    LLM_KV_TOKENIZER_MERGES,
    LLM_KV_TOKENIZER_BOS_ID,
    LLM_KV_TOKENIZER_EOS_ID,
    LLM_KV_TOKENIZER_UNK_ID,
    LLM_KV_TOKENIZER_SEP_ID,
    LLM_KV_TOKENIZER_PAD_ID,
    LLM_KV_TOKENIZER_ADD_BOS,
    LLM_KV_TOKENIZER_ADD_EOS,
    LLM_KV_TOKENIZER_HF_JSON,
    LLM_KV_TOKENIZER_RWKV,
};

static std::map<llm_kv, std::string> LLM_KV_NAMES = {
    { LLM_KV_GENERAL_ARCHITECTURE,          "general.architecture"                  },
    { LLM_KV_GENERAL_QUANTIZATION_VERSION,  "general.quantization_version"          },
    { LLM_KV_GENERAL_ALIGNMENT,             "general.alignment"                     },
    { LLM_KV_GENERAL_NAME,                  "general.name"                          },
    { LLM_KV_GENERAL_AUTHOR,                "general.author"                        },
    { LLM_KV_GENERAL_URL,                   "general.url"                           },
    { LLM_KV_GENERAL_DESCRIPTION,           "general.description"                   },
    { LLM_KV_GENERAL_LICENSE,               "general.license"                       },
    { LLM_KV_GENERAL_SOURCE_URL,            "general.source.url"                    },
    { LLM_KV_GENERAL_SOURCE_HF_REPO,        "general.source.huggingface.repository" },

    { LLM_KV_CONTEXT_LENGTH,                "%s.context_length"        },
    { LLM_KV_EMBEDDING_LENGTH,              "%s.embedding_length"      },
    { LLM_KV_BLOCK_COUNT,                   "%s.block_count"           },
    { LLM_KV_FEED_FORWARD_LENGTH,           "%s.feed_forward_length"   },
    { LLM_KV_USE_PARALLEL_RESIDUAL,         "%s.use_parallel_residual" },
    { LLM_KV_TENSOR_DATA_LAYOUT,            "%s.tensor_data_layout"    },
    { LLM_KV_EXPERT_COUNT,                  "%s.expert_count"          },
    { LLM_KV_EXPERT_USED_COUNT,             "%s.expert_used_count"     },

    { LLM_KV_ATTENTION_HEAD_COUNT,          "%s.attention.head_count"             },
    { LLM_KV_ATTENTION_HEAD_COUNT_KV,       "%s.attention.head_count_kv"          },
    { LLM_KV_ATTENTION_MAX_ALIBI_BIAS,      "%s.attention.max_alibi_bias"         },
    { LLM_KV_ATTENTION_CLAMP_KQV,           "%s.attention.clamp_kqv"              },
    { LLM_KV_ATTENTION_KEY_LENGTH,          "%s.attention.key_length"             },
    { LLM_KV_ATTENTION_VALUE_LENGTH,        "%s.attention.value_length"           },
    { LLM_KV_ATTENTION_LAYERNORM_EPS,       "%s.attention.layer_norm_epsilon"     },
    { LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,   "%s.attention.layer_norm_rms_epsilon" },

    { LLM_KV_ROPE_DIMENSION_COUNT,          "%s.rope.dimension_count"                 },
    { LLM_KV_ROPE_FREQ_BASE,                "%s.rope.freq_base"                       },
    { LLM_KV_ROPE_SCALE_LINEAR,             "%s.rope.scale_linear"                    },
    { LLM_KV_ROPE_SCALING_TYPE,             "%s.rope.scaling.type"                    },
    { LLM_KV_ROPE_SCALING_FACTOR,           "%s.rope.scaling.factor"                  },
    { LLM_KV_ROPE_SCALING_ORIG_CTX_LEN,     "%s.rope.scaling.original_context_length" },
    { LLM_KV_ROPE_SCALING_FINETUNED,        "%s.rope.scaling.finetuned"               },

    { LLM_KV_TOKENIZER_MODEL,               "tokenizer.ggml.model"              },
    { LLM_KV_TOKENIZER_LIST,                "tokenizer.ggml.tokens"             },
    { LLM_KV_TOKENIZER_TOKEN_TYPE,          "tokenizer.ggml.token_type"         },
    { LLM_KV_TOKENIZER_SCORES,              "tokenizer.ggml.scores"             },
    { LLM_KV_TOKENIZER_MERGES,              "tokenizer.ggml.merges"             },
    { LLM_KV_TOKENIZER_BOS_ID,              "tokenizer.ggml.bos_token_id"       },
    { LLM_KV_TOKENIZER_EOS_ID,              "tokenizer.ggml.eos_token_id"       },
    { LLM_KV_TOKENIZER_UNK_ID,              "tokenizer.ggml.unknown_token_id"   },
    { LLM_KV_TOKENIZER_SEP_ID,              "tokenizer.ggml.seperator_token_id" },
    { LLM_KV_TOKENIZER_PAD_ID,              "tokenizer.ggml.padding_token_id"   },
    { LLM_KV_TOKENIZER_ADD_BOS,             "tokenizer.ggml.add_bos_token"      },
    { LLM_KV_TOKENIZER_ADD_EOS,             "tokenizer.ggml.add_eos_token"      },
    { LLM_KV_TOKENIZER_HF_JSON,             "tokenizer.huggingface.json"        },
    { LLM_KV_TOKENIZER_RWKV,                "tokenizer.rwkv.world"              },
};

struct LLM_KV {
    LLM_KV(llm_arch arch) : arch(arch) {}

    llm_arch arch;

    std::string operator()(llm_kv kv) const {
        return ::format(LLM_KV_NAMES[kv].c_str(), LLM_ARCH_NAMES[arch].c_str());
    }
};
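// Usage sketch (illustrative): the functor splices the architecture name into
// the per-architecture key templates above, e.g.
//   LLM_KV kv(LLM_ARCH_LLAMA);
//   kv(LLM_KV_CONTEXT_LENGTH);   // -> "llama.context_length"
//   kv(LLM_KV_TOKENIZER_BOS_ID); // -> "tokenizer.ggml.bos_token_id" (no "%s" to fill)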
  301. enum llm_tensor {
  302. LLM_TENSOR_TOKEN_EMBD,
  303. LLM_TENSOR_TOKEN_EMBD_NORM,
  304. LLM_TENSOR_POS_EMBD,
  305. LLM_TENSOR_OUTPUT,
  306. LLM_TENSOR_OUTPUT_NORM,
  307. LLM_TENSOR_ROPE_FREQS,
  308. LLM_TENSOR_ATTN_Q,
  309. LLM_TENSOR_ATTN_K,
  310. LLM_TENSOR_ATTN_V,
  311. LLM_TENSOR_ATTN_QKV,
  312. LLM_TENSOR_ATTN_OUT,
  313. LLM_TENSOR_ATTN_NORM,
  314. LLM_TENSOR_ATTN_NORM_2,
  315. LLM_TENSOR_ATTN_ROT_EMBD,
  316. LLM_TENSOR_FFN_GATE_INP,
  317. LLM_TENSOR_FFN_NORM,
  318. LLM_TENSOR_FFN_GATE,
  319. LLM_TENSOR_FFN_DOWN,
  320. LLM_TENSOR_FFN_UP,
  321. LLM_TENSOR_FFN_ACT,
  322. LLM_TENSOR_FFN_DOWN_EXP,
  323. LLM_TENSOR_FFN_GATE_EXP,
  324. LLM_TENSOR_FFN_UP_EXP,
  325. LLM_TENSOR_ATTN_Q_NORM,
  326. LLM_TENSOR_ATTN_K_NORM,
  327. };
  328. static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES = {
  329. {
  330. LLM_ARCH_LLAMA,
  331. {
  332. { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
  333. { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
  334. { LLM_TENSOR_OUTPUT, "output" },
  335. { LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
  336. { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
  337. { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
  338. { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
  339. { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
  340. { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
  341. { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
  342. { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
  343. { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
  344. { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
  345. { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
  346. { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
  347. { LLM_TENSOR_FFN_GATE_EXP, "blk.%d.ffn_gate.%d" },
  348. { LLM_TENSOR_FFN_DOWN_EXP, "blk.%d.ffn_down.%d" },
  349. { LLM_TENSOR_FFN_UP_EXP, "blk.%d.ffn_up.%d" },
  350. },
  351. },
  352. {
  353. LLM_ARCH_BAICHUAN,
  354. {
  355. { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
  356. { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
  357. { LLM_TENSOR_OUTPUT, "output" },
  358. { LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
  359. { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
  360. { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
  361. { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
  362. { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
  363. { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
  364. { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
  365. { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
  366. { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
  367. { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
  368. { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
  369. },
  370. },
  371. {
  372. LLM_ARCH_FALCON,
  373. {
  374. { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
  375. { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
  376. { LLM_TENSOR_OUTPUT, "output" },
  377. { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
  378. { LLM_TENSOR_ATTN_NORM_2, "blk.%d.attn_norm_2" },
  379. { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
  380. { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
  381. { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
  382. { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
  383. },
  384. },
  385. {
  386. LLM_ARCH_GPT2,
  387. {
  388. { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
  389. { LLM_TENSOR_POS_EMBD, "position_embd" },
  390. { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
  391. { LLM_TENSOR_OUTPUT, "output" },
  392. { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
  393. { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
  394. { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
  395. { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
  396. { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
  397. { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
  398. },
  399. },
  400. {
  401. LLM_ARCH_GPTJ,
  402. {
  403. { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
  404. },
  405. },
  406. {
  407. LLM_ARCH_GPTNEOX,
  408. {
  409. { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
  410. { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
  411. { LLM_TENSOR_OUTPUT, "output" },
  412. { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
  413. { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
  414. { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
  415. { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
  416. { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
  417. { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
  418. },
  419. },
  420. {
  421. LLM_ARCH_PERSIMMON,
  422. {
423. { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
424. { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
425. { LLM_TENSOR_OUTPUT, "output" },
426. { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
427. { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
428. { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
429. { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
430. { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
431. { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
432. { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
433. { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
434. { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
  435. },
  436. },
  437. {
  438. LLM_ARCH_MPT,
  439. {
  440. { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
  441. { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
  442. { LLM_TENSOR_OUTPUT, "output" },
  443. { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
  444. { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
  445. { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
  446. { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
  447. { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
  448. { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
  449. { LLM_TENSOR_FFN_ACT, "blk.%d.ffn.act" },
  450. },
  451. },
  452. {
  453. LLM_ARCH_STARCODER,
  454. {
  455. { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
  456. { LLM_TENSOR_POS_EMBD, "position_embd" },
  457. { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
  458. { LLM_TENSOR_OUTPUT, "output" },
  459. { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
  460. { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
  461. { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
  462. { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
  463. { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
  464. { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
  465. },
  466. },
  467. {
  468. LLM_ARCH_REFACT,
  469. {
  470. { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
  471. { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
  472. { LLM_TENSOR_OUTPUT, "output" },
  473. { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
  474. { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
  475. { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
  476. { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
  477. { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
  478. { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
  479. { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
  480. { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
  481. { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
  482. },
  483. },
  484. {
  485. LLM_ARCH_BLOOM,
  486. {
  487. { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
  488. { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
  489. { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
  490. { LLM_TENSOR_OUTPUT, "output" },
  491. { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
  492. { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
  493. { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
  494. { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
  495. { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
  496. { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
  497. },
  498. },
  499. {
  500. LLM_ARCH_STABLELM,
  501. {
  502. { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
  503. { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
  504. { LLM_TENSOR_OUTPUT, "output" },
  505. { LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
  506. { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
  507. { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
  508. { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
  509. { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
  510. { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
  511. { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
  512. { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
  513. { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
  514. { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
  515. },
  516. },
  517. {
  518. LLM_ARCH_QWEN,
  519. {
  520. { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
  521. { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
  522. { LLM_TENSOR_OUTPUT, "output" },
  523. { LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
  524. { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
  525. { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
  526. { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
  527. { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
  528. { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
  529. { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
  530. { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
  531. },
  532. },
  533. {
  534. LLM_ARCH_PHI2,
  535. {
  536. { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
  537. { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
  538. { LLM_TENSOR_OUTPUT, "output" },
  539. { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
  540. { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
  541. { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
  542. { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
  543. { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
  544. },
  545. },
  546. {
  547. LLM_ARCH_PLAMO,
  548. {
  549. { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
  550. { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
  551. { LLM_TENSOR_OUTPUT, "output" },
  552. { LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
  553. { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
  554. { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
  555. { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
  556. { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
  557. { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
  558. { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
  559. { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
  560. { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
  561. { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
  562. },
  563. },
  564. {
  565. LLM_ARCH_UNKNOWN,
  566. {
  567. { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
  568. },
  569. },
  570. };
  571. static llm_arch llm_arch_from_string(const std::string & name) {
  572. for (const auto & kv : LLM_ARCH_NAMES) { // NOLINT
  573. if (kv.second == name) {
  574. return kv.first;
  575. }
  576. }
  577. return LLM_ARCH_UNKNOWN;
  578. }
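// e.g. llm_arch_from_string("llama") yields LLM_ARCH_LLAMA (assuming the name strings defined
// in LLM_ARCH_NAMES above); unrecognized names fall back to LLM_ARCH_UNKNOWN instead of throwing.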
579. // helper to handle the gguf tensor name constants
  580. // usage:
  581. //
  582. // const auto tn = LLM_TN(LLM_ARCH_LLAMA);
  583. //
  584. // std::string name = tn(LLM_TENSOR_OUTPUT); -> "output"
  585. // std::string name = tn(LLM_TENSOR_TOKEN_EMBD, "bias"); -> "token_embd.bias"
  586. // std::string name = tn(LLM_TENSOR_ATTN_NORM, "weight", 3); -> "blk.3.attn_norm.weight"
  587. //
  588. struct LLM_TN {
  589. LLM_TN(llm_arch arch) : arch(arch) {}
  590. llm_arch arch;
  591. std::string operator()(llm_tensor tensor) const {
  592. return LLM_TENSOR_NAMES[arch].at(tensor);
  593. }
  594. std::string operator()(llm_tensor tensor, const std::string & suffix) const {
  595. return LLM_TENSOR_NAMES[arch].at(tensor) + "." + suffix;
  596. }
  597. std::string operator()(llm_tensor tensor, int bid) const {
  598. return ::format(LLM_TENSOR_NAMES[arch].at(tensor).c_str(), bid);
  599. }
  600. std::string operator()(llm_tensor tensor, const std::string & suffix, int bid) const {
  601. return ::format(LLM_TENSOR_NAMES[arch].at(tensor).c_str(), bid) + "." + suffix;
  602. }
  603. std::string operator()(llm_tensor tensor, const std::string & suffix, int bid, int xid) const {
  604. return ::format(LLM_TENSOR_NAMES[arch].at(tensor).c_str(), bid, xid) + "." + suffix;
  605. }
  606. };
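// the two-index overload above serves the per-expert MoE tensors, e.g. (with the same tn as in
// the usage comment):
//
//   std::string name = tn(LLM_TENSOR_FFN_GATE_EXP, "weight", 3, 1); // -> "blk.3.ffn_gate.1.weight"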
  607. //
  608. // gguf helpers
  609. //
  610. static std::map<int8_t, std::string> LLAMA_ROPE_SCALING_TYPES = {
  611. { LLAMA_ROPE_SCALING_NONE, "none" },
  612. { LLAMA_ROPE_SCALING_LINEAR, "linear" },
  613. { LLAMA_ROPE_SCALING_YARN, "yarn" },
  614. };
  615. static int8_t llama_rope_scaling_type_from_string(const std::string & name) {
  616. for (const auto & kv : LLAMA_ROPE_SCALING_TYPES) {
  617. if (kv.second == name) {
  618. return kv.first;
  619. }
  620. }
  621. return LLAMA_ROPE_SCALING_UNSPECIFIED;
  622. }
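// e.g. llama_rope_scaling_type_from_string("yarn") returns LLAMA_ROPE_SCALING_YARN; any other
// string maps to LLAMA_ROPE_SCALING_UNSPECIFIED, which is distinct from LLAMA_ROPE_SCALING_NONE.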
  623. static std::string gguf_data_to_str(enum gguf_type type, const void * data, int i) {
  624. switch (type) {
  625. case GGUF_TYPE_UINT8: return std::to_string(((const uint8_t *)data)[i]);
  626. case GGUF_TYPE_INT8: return std::to_string(((const int8_t *)data)[i]);
  627. case GGUF_TYPE_UINT16: return std::to_string(((const uint16_t *)data)[i]);
  628. case GGUF_TYPE_INT16: return std::to_string(((const int16_t *)data)[i]);
  629. case GGUF_TYPE_UINT32: return std::to_string(((const uint32_t *)data)[i]);
  630. case GGUF_TYPE_INT32: return std::to_string(((const int32_t *)data)[i]);
  631. case GGUF_TYPE_UINT64: return std::to_string(((const uint64_t *)data)[i]);
  632. case GGUF_TYPE_INT64: return std::to_string(((const int64_t *)data)[i]);
  633. case GGUF_TYPE_FLOAT32: return std::to_string(((const float *)data)[i]);
  634. case GGUF_TYPE_FLOAT64: return std::to_string(((const double *)data)[i]);
  635. case GGUF_TYPE_BOOL: return ((const bool *)data)[i] ? "true" : "false";
  636. default: return format("unknown type %d", type);
  637. }
  638. }
  639. static std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i) {
  640. const enum gguf_type type = gguf_get_kv_type(ctx_gguf, i);
  641. switch (type) {
  642. case GGUF_TYPE_STRING:
  643. return gguf_get_val_str(ctx_gguf, i);
  644. case GGUF_TYPE_ARRAY:
  645. {
  646. const enum gguf_type arr_type = gguf_get_arr_type(ctx_gguf, i);
  647. int arr_n = gguf_get_arr_n(ctx_gguf, i);
  648. const void * data = gguf_get_arr_data(ctx_gguf, i);
  649. std::stringstream ss;
  650. ss << "[";
  651. for (int j = 0; j < arr_n; j++) {
  652. if (arr_type == GGUF_TYPE_STRING) {
  653. std::string val = gguf_get_arr_str(ctx_gguf, i, j);
  654. // escape quotes
  655. replace_all(val, "\\", "\\\\");
  656. replace_all(val, "\"", "\\\"");
  657. ss << '"' << val << '"';
  658. } else if (arr_type == GGUF_TYPE_ARRAY) {
  659. ss << "???";
  660. } else {
  661. ss << gguf_data_to_str(arr_type, data, j);
  662. }
  663. if (j < arr_n - 1) {
  664. ss << ", ";
  665. }
  666. }
  667. ss << "]";
  668. return ss.str();
  669. }
  670. default:
  671. return gguf_data_to_str(type, gguf_get_val_data(ctx_gguf, i), 0);
  672. }
  673. }
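// output sketch: scalars go through gguf_data_to_str, strings are returned verbatim, and arrays
// render as e.g. ["a", "b"] or [1, 2, 3]; nested arrays are not expanded and appear as "???".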
  674. //
  675. // ggml helpers
  676. //
  677. static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
  678. struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
  679. if (plan.work_size > 0) {
  680. buf.resize(plan.work_size);
  681. plan.work_data = buf.data();
  682. }
  683. ggml_graph_compute(graph, &plan);
  684. }
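// note: the caller owns buf, so the scratch ("work") allocation required by ggml_graph_plan can
// be reused across graph evaluations instead of being allocated on every call.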
  685. //
  686. // llama helpers
  687. //
  688. #if defined(_WIN32)
  689. static std::string llama_format_win_err(DWORD err) {
  690. LPSTR buf;
  691. size_t size = FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS,
  692. NULL, err, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), (LPSTR)&buf, 0, NULL);
  693. if (!size) {
  694. return "FormatMessageA failed";
  695. }
  696. std::string ret(buf, size);
  697. LocalFree(buf);
  698. return ret;
  699. }
  700. #endif
  701. template <typename T>
  702. struct no_init {
  703. T value;
  704. no_init() { /* do nothing */ }
  705. };
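// no_init<T> deliberately skips value-initialization so that containers such as
// std::vector<no_init<uint8_t>> can be resized without zero-filling large buffers
// (see buf_copy in llama_context further below).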
  706. struct llama_file {
  707. // use FILE * so we don't have to re-open the file to mmap
  708. FILE * fp;
  709. size_t size;
  710. llama_file(const char * fname, const char * mode) {
  711. fp = std::fopen(fname, mode);
  712. if (fp == NULL) {
  713. throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno)));
  714. }
  715. seek(0, SEEK_END);
  716. size = tell();
  717. seek(0, SEEK_SET);
  718. }
  719. size_t tell() const {
  720. #ifdef _WIN32
  721. __int64 ret = _ftelli64(fp);
  722. #else
  723. long ret = std::ftell(fp);
  724. #endif
  725. GGML_ASSERT(ret != -1); // this really shouldn't fail
  726. return (size_t) ret;
  727. }
  728. void seek(size_t offset, int whence) const {
  729. #ifdef _WIN32
  730. int ret = _fseeki64(fp, (__int64) offset, whence);
  731. #else
  732. int ret = std::fseek(fp, (long) offset, whence);
  733. #endif
734. GGML_ASSERT(ret == 0); // same as above: this really shouldn't fail
  735. }
  736. void read_raw(void * ptr, size_t len) const {
  737. if (len == 0) {
  738. return;
  739. }
  740. errno = 0;
  741. std::size_t ret = std::fread(ptr, len, 1, fp);
  742. if (ferror(fp)) {
  743. throw std::runtime_error(format("read error: %s", strerror(errno)));
  744. }
  745. if (ret != 1) {
  746. throw std::runtime_error("unexpectedly reached end of file");
  747. }
  748. }
  749. uint32_t read_u32() const {
  750. uint32_t ret;
  751. read_raw(&ret, sizeof(ret));
  752. return ret;
  753. }
  754. void write_raw(const void * ptr, size_t len) const {
  755. if (len == 0) {
  756. return;
  757. }
  758. errno = 0;
  759. size_t ret = std::fwrite(ptr, len, 1, fp);
  760. if (ret != 1) {
  761. throw std::runtime_error(format("write error: %s", strerror(errno)));
  762. }
  763. }
  764. void write_u32(std::uint32_t val) const {
  765. write_raw(&val, sizeof(val));
  766. }
  767. ~llama_file() {
  768. if (fp) {
  769. std::fclose(fp);
  770. }
  771. }
  772. };
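// usage sketch (illustrative; "model.gguf" is a placeholder path):
//
//   llama_file file("model.gguf", "rb");      // throws std::runtime_error if the open fails
//   const uint32_t magic = file.read_u32();   // short reads also throw
//   file.seek(0, SEEK_SET);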
  773. struct llama_mmap {
  774. void * addr;
  775. size_t size;
  776. llama_mmap(const llama_mmap &) = delete;
  777. #ifdef _POSIX_MAPPED_FILES
  778. static constexpr bool SUPPORTED = true;
  779. // list of mapped fragments (first_offset, last_offset)
  780. std::vector<std::pair<size_t, size_t>> mapped_fragments;
  781. llama_mmap(struct llama_file * file, size_t prefetch = (size_t) -1 /* -1 = max value */, bool numa = false) {
  782. size = file->size;
  783. int fd = fileno(file->fp);
  784. int flags = MAP_SHARED;
  785. // prefetch/readahead impairs performance on NUMA systems
  786. if (numa) { prefetch = 0; }
  787. #ifdef __linux__
  788. // advise the kernel to read the file sequentially (increases readahead)
  789. if (posix_fadvise(fd, 0, 0, POSIX_FADV_SEQUENTIAL)) {
  790. LLAMA_LOG_WARN("warning: posix_fadvise(.., POSIX_FADV_SEQUENTIAL) failed: %s\n",
  791. strerror(errno));
  792. }
  793. if (prefetch) { flags |= MAP_POPULATE; }
  794. #endif
  795. addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0);
  796. if (addr == MAP_FAILED) { // NOLINT
  797. throw std::runtime_error(format("mmap failed: %s", strerror(errno)));
  798. }
  799. if (prefetch > 0) {
  800. // advise the kernel to preload the mapped memory
  801. if (posix_madvise(addr, std::min(file->size, prefetch), POSIX_MADV_WILLNEED)) {
  802. LLAMA_LOG_WARN("warning: posix_madvise(.., POSIX_MADV_WILLNEED) failed: %s\n",
  803. strerror(errno));
  804. }
  805. }
  806. if (numa) {
  807. // advise the kernel not to use readahead
  808. // (because the next page might not belong on the same node)
  809. if (posix_madvise(addr, file->size, POSIX_MADV_RANDOM)) {
  810. LLAMA_LOG_WARN("warning: posix_madvise(.., POSIX_MADV_RANDOM) failed: %s\n",
  811. strerror(errno));
  812. }
  813. }
  814. // initialize list of mapped_fragments
  815. mapped_fragments.emplace_back(0, file->size);
  816. }
  817. static void align_range(size_t * first, size_t * last, size_t page_size) {
  818. // align first to the next page
  819. size_t offset_in_page = *first & (page_size - 1);
  820. size_t offset_to_page = offset_in_page == 0 ? 0 : page_size - offset_in_page;
  821. *first += offset_to_page;
  822. // align last to the previous page
  823. *last = *last & ~(page_size - 1);
  824. if (*last <= *first) {
  825. *last = *first;
  826. }
  827. }
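// worked example (illustrative): with page_size = 4096, first = 100 and last = 9000 become
// first = 4096 and last = 8192, i.e. only whole pages strictly inside the range are released.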
  828. // partially unmap the file in the range [first, last)
  829. void unmap_fragment(size_t first, size_t last) {
  830. // note: this function must not be called multiple times with overlapping ranges
  831. // otherwise, there is a risk of invalidating addresses that have been repurposed for other mappings
  832. int page_size = sysconf(_SC_PAGESIZE);
  833. align_range(&first, &last, page_size);
  834. size_t len = last - first;
  835. if (len == 0) {
  836. return;
  837. }
  838. GGML_ASSERT(first % page_size == 0);
  839. GGML_ASSERT(last % page_size == 0);
  840. GGML_ASSERT(last > first);
  841. void * next_page_start = (uint8_t *) addr + first;
  842. // unmap the range
  843. if (munmap(next_page_start, len)) {
  844. LLAMA_LOG_WARN("warning: munmap failed: %s\n", strerror(errno));
  845. }
  846. // update the list of mapped fragments to avoid unmapping the same range again in the destructor
  847. std::vector<std::pair<size_t, size_t>> new_mapped_fragments;
  848. for (const auto & frag : mapped_fragments) {
  849. if (frag.first < first && frag.second > last) {
  850. // the range is in the middle of the fragment, split it
  851. new_mapped_fragments.emplace_back(frag.first, first);
  852. new_mapped_fragments.emplace_back(last, frag.second);
  853. } else if (frag.first < first && frag.second > first) {
  854. // the range starts in the middle of the fragment
  855. new_mapped_fragments.emplace_back(frag.first, first);
  856. } else if (frag.first < last && frag.second > last) {
  857. // the range ends in the middle of the fragment
  858. new_mapped_fragments.emplace_back(last, frag.second);
  859. } else if (frag.first >= first && frag.second <= last) {
  860. // the range covers the entire fragment
  861. } else {
  862. // the range is outside the fragment
  863. new_mapped_fragments.push_back(frag);
  864. }
  865. }
  866. mapped_fragments = std::move(new_mapped_fragments);
  867. }
  868. ~llama_mmap() {
  869. for (const auto & frag : mapped_fragments) {
  870. if (munmap((char *) addr + frag.first, frag.second - frag.first)) {
  871. LLAMA_LOG_WARN("warning: munmap failed: %s\n", strerror(errno));
  872. }
  873. }
  874. }
  875. #elif defined(_WIN32)
  876. static constexpr bool SUPPORTED = true;
  877. llama_mmap(struct llama_file * file, size_t prefetch = (size_t) -1, bool numa = false) {
  878. GGML_UNUSED(numa);
  879. size = file->size;
  880. HANDLE hFile = (HANDLE) _get_osfhandle(_fileno(file->fp));
  881. HANDLE hMapping = CreateFileMappingA(hFile, NULL, PAGE_READONLY, 0, 0, NULL);
  882. if (hMapping == NULL) {
  883. DWORD error = GetLastError();
  884. throw std::runtime_error(format("CreateFileMappingA failed: %s", llama_format_win_err(error).c_str()));
  885. }
  886. addr = MapViewOfFile(hMapping, FILE_MAP_READ, 0, 0, 0);
  887. DWORD error = GetLastError();
  888. CloseHandle(hMapping);
  889. if (addr == NULL) {
  890. throw std::runtime_error(format("MapViewOfFile failed: %s", llama_format_win_err(error).c_str()));
  891. }
  892. if (prefetch > 0) {
  893. // PrefetchVirtualMemory is only present on Windows 8 and above, so we dynamically load it
  894. BOOL (WINAPI *pPrefetchVirtualMemory) (HANDLE, ULONG_PTR, PWIN32_MEMORY_RANGE_ENTRY, ULONG);
  895. HMODULE hKernel32 = GetModuleHandleW(L"kernel32.dll");
  896. // may fail on pre-Windows 8 systems
  897. pPrefetchVirtualMemory = reinterpret_cast<decltype(pPrefetchVirtualMemory)> (GetProcAddress(hKernel32, "PrefetchVirtualMemory"));
  898. if (pPrefetchVirtualMemory) {
  899. // advise the kernel to preload the mapped memory
  900. WIN32_MEMORY_RANGE_ENTRY range;
  901. range.VirtualAddress = addr;
  902. range.NumberOfBytes = (SIZE_T) std::min(size, prefetch);
  903. if (!pPrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0)) {
  904. LLAMA_LOG_WARN("warning: PrefetchVirtualMemory failed: %s\n",
  905. llama_format_win_err(GetLastError()).c_str());
  906. }
  907. }
  908. }
  909. }
  910. void unmap_fragment(size_t first, size_t last) {
  911. // not supported
  912. GGML_UNUSED(first);
  913. GGML_UNUSED(last);
  914. }
  915. ~llama_mmap() {
  916. if (!UnmapViewOfFile(addr)) {
  917. LLAMA_LOG_WARN("warning: UnmapViewOfFile failed: %s\n",
  918. llama_format_win_err(GetLastError()).c_str());
  919. }
  920. }
  921. #else
  922. static constexpr bool SUPPORTED = false;
  923. llama_mmap(struct llama_file * file, size_t prefetch = -1, bool numa = false) {
  924. GGML_UNUSED(file);
  925. GGML_UNUSED(prefetch);
  926. GGML_UNUSED(numa);
  927. throw std::runtime_error("mmap not supported");
  928. }
  929. void unmap_fragment(size_t first, size_t last) {
  930. GGML_UNUSED(first);
  931. GGML_UNUSED(last);
  932. throw std::runtime_error("mmap not supported");
  933. }
  934. #endif
  935. };
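// usage sketch (illustrative; the path is a placeholder):
//
//   llama_file file("model.gguf", "rb");
//   llama_mmap mapping(&file, /* prefetch = */ 0);
//   // ... read tensor data through mapping.addr ...
//   mapping.unmap_fragment(0, 4096); // page-aligned release; a no-op on the Windows build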
  936. // Represents some region of memory being locked using mlock or VirtualLock;
  937. // will automatically unlock on destruction.
  938. struct llama_mlock {
  939. void * addr = NULL;
  940. size_t size = 0;
  941. bool failed_already = false;
  942. llama_mlock() {}
  943. llama_mlock(const llama_mlock &) = delete;
  944. ~llama_mlock() {
  945. if (size) {
  946. raw_unlock(addr, size);
  947. }
  948. }
  949. void init(void * ptr) {
  950. GGML_ASSERT(addr == NULL && size == 0); // NOLINT
  951. addr = ptr;
  952. }
  953. void grow_to(size_t target_size) {
  954. GGML_ASSERT(addr);
  955. if (failed_already) {
  956. return;
  957. }
  958. size_t granularity = lock_granularity();
  959. target_size = (target_size + granularity - 1) & ~(granularity - 1);
  960. if (target_size > size) {
  961. if (raw_lock((uint8_t *) addr + size, target_size - size)) {
  962. size = target_size;
  963. } else {
  964. failed_already = true;
  965. }
  966. }
  967. }
  968. #ifdef _POSIX_MEMLOCK_RANGE
  969. static constexpr bool SUPPORTED = true;
  970. static size_t lock_granularity() {
  971. return (size_t) sysconf(_SC_PAGESIZE);
  972. }
  973. #ifdef __APPLE__
  974. #define MLOCK_SUGGESTION \
  975. "Try increasing the sysctl values 'vm.user_wire_limit' and 'vm.global_user_wire_limit' and/or " \
  976. "decreasing 'vm.global_no_user_wire_amount'. Also try increasing RLIMIT_MLOCK (ulimit -l).\n"
  977. #else
  978. #define MLOCK_SUGGESTION \
  979. "Try increasing RLIMIT_MLOCK ('ulimit -l' as root).\n"
  980. #endif
  981. bool raw_lock(const void * addr, size_t size) const {
  982. if (!mlock(addr, size)) {
  983. return true;
  984. }
985. char * errmsg = std::strerror(errno);
  986. bool suggest = (errno == ENOMEM);
  987. // Check if the resource limit is fine after all
  988. struct rlimit lock_limit;
  989. if (suggest && getrlimit(RLIMIT_MEMLOCK, &lock_limit)) {
  990. suggest = false;
  991. }
  992. if (suggest && (lock_limit.rlim_max > lock_limit.rlim_cur + size)) {
  993. suggest = false;
  994. }
  995. fprintf(stderr, "warning: failed to mlock %zu-byte buffer (after previously locking %zu bytes): %s\n%s",
  996. size, this->size, errmsg, suggest ? MLOCK_SUGGESTION : "");
  997. return false;
  998. }
  999. #undef MLOCK_SUGGESTION
  1000. static void raw_unlock(void * addr, size_t size) {
  1001. if (munlock(addr, size)) {
  1002. fprintf(stderr, "warning: failed to munlock buffer: %s\n", std::strerror(errno));
  1003. }
  1004. }
  1005. #elif defined(_WIN32)
  1006. static constexpr bool SUPPORTED = true;
  1007. static size_t lock_granularity() {
  1008. SYSTEM_INFO si;
  1009. GetSystemInfo(&si);
  1010. return (size_t) si.dwPageSize;
  1011. }
  1012. bool raw_lock(void * ptr, size_t len) const {
  1013. for (int tries = 1; ; tries++) {
  1014. if (VirtualLock(ptr, len)) {
  1015. return true;
  1016. }
  1017. if (tries == 2) {
  1018. fprintf(stderr, "warning: failed to VirtualLock %zu-byte buffer (after previously locking %zu bytes): %s\n",
  1019. len, size, llama_format_win_err(GetLastError()).c_str());
  1020. return false;
  1021. }
  1022. // It failed but this was only the first try; increase the working
  1023. // set size and try again.
  1024. SIZE_T min_ws_size, max_ws_size;
  1025. if (!GetProcessWorkingSetSize(GetCurrentProcess(), &min_ws_size, &max_ws_size)) {
  1026. fprintf(stderr, "warning: GetProcessWorkingSetSize failed: %s\n",
  1027. llama_format_win_err(GetLastError()).c_str());
  1028. return false;
  1029. }
  1030. // Per MSDN: "The maximum number of pages that a process can lock
  1031. // is equal to the number of pages in its minimum working set minus
  1032. // a small overhead."
  1033. // Hopefully a megabyte is enough overhead:
  1034. size_t increment = len + 1048576;
  1035. // The minimum must be <= the maximum, so we need to increase both:
  1036. min_ws_size += increment;
  1037. max_ws_size += increment;
  1038. if (!SetProcessWorkingSetSize(GetCurrentProcess(), min_ws_size, max_ws_size)) {
  1039. fprintf(stderr, "warning: SetProcessWorkingSetSize failed: %s\n",
  1040. llama_format_win_err(GetLastError()).c_str());
  1041. return false;
  1042. }
  1043. }
  1044. }
  1045. static void raw_unlock(void * ptr, size_t len) {
  1046. if (!VirtualUnlock(ptr, len)) {
  1047. fprintf(stderr, "warning: failed to VirtualUnlock buffer: %s\n",
  1048. llama_format_win_err(GetLastError()).c_str());
  1049. }
  1050. }
  1051. #else
  1052. static constexpr bool SUPPORTED = false;
  1053. static size_t lock_granularity() {
  1054. return (size_t) 65536;
  1055. }
  1056. bool raw_lock(const void * addr, size_t len) const {
  1057. fprintf(stderr, "warning: mlock not supported on this system\n");
  1058. return false;
  1059. }
  1060. static void raw_unlock(const void * addr, size_t len) {}
  1061. #endif
  1062. };
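// usage sketch (illustrative; ptr and n_bytes stand for an existing allocation):
//
//   llama_mlock lock;
//   lock.init(ptr);        // remember the base address
//   lock.grow_to(n_bytes); // locks [ptr, ptr + n_bytes), rounded up to lock_granularity()
//
// once raw_lock fails, failed_already suppresses further attempts (and warnings).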
  1063. typedef void (*offload_func_t)(struct ggml_tensor * tensor);
  1064. static void ggml_offload_nop(struct ggml_tensor * tensor) {
  1065. (void) tensor;
  1066. }
  1067. static std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token) {
  1068. std::vector<char> result(8, 0);
  1069. const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size());
  1070. if (n_tokens < 0) {
  1071. result.resize(-n_tokens);
  1072. int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size());
  1073. GGML_ASSERT(check == -n_tokens);
  1074. }
  1075. else {
  1076. result.resize(n_tokens);
  1077. }
  1078. return std::string(result.data(), result.size());
  1079. }
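// note: the underlying C API returns the negated required length when the initial 8-byte buffer
// is too small, so the wrapper resizes once and retries with the exact size.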
  1080. static ggml_backend_buffer_type_t llama_default_buffer_type(int n_gpu_layers) {
  1081. ggml_backend_buffer_type_t buft = nullptr;
  1082. #ifdef GGML_USE_METAL
  1083. if (n_gpu_layers > 0) {
  1084. buft = ggml_backend_metal_buffer_type();
  1085. }
  1086. #elif defined(GGML_USE_CUBLAS) && defined(LLAMA_GGML_BACKEND_CUDA_TEST)
  1087. if (n_gpu_layers > 0) {
  1088. buft = ggml_backend_cuda_buffer_type(0);
  1089. }
  1090. #elif defined(GGML_USE_CUBLAS)
  1091. buft = ggml_backend_cuda_host_buffer_type();
  1092. #elif defined(GGML_USE_CPU_HBM)
  1093. buft = ggml_backend_cpu_hbm_buffer_type();
  1094. #endif
  1095. if (buft == nullptr) {
  1096. buft = ggml_backend_cpu_buffer_type();
  1097. }
  1098. return buft;
  1099. GGML_UNUSED(n_gpu_layers);
  1100. }
  1101. //
  1102. // globals
  1103. //
  1104. struct llama_state {
  1105. llama_state() {
  1106. #ifdef GGML_USE_METAL
  1107. ggml_metal_log_set_callback(log_callback, log_callback_user_data);
  1108. #endif
  1109. }
  1110. // We save the log callback globally
  1111. ggml_log_callback log_callback = llama_log_callback_default;
  1112. void * log_callback_user_data = nullptr;
  1113. };
  1114. static llama_state g_state;
  1115. // available llama models
  1116. enum e_model {
  1117. MODEL_UNKNOWN,
  1118. MODEL_1B,
  1119. MODEL_3B,
  1120. MODEL_7B,
  1121. MODEL_8B,
  1122. MODEL_13B,
  1123. MODEL_15B,
  1124. MODEL_30B,
  1125. MODEL_34B,
  1126. MODEL_40B,
  1127. MODEL_65B,
  1128. MODEL_70B,
  1129. MODEL_SMALL,
  1130. MODEL_MEDIUM,
  1131. MODEL_LARGE,
  1132. MODEL_XL,
  1133. };
  1134. static const size_t kiB = 1024;
  1135. static const size_t MiB = 1024*kiB;
  1136. static const size_t GiB = 1024*MiB;
  1137. struct llama_hparams {
  1138. bool vocab_only;
  1139. uint32_t n_vocab;
  1140. uint32_t n_ctx_train; // context size the model was trained on
  1141. uint32_t n_embd;
  1142. uint32_t n_head;
  1143. uint32_t n_head_kv;
  1144. uint32_t n_layer;
  1145. uint32_t n_rot;
  1146. uint32_t n_embd_head_k; // dimension of keys (d_k). d_q is assumed to be the same, but there are n_head q heads, and only n_head_kv k-v heads
  1147. uint32_t n_embd_head_v; // dimension of values (d_v) aka n_embd_head
  1148. uint32_t n_ff;
  1149. uint32_t n_expert = 0;
  1150. uint32_t n_expert_used = 0;
  1151. float f_norm_eps;
  1152. float f_norm_rms_eps;
  1153. float rope_freq_base_train;
  1154. float rope_freq_scale_train;
  1155. uint32_t n_yarn_orig_ctx;
  1156. int8_t rope_scaling_type_train : 3;
  1157. bool rope_finetuned : 1;
  1158. float f_clamp_kqv;
  1159. float f_max_alibi_bias;
  1160. bool operator!=(const llama_hparams & other) const {
  1161. if (this->vocab_only != other.vocab_only) return true;
  1162. if (this->n_vocab != other.n_vocab) return true;
  1163. if (this->n_ctx_train != other.n_ctx_train) return true;
  1164. if (this->n_embd != other.n_embd) return true;
  1165. if (this->n_head != other.n_head) return true;
  1166. if (this->n_head_kv != other.n_head_kv) return true;
  1167. if (this->n_layer != other.n_layer) return true;
  1168. if (this->n_rot != other.n_rot) return true;
  1169. if (this->n_embd_head_k != other.n_embd_head_k) return true;
  1170. if (this->n_embd_head_v != other.n_embd_head_v) return true;
  1171. if (this->n_ff != other.n_ff) return true;
  1172. if (this->n_expert != other.n_expert) return true;
  1173. if (this->n_expert_used != other.n_expert_used) return true;
  1174. if (this->rope_finetuned != other.rope_finetuned) return true;
  1175. if (this->n_yarn_orig_ctx != other.n_yarn_orig_ctx) return true;
  1176. const float EPSILON = 1e-9f;
  1177. if (!is_float_close(this->f_norm_eps, other.f_norm_eps, EPSILON)) return true;
  1178. if (!is_float_close(this->f_norm_rms_eps, other.f_norm_rms_eps, EPSILON)) return true;
  1179. if (!is_float_close(this->rope_freq_base_train, other.rope_freq_base_train, EPSILON)) return true;
  1180. if (!is_float_close(this->rope_freq_scale_train, other.rope_freq_scale_train, EPSILON)) return true;
  1181. return false;
  1182. }
  1183. uint32_t n_gqa() const {
  1184. return n_head/n_head_kv;
  1185. }
  1186. uint32_t n_embd_k_gqa() const { // dimension of key embeddings across all k-v heads
  1187. return n_embd_head_k * n_head_kv;
  1188. }
  1189. uint32_t n_embd_v_gqa() const { // dimension of value embeddings across all k-v heads
  1190. return n_embd_head_v * n_head_kv;
  1191. }
  1192. };
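// worked example (illustrative numbers): with n_head = 32, n_head_kv = 8 and
// n_embd_head_k = n_embd_head_v = 128:
//   n_gqa()        = 32 / 8  = 4 query heads share each k-v head
//   n_embd_k_gqa() = 128 * 8 = 1024 key elements per token per layer (likewise for values)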
  1193. struct llama_cparams {
  1194. uint32_t n_ctx; // context size used during inference
  1195. uint32_t n_batch;
  1196. uint32_t n_threads; // number of threads to use for generation
  1197. uint32_t n_threads_batch; // number of threads to use for batch processing
  1198. float rope_freq_base;
  1199. float rope_freq_scale;
  1200. uint32_t n_yarn_orig_ctx;
  1201. // These hyperparameters are not exposed in GGUF, because all
  1202. // existing YaRN models use the same values for them.
  1203. float yarn_ext_factor;
  1204. float yarn_attn_factor;
  1205. float yarn_beta_fast;
  1206. float yarn_beta_slow;
  1207. bool mul_mat_q;
  1208. bool offload_kqv;
  1209. };
  1210. struct llama_layer {
  1211. // normalization
  1212. struct ggml_tensor * attn_norm;
  1213. struct ggml_tensor * attn_norm_b;
  1214. struct ggml_tensor * attn_norm_2;
  1215. struct ggml_tensor * attn_norm_2_b;
  1216. struct ggml_tensor * attn_q_norm;
  1217. struct ggml_tensor * attn_q_norm_b;
  1218. struct ggml_tensor * attn_k_norm;
  1219. struct ggml_tensor * attn_k_norm_b;
  1220. // attention
  1221. struct ggml_tensor * wq;
  1222. struct ggml_tensor * wk;
  1223. struct ggml_tensor * wv;
  1224. struct ggml_tensor * wo;
  1225. struct ggml_tensor * wqkv;
  1226. // attention bias
  1227. struct ggml_tensor * bq;
  1228. struct ggml_tensor * bk;
  1229. struct ggml_tensor * bv;
  1230. struct ggml_tensor * bo;
  1231. struct ggml_tensor * bqkv;
  1232. // normalization
  1233. struct ggml_tensor * ffn_norm;
  1234. struct ggml_tensor * ffn_norm_b;
  1235. // ff
  1236. struct ggml_tensor * ffn_gate; // w1
  1237. struct ggml_tensor * ffn_down; // w2
  1238. struct ggml_tensor * ffn_up; // w3
  1239. // ff MoE
  1240. struct ggml_tensor * ffn_gate_inp;
  1241. struct ggml_tensor * ffn_gate_exp[LLAMA_MAX_EXPERTS];
  1242. struct ggml_tensor * ffn_down_exp[LLAMA_MAX_EXPERTS];
  1243. struct ggml_tensor * ffn_up_exp [LLAMA_MAX_EXPERTS];
  1244. // ff bias
  1245. struct ggml_tensor * ffn_down_b; // b2
  1246. struct ggml_tensor * ffn_up_b; // b3
  1247. struct ggml_tensor * ffn_act;
  1248. };
  1249. struct llama_kv_cell {
  1250. llama_pos pos = -1;
  1251. llama_pos delta = 0;
  1252. std::set<llama_seq_id> seq_id;
  1253. bool has_seq_id(const llama_seq_id & id) const {
  1254. return seq_id.find(id) != seq_id.end();
  1255. }
  1256. };
  1257. // ring-buffer of cached KV data
  1258. struct llama_kv_cache {
  1259. bool has_shift = false;
  1260. // Note: The value of head isn't only used to optimize searching
  1261. // for a free KV slot. llama_decode_internal also uses it, so it
  1262. // cannot be freely changed after a slot has been allocated.
  1263. uint32_t head = 0;
  1264. uint32_t size = 0;
  1265. uint32_t used = 0; // used cells (i.e. at least one seq_id)
  1266. // computed before each graph build
  1267. uint32_t n = 0;
  1268. std::vector<llama_kv_cell> cells;
  1269. std::vector<struct ggml_tensor *> k_l; // per layer
  1270. std::vector<struct ggml_tensor *> v_l;
  1271. struct ggml_context * ctx = NULL;
  1272. ggml_backend_buffer_t buf = NULL;
  1273. ~llama_kv_cache() {
  1274. #if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)
  1275. if (ggml_cublas_loaded()) {
  1276. for (size_t i = 0; i < k_l.size(); ++i) {
  1277. ggml_cuda_free_data(k_l[i]);
  1278. ggml_cuda_free_data(v_l[i]);
  1279. }
  1280. }
  1281. #endif
  1282. if (ctx) {
  1283. ggml_free(ctx);
  1284. }
  1285. ggml_backend_buffer_free(buf);
  1286. }
  1287. };
  1288. struct llama_vocab {
  1289. using id = int32_t;
  1290. using token = std::string;
  1291. using ttype = llama_token_type;
  1292. struct token_data {
  1293. token text;
  1294. float score;
  1295. ttype type;
  1296. };
  1297. enum llama_vocab_type type = LLAMA_VOCAB_TYPE_SPM;
  1298. std::unordered_map<token, id> token_to_id;
  1299. std::vector<token_data> id_to_token;
  1300. std::unordered_map<token, id> special_tokens_cache;
  1301. std::map<std::pair<std::string, std::string>, int> bpe_ranks;
  1302. // default LLaMA special tokens
  1303. id special_bos_id = 1;
  1304. id special_eos_id = 2;
  1305. id special_unk_id = 0;
  1306. id special_sep_id = -1;
  1307. id special_pad_id = -1;
  1308. int special_add_bos = -1; // -1 unknown, 1 add, 0 don't add.
  1309. int special_add_eos = -1; // -1 unknown, 1 add, 0 don't add.
  1310. id linefeed_id = 13;
  1311. id special_prefix_id = 32007;
  1312. id special_middle_id = 32009;
  1313. id special_suffix_id = 32008;
  1314. id special_eot_id = 32010;
  1315. int find_bpe_rank(const std::string & token_left, const std::string & token_right) const {
  1316. GGML_ASSERT(token_left.find(' ') == std::string::npos);
  1317. GGML_ASSERT(token_left.find('\n') == std::string::npos);
  1318. GGML_ASSERT(token_right.find(' ') == std::string::npos);
  1319. GGML_ASSERT(token_right.find('\n') == std::string::npos);
  1320. auto it = bpe_ranks.find(std::make_pair(token_left, token_right));
  1321. if (it == bpe_ranks.end()) {
  1322. return -1;
  1323. }
  1324. return it->second;
  1325. }
  1326. };
  1327. struct llama_model {
  1328. e_model type = MODEL_UNKNOWN;
  1329. llm_arch arch = LLM_ARCH_UNKNOWN;
  1330. llama_ftype ftype = LLAMA_FTYPE_ALL_F32;
  1331. std::string name = "n/a";
  1332. llama_hparams hparams = {};
  1333. llama_vocab vocab;
  1334. struct ggml_tensor * tok_embd;
  1335. struct ggml_tensor * pos_embd;
  1336. struct ggml_tensor * tok_norm;
  1337. struct ggml_tensor * tok_norm_b;
  1338. struct ggml_tensor * output_norm;
  1339. struct ggml_tensor * output_norm_b;
  1340. struct ggml_tensor * output;
  1341. struct ggml_tensor * output_b;
  1342. std::vector<llama_layer> layers;
  1343. int n_gpu_layers;
  1344. // gguf metadata
  1345. std::unordered_map<std::string, std::string> gguf_kv;
  1346. // context
  1347. struct ggml_context * ctx = NULL;
  1348. // the model memory buffer
  1349. ggml_backend_buffer_t buf = NULL;
  1350. // model memory mapped file
  1351. std::unique_ptr<llama_mmap> mapping;
  1352. // objects representing data potentially being locked in memory
  1353. llama_mlock mlock_buf;
  1354. llama_mlock mlock_mmap;
  1355. // for quantize-stats only
  1356. std::vector<std::pair<std::string, struct ggml_tensor *>> tensors_by_name;
  1357. int64_t t_load_us = 0;
  1358. int64_t t_start_us = 0;
  1359. ~llama_model() {
  1360. #if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)
  1361. if (ggml_cublas_loaded()) {
  1362. for (size_t i = 0; i < tensors_by_name.size(); ++i) {
  1363. ggml_cuda_free_data(tensors_by_name[i].second);
  1364. }
  1365. ggml_cuda_free_scratch();
  1366. }
  1367. #endif
  1368. #if defined(GGML_USE_CLBLAST)
  1369. for (size_t i = 0; i < tensors_by_name.size(); ++i) {
  1370. ggml_cl_free_data(tensors_by_name[i].second);
  1371. }
  1372. #endif
  1373. if (ctx) {
  1374. ggml_free(ctx);
  1375. }
  1376. ggml_backend_buffer_free(buf);
  1377. }
  1378. };
  1379. struct llama_context {
  1380. llama_context(const llama_model & model) : model(model), t_start_us(model.t_start_us), t_load_us(model.t_load_us) {}
  1381. ~llama_context() {
  1382. ggml_allocr_free(alloc);
  1383. ggml_backend_buffer_free(buf_alloc);
  1384. ggml_backend_free(backend);
  1385. }
  1386. llama_cparams cparams;
  1387. ggml_backend_t backend = nullptr;
  1388. const llama_model & model;
  1389. // key + value cache for the self attention
  1390. struct llama_kv_cache kv_self;
  1391. std::mt19937 rng;
  1392. bool has_evaluated_once = false;
  1393. int64_t t_start_us;
  1394. int64_t t_load_us;
  1395. int64_t t_sample_us = 0;
  1396. int64_t t_p_eval_us = 0;
  1397. int64_t t_eval_us = 0;
  1398. int32_t n_sample = 0; // number of tokens sampled
  1399. int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1)
  1400. int32_t n_eval = 0; // number of eval calls
  1401. // decode output (2-dimensional array: [n_tokens][n_vocab])
  1402. std::vector<float> logits;
  1403. #ifndef NDEBUG
  1404. // guard against access to unset logits
  1405. std::vector<bool> logits_valid;
  1406. #endif
  1407. bool logits_all = false;
  1408. // input embedding (1-dimensional array: [n_embd])
  1409. std::vector<float> embedding;
  1410. // memory buffers used to evaluate the model
  1411. std::vector<uint8_t> buf_compute_meta;
  1412. ggml_backend_buffer_t buf_alloc = NULL;
  1413. ggml_allocr * alloc = NULL;
  1414. // temporary buffer for copying data to/from the backend
  1415. std::vector<no_init<uint8_t>> buf_copy;
  1416. #ifdef GGML_USE_MPI
  1417. ggml_mpi_context * ctx_mpi = NULL;
  1418. #endif
  1419. };
  1420. //
  1421. // kv cache helpers
  1422. //
  1423. static bool llama_kv_cache_init(
  1424. const struct llama_hparams & hparams,
  1425. struct llama_kv_cache & cache,
  1426. ggml_type ktype,
  1427. ggml_type vtype,
  1428. uint32_t n_ctx,
  1429. int n_gpu_layers,
  1430. bool offload) {
  1431. const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa();
  1432. const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa();
  1433. const uint32_t n_layer = hparams.n_layer;
  1434. cache.has_shift = false;
  1435. cache.head = 0;
  1436. cache.size = n_ctx;
  1437. cache.used = 0;
  1438. cache.cells.clear();
  1439. cache.cells.resize(n_ctx);
  1440. struct ggml_init_params params;
  1441. params.mem_size = 2u*n_layer*ggml_tensor_overhead();
  1442. params.mem_buffer = NULL;
  1443. params.no_alloc = true;
  1444. cache.ctx = ggml_init(params);
  1445. size_t vram_kv_cache = 0;
  1446. if (!cache.ctx) {
  1447. LLAMA_LOG_ERROR("%s: failed to allocate memory for kv cache\n", __func__);
  1448. return false;
  1449. }
  1450. cache.k_l.reserve(n_layer);
  1451. cache.v_l.reserve(n_layer);
  1452. const int i_gpu_start = (int) n_layer - n_gpu_layers;
  1453. for (int i = 0; i < (int) n_layer; i++) {
  1454. ggml_tensor * k = ggml_new_tensor_1d(cache.ctx, ktype, n_embd_k_gqa*n_ctx);
  1455. ggml_tensor * v = ggml_new_tensor_1d(cache.ctx, vtype, n_embd_v_gqa*n_ctx);
  1456. ggml_format_name(k, "cache_k_l%d", i);
  1457. ggml_format_name(v, "cache_v_l%d", i);
  1458. cache.k_l.push_back(k);
  1459. cache.v_l.push_back(v);
  1460. #if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)
  1461. if (i >= i_gpu_start) {
  1462. if (offload) {
  1463. ggml_cuda_assign_buffers_no_scratch(k);
  1464. ggml_cuda_assign_buffers_no_scratch(v);
  1465. vram_kv_cache += ggml_nbytes(k);
  1466. vram_kv_cache += ggml_nbytes(v);
  1467. // HACK: mark tensor as allocated
  1468. k->data = v->data = (void *)(uintptr_t)1;
  1469. }
  1470. }
  1471. #endif // GGML_USE_CUBLAS
  1472. }
  1473. // allocate tensors
  1474. cache.buf = ggml_backend_alloc_ctx_tensors_from_buft(cache.ctx, llama_default_buffer_type(n_gpu_layers));
  1475. // buf may be NULL with full offload
  1476. if (cache.buf) {
  1477. // initialize the buffer to avoid NaNs in the padding
  1478. ggml_backend_buffer_clear(cache.buf, 0);
  1479. }
  1480. if (vram_kv_cache > 0) {
1481. LLAMA_LOG_INFO("%s: VRAM kv self = %.2f MiB\n", __func__, vram_kv_cache / 1024.0 / 1024.0);
  1482. }
  1483. GGML_UNUSED(i_gpu_start);
  1484. GGML_UNUSED(offload);
  1485. return true;
  1486. }
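// rough size sketch (illustrative numbers): with f16 K/V, n_layer = 32, n_ctx = 4096 and
// n_embd_k_gqa = n_embd_v_gqa = 1024, the cache takes 2 * 32 * 4096 * 1024 * 2 bytes = 512 MiB.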
  1487. // find an empty slot of size "n_tokens" in the cache
  1488. // updates the cache head
  1489. // Note: On success, it's important that cache.head points
  1490. // to the first cell of the slot.
  1491. static bool llama_kv_cache_find_slot(
  1492. struct llama_kv_cache & cache,
  1493. const struct llama_batch & batch) {
  1494. const uint32_t n_ctx = cache.size;
  1495. const uint32_t n_tokens = batch.n_tokens;
  1496. if (n_tokens > n_ctx) {
  1497. LLAMA_LOG_ERROR("%s: n_tokens=%d > n_ctx=%d\n", __func__, n_tokens, n_ctx);
  1498. return false;
  1499. }
  1500. uint32_t n_tested = 0;
  1501. while (true) {
  1502. if (cache.head + n_tokens > n_ctx) {
  1503. n_tested += n_ctx - cache.head;
  1504. cache.head = 0;
  1505. continue;
  1506. }
  1507. bool found = true;
  1508. for (uint32_t i = 0; i < n_tokens; i++) {
  1509. if (cache.cells[cache.head + i].pos >= 0) {
  1510. found = false;
  1511. cache.head += i + 1;
  1512. n_tested += i + 1;
  1513. break;
  1514. }
  1515. }
  1516. if (found) {
  1517. break;
  1518. }
  1519. if (n_tested >= n_ctx) {
  1520. //LLAMA_LOG_ERROR("%s: failed to find a slot for %d tokens\n", __func__, n_tokens);
  1521. return false;
  1522. }
  1523. }
  1524. for (uint32_t i = 0; i < n_tokens; i++) {
  1525. cache.cells[cache.head + i].pos = batch.pos[i];
  1526. for (int32_t j = 0; j < batch.n_seq_id[i]; j++) {
  1527. cache.cells[cache.head + i].seq_id.insert(batch.seq_id[i][j]);
  1528. }
  1529. }
  1530. cache.used += n_tokens;
  1531. return true;
  1532. }
  1533. // find how many cells are currently in use
  1534. static int32_t llama_kv_cache_cell_max(const struct llama_kv_cache & cache) {
  1535. for (uint32_t i = cache.size - 1; i > 0; --i) {
  1536. if (cache.cells[i].pos >= 0 && !cache.cells[i].seq_id.empty()) {
  1537. return i + 1;
  1538. }
  1539. }
  1540. return 0;
  1541. }
  1542. static void llama_kv_cache_clear(struct llama_kv_cache & cache) {
  1543. for (int32_t i = 0; i < (int32_t) cache.size; ++i) {
  1544. cache.cells[i].pos = -1;
  1545. cache.cells[i].seq_id.clear();
  1546. }
  1547. cache.head = 0;
  1548. cache.used = 0;
  1549. }
  1550. static void llama_kv_cache_seq_rm(
  1551. struct llama_kv_cache & cache,
  1552. llama_seq_id seq_id,
  1553. llama_pos p0,
  1554. llama_pos p1) {
  1555. uint32_t new_head = cache.size;
  1556. if (p0 < 0) p0 = 0;
  1557. if (p1 < 0) p1 = std::numeric_limits<llama_pos>::max();
  1558. for (uint32_t i = 0; i < cache.size; ++i) {
  1559. if (cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) {
  1560. if (seq_id < 0) {
  1561. cache.cells[i].seq_id.clear();
  1562. } else if (cache.cells[i].has_seq_id(seq_id)) {
  1563. cache.cells[i].seq_id.erase(seq_id);
  1564. } else {
  1565. continue;
  1566. }
  1567. if (cache.cells[i].seq_id.empty()) {
  1568. // keep count of the number of used cells
  1569. if (cache.cells[i].pos >= 0) cache.used--;
  1570. cache.cells[i].pos = -1;
  1571. if (new_head == cache.size) new_head = i;
  1572. }
  1573. }
  1574. }
  1575. // If we freed up a slot, set head to it so searching can start there.
  1576. if (new_head != cache.size && new_head < cache.head) cache.head = new_head;
  1577. }
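// note: a negative seq_id removes the cells in [p0, p1) for every sequence, while negative
// p0/p1 are treated as 0 and "no upper bound" respectively.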
  1578. static void llama_kv_cache_seq_cp(
  1579. struct llama_kv_cache & cache,
  1580. llama_seq_id seq_id_src,
  1581. llama_seq_id seq_id_dst,
  1582. llama_pos p0,
  1583. llama_pos p1) {
  1584. if (p0 < 0) p0 = 0;
  1585. if (p1 < 0) p1 = std::numeric_limits<llama_pos>::max();
  1586. cache.head = 0;
  1587. for (uint32_t i = 0; i < cache.size; ++i) {
  1588. if (cache.cells[i].has_seq_id(seq_id_src) && cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) {
  1589. cache.cells[i].seq_id.insert(seq_id_dst);
  1590. }
  1591. }
  1592. }
  1593. static void llama_kv_cache_seq_keep(struct llama_kv_cache & cache, llama_seq_id seq_id) {
  1594. uint32_t new_head = cache.size;
  1595. for (uint32_t i = 0; i < cache.size; ++i) {
  1596. if (!cache.cells[i].has_seq_id(seq_id)) {
  1597. if (cache.cells[i].pos >= 0) cache.used--;
  1598. cache.cells[i].pos = -1;
  1599. cache.cells[i].seq_id.clear();
  1600. if (new_head == cache.size) new_head = i;
  1601. } else {
  1602. cache.cells[i].seq_id.clear();
  1603. cache.cells[i].seq_id.insert(seq_id);
  1604. }
  1605. }
  1606. // If we freed up a slot, set head to it so searching can start there.
  1607. if (new_head != cache.size && new_head < cache.head) cache.head = new_head;
  1608. }
  1609. static void llama_kv_cache_seq_shift(
  1610. struct llama_kv_cache & cache,
  1611. llama_seq_id seq_id,
  1612. llama_pos p0,
  1613. llama_pos p1,
  1614. llama_pos delta) {
  1615. uint32_t new_head = cache.size;
  1616. if (p0 < 0) p0 = 0;
  1617. if (p1 < 0) p1 = std::numeric_limits<llama_pos>::max();
  1618. for (uint32_t i = 0; i < cache.size; ++i) {
  1619. if (cache.cells[i].has_seq_id(seq_id) && cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) {
  1620. cache.has_shift = true;
  1621. cache.cells[i].pos += delta;
  1622. cache.cells[i].delta += delta;
  1623. if (cache.cells[i].pos < 0) {
  1624. if (!cache.cells[i].seq_id.empty()) cache.used--;
  1625. cache.cells[i].pos = -1;
  1626. cache.cells[i].seq_id.clear();
  1627. if (new_head == cache.size) new_head = i;
  1628. }
  1629. }
  1630. }
  1631. // If we freed up a slot, set head to it so searching can start there.
  1632. // Otherwise we just start the next search from the beginning.
  1633. cache.head = new_head != cache.size ? new_head : 0;
  1634. }
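// note: the shift is recorded lazily via has_shift and the per-cell delta; the decode path is
// expected to apply the accumulated deltas (e.g. by re-rotating the cached K values) before the
// next graph evaluation.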
  1635. static void llama_kv_cache_seq_div(
  1636. struct llama_kv_cache & cache,
  1637. llama_seq_id seq_id,
  1638. llama_pos p0,
  1639. llama_pos p1,
  1640. int d) {
  1641. if (p0 < 0) p0 = 0;
  1642. if (p1 < 0) p1 = std::numeric_limits<llama_pos>::max();
  1643. for (uint32_t i = 0; i < cache.size; ++i) {
  1644. if (cache.cells[i].has_seq_id(seq_id) && cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) {
  1645. cache.has_shift = true;
  1646. {
  1647. llama_pos p_old = cache.cells[i].pos;
  1648. cache.cells[i].pos /= d;
  1649. cache.cells[i].delta += cache.cells[i].pos - p_old;
  1650. }
  1651. }
  1652. }
  1653. }
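// worked example (illustrative): llama_kv_cache_seq_div(cache, 0, 0, 8, 2) maps positions 0..7
// of sequence 0 to 0,0,1,1,2,2,3,3 (integer division) and records the change in each cell's delta.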
  1654. //
  1655. // model loading and saving
  1656. //
  1657. enum llama_fver {
  1658. GGUF_FILE_VERSION_V1 = 1,
  1659. GGUF_FILE_VERSION_V2 = 2,
  1660. GGUF_FILE_VERSION_V3 = 3,
  1661. };
  1662. static const char * llama_file_version_name(llama_fver version) {
  1663. switch (version) {
1664. case GGUF_FILE_VERSION_V1: return "GGUF V1 (support until Nov 2023)";
  1665. case GGUF_FILE_VERSION_V2: return "GGUF V2";
  1666. case GGUF_FILE_VERSION_V3: return "GGUF V3 (latest)";
  1667. }
  1668. return "unknown";
  1669. }
  1670. static std::string llama_format_tensor_shape(const std::vector<int64_t> & ne) {
  1671. char buf[256];
  1672. snprintf(buf, sizeof(buf), "%5" PRId64, ne.at(0));
  1673. for (size_t i = 1; i < ne.size(); i++) {
  1674. snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), ", %5" PRId64, ne.at(i));
  1675. }
  1676. return buf;
  1677. }
  1678. static std::string llama_format_tensor_shape(const struct ggml_tensor * t) {
  1679. char buf[256];
  1680. snprintf(buf, sizeof(buf), "%5" PRId64, t->ne[0]);
  1681. for (int i = 1; i < GGML_MAX_DIMS; i++) {
  1682. snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), ", %5" PRId64, t->ne[i]);
  1683. }
  1684. return buf;
  1685. }
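// e.g. (assuming GGML_MAX_DIMS == 4) a 4096 x 32000 matrix formats as " 4096, 32000,     1,     1",
// each dimension right-aligned to a width of 5.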
  1686. namespace GGUFMeta {
  1687. template <typename T, gguf_type gt_, T (*gfun)(const gguf_context *, const int)>
  1688. struct GKV_Base_Type {
  1689. static constexpr gguf_type gt = gt_;
  1690. static T getter(const gguf_context * ctx, const int kid) {
  1691. return gfun(ctx, kid);
  1692. }
  1693. };
  1694. template<typename T> struct GKV_Base;
  1695. template<> struct GKV_Base<bool >: GKV_Base_Type<bool, GGUF_TYPE_BOOL, gguf_get_val_bool> {};
  1696. template<> struct GKV_Base<uint8_t >: GKV_Base_Type<uint8_t, GGUF_TYPE_UINT8, gguf_get_val_u8 > {};
  1697. template<> struct GKV_Base<uint16_t >: GKV_Base_Type<uint16_t, GGUF_TYPE_UINT16, gguf_get_val_u16 > {};
  1698. template<> struct GKV_Base<uint32_t >: GKV_Base_Type<uint32_t, GGUF_TYPE_UINT32, gguf_get_val_u32 > {};
  1699. template<> struct GKV_Base<uint64_t >: GKV_Base_Type<uint64_t, GGUF_TYPE_UINT64, gguf_get_val_u64 > {};
  1700. template<> struct GKV_Base<int8_t >: GKV_Base_Type<int8_t, GGUF_TYPE_INT8, gguf_get_val_i8 > {};
  1701. template<> struct GKV_Base<int16_t >: GKV_Base_Type<int16_t, GGUF_TYPE_INT16, gguf_get_val_i16 > {};
  1702. template<> struct GKV_Base<int32_t >: GKV_Base_Type<int32_t, GGUF_TYPE_INT32, gguf_get_val_i32 > {};
  1703. template<> struct GKV_Base<int64_t >: GKV_Base_Type<int64_t, GGUF_TYPE_INT64, gguf_get_val_i64 > {};
  1704. template<> struct GKV_Base<float >: GKV_Base_Type<float, GGUF_TYPE_FLOAT32, gguf_get_val_f32 > {};
  1705. template<> struct GKV_Base<double >: GKV_Base_Type<double, GGUF_TYPE_FLOAT64, gguf_get_val_f64 > {};
  1706. template<> struct GKV_Base<const char *>: GKV_Base_Type<const char *, GGUF_TYPE_STRING, gguf_get_val_str > {};
  1707. template<> struct GKV_Base<std::string> {
  1708. static constexpr gguf_type gt = GGUF_TYPE_STRING;
  1709. static std::string getter(const gguf_context * ctx, const int kid) {
  1710. return gguf_get_val_str(ctx, kid);
  1711. }
  1712. };
  1713. struct ArrayInfo{
  1714. const gguf_type gt;
  1715. const size_t length;
  1716. const void * data;
  1717. };
  1718. template<> struct GKV_Base<ArrayInfo> {
  1719. public:
  1720. static constexpr gguf_type gt = GGUF_TYPE_ARRAY;
  1721. static ArrayInfo getter(const gguf_context *ctx, const int k) {
  1722. return ArrayInfo {
  1723. gguf_get_arr_type(ctx, k),
  1724. size_t(gguf_get_arr_n(ctx, k)),
  1725. gguf_get_arr_data(ctx, k),
  1726. };
  1727. }
  1728. };
  1729. template<typename T>
  1730. class GKV: public GKV_Base<T> {
  1731. GKV() = delete;
  1732. public:
  1733. static T get_kv(const gguf_context * ctx, const int k) {
  1734. const enum gguf_type kt = gguf_get_kv_type(ctx, k);
  1735. if (kt != GKV::gt) {
  1736. throw std::runtime_error(format("key %s has wrong type %s but expected type %s",
  1737. gguf_get_key(ctx, k), gguf_type_name(kt), gguf_type_name(GKV::gt)));
  1738. }
  1739. return GKV::getter(ctx, k);
  1740. }
  1741. static const char * override_type_to_str(const llama_model_kv_override_type ty) {
  1742. switch (ty) {
  1743. case LLAMA_KV_OVERRIDE_BOOL: return "bool";
  1744. case LLAMA_KV_OVERRIDE_INT: return "int";
  1745. case LLAMA_KV_OVERRIDE_FLOAT: return "float";
  1746. }
  1747. return "unknown";
  1748. }
  1749. static bool validate_override(const llama_model_kv_override_type expected_type, const struct llama_model_kv_override *override) {
  1750. if (!override) { return false; }
  1751. if (override->tag == expected_type) {
  1752. LLAMA_LOG_INFO("%s: Using metadata override (%5s) '%s' = ",
  1753. __func__, override_type_to_str(override->tag), override->key);
  1754. switch (override->tag) {
  1755. case LLAMA_KV_OVERRIDE_BOOL: {
  1756. printf("%s\n", override->bool_value ? "true" : "false");
  1757. } break;
  1758. case LLAMA_KV_OVERRIDE_INT: {
  1759. printf("%" PRId64 "\n", override->int_value);
  1760. } break;
  1761. case LLAMA_KV_OVERRIDE_FLOAT: {
  1762. printf("%.6f\n", override->float_value);
  1763. } break;
  1764. default:
  1765. // Shouldn't be possible to end up here, but just in case...
  1766. throw std::runtime_error(
  1767. format("Unsupported attempt to override %s type for metadata key %s\n",
  1768. override_type_to_str(override->tag), override->key));
  1769. }
  1770. return true;
  1771. }
  1772. LLAMA_LOG_WARN("%s: Warning: Bad metadata override type for key '%s', expected %s but got %s\n",
  1773. __func__, override->key, override_type_to_str(expected_type), override_type_to_str(override->tag));
  1774. return false;
  1775. }
  1776. template<typename OT>
  1777. static typename std::enable_if<std::is_same<OT, bool>::value, bool>::type
  1778. try_override(OT & target, const struct llama_model_kv_override *override) {
  1779. if (validate_override(LLAMA_KV_OVERRIDE_BOOL, override)) {
  1780. target = override->bool_value;
  1781. return true;
  1782. }
  1783. return false;
  1784. }
  1785. template<typename OT>
  1786. static typename std::enable_if<!std::is_same<OT, bool>::value && std::is_integral<OT>::value, bool>::type
  1787. try_override(OT & target, const struct llama_model_kv_override *override) {
  1788. if (validate_override(LLAMA_KV_OVERRIDE_INT, override)) {
  1789. target = override->int_value;
  1790. return true;
  1791. }
  1792. return false;
  1793. }
  1794. template<typename OT>
  1795. static typename std::enable_if<std::is_floating_point<OT>::value, bool>::type
1796. try_override(OT & target, const struct llama_model_kv_override *override) {
  1797. if (validate_override(LLAMA_KV_OVERRIDE_FLOAT, override)) {
  1798. target = override->float_value;
  1799. return true;
  1800. }
  1801. return false;
  1802. }
  1803. template<typename OT>
  1804. static typename std::enable_if<std::is_same<OT, std::string>::value, bool>::type
1805. try_override(OT & target, const struct llama_model_kv_override *override) {
  1806. (void)target;
  1807. (void)override;
  1808. if (!override) { return false; }
  1809. // Currently, we should never end up here so it would be a bug if we do.
  1810. throw std::runtime_error(format("Unsupported attempt to override string type for metadata key %s\n",
  1811. override ? override->key : "NULL"));
  1812. }
  1813. static bool set(const gguf_context * ctx, const int k, T & target, const struct llama_model_kv_override *override = nullptr) {
  1814. if (try_override<T>(target, override)) {
  1815. return true;
  1816. }
  1817. if (k < 0) { return false; }
  1818. target = get_kv(ctx, k);
  1819. return true;
  1820. }
  1821. static bool set(const gguf_context * ctx, const char * key, T & target, const struct llama_model_kv_override *override = nullptr) {
  1822. return set(ctx, gguf_find_key(ctx, key), target, override);
  1823. }
  1824. static bool set(const gguf_context * ctx, const std::string & key, T & target, const struct llama_model_kv_override *override = nullptr) {
  1825. return set(ctx, key.c_str(), target, override);
  1826. }
  1827. };
  1828. }
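// The GGUFMeta helpers above give the loader a single, type-checked entry point for reading GGUF
// key-value pairs, with command-line overrides (when present) taking precedence over the file contents.
// A minimal, illustrative sketch of direct use, assuming the corresponding GKV_Base specializations
// earlier in this file (the key names are examples only; llama_model_loader::get_key() below is the
// real call site):
//
//   std::string arch;
//   uint32_t    n_ctx_train = 0;
//   GGUFMeta::GKV<std::string>::set(ctx_gguf, "general.architecture", arch);
//   GGUFMeta::GKV<uint32_t>::set(ctx_gguf, "llama.context_length", n_ctx_train, /*override =*/ nullptr);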
  1829. struct llama_model_loader {
  1830. int n_kv = 0;
  1831. int n_tensors = 0;
  1832. int n_created = 0;
  1833. int64_t n_elements = 0;
  1834. size_t n_bytes = 0;
  1835. bool use_mmap = false;
  1836. llama_file file;
  1837. llama_ftype ftype;
  1838. llama_fver fver;
  1839. std::unique_ptr<llama_mmap> mapping;
  1840. std::unordered_map<std::string, struct llama_model_kv_override> kv_overrides;
  1841. struct gguf_context * ctx_gguf = NULL;
  1842. struct ggml_context * ctx_meta = NULL;
  1843. std::string arch_name;
  1844. LLM_KV llm_kv = LLM_KV(LLM_ARCH_UNKNOWN);
  1845. llama_model_loader(const std::string & fname, bool use_mmap, const struct llama_model_kv_override * param_overrides_p) : file(fname.c_str(), "rb") {
  1846. struct gguf_init_params params = {
  1847. /*.no_alloc = */ true,
  1848. /*.ctx = */ &ctx_meta,
  1849. };
  1850. if (param_overrides_p != nullptr) {
  1851. for (const struct llama_model_kv_override *p = param_overrides_p; p->key[0] != 0; p++) {
  1852. kv_overrides.insert({std::string(p->key), *p});
  1853. }
  1854. }
  1855. ctx_gguf = gguf_init_from_file(fname.c_str(), params);
  1856. if (!ctx_gguf) {
  1857. throw std::runtime_error(format("%s: failed to load model from %s\n", __func__, fname.c_str()));
  1858. }
  1859. get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false);
  1860. llm_kv = LLM_KV(llm_arch_from_string(arch_name));
  1861. n_kv = gguf_get_n_kv(ctx_gguf);
  1862. n_tensors = gguf_get_n_tensors(ctx_gguf);
  1863. fver = (enum llama_fver ) gguf_get_version(ctx_gguf);
  1864. for (int i = 0; i < n_tensors; i++) {
  1865. const char * name = gguf_get_tensor_name(ctx_gguf, i);
  1866. struct ggml_tensor * t = ggml_get_tensor(ctx_meta, name);
  1867. n_elements += ggml_nelements(t);
  1868. n_bytes += ggml_nbytes(t);
  1869. }
  1870. LLAMA_LOG_INFO("%s: loaded meta data with %d key-value pairs and %d tensors from %s (version %s)\n",
  1871. __func__, n_kv, n_tensors, fname.c_str(), llama_file_version_name(fver));
1872. // determine file type based on the number of tensors for each quantization and print metadata
  1873. // TODO: make optional
  1874. {
  1875. std::map<enum ggml_type, uint32_t> n_type;
  1876. uint32_t n_type_max = 0;
  1877. enum ggml_type type_max = GGML_TYPE_F32;
  1878. for (int i = 0; i < n_tensors; i++) {
  1879. enum ggml_type type = gguf_get_tensor_type(ctx_gguf, i);
  1880. n_type[type]++;
  1881. if (n_type_max < n_type[type]) {
  1882. n_type_max = n_type[type];
  1883. type_max = type;
  1884. }
  1885. // TODO: make runtime configurable
  1886. #if 0
  1887. struct ggml_tensor * meta = ggml_get_tensor(ctx_meta, gguf_get_tensor_name(ctx_gguf, i));
  1888. LLAMA_LOG_INFO("%s: - tensor %4d: %32s %-8s [ %s ]\n", __func__, i, ggml_get_name(meta), ggml_type_name(type), llama_format_tensor_shape(meta).c_str());
  1889. #endif
  1890. }
  1891. switch (type_max) {
  1892. case GGML_TYPE_F32: ftype = LLAMA_FTYPE_ALL_F32; break;
  1893. case GGML_TYPE_F16: ftype = LLAMA_FTYPE_MOSTLY_F16; break;
  1894. case GGML_TYPE_Q4_0: ftype = LLAMA_FTYPE_MOSTLY_Q4_0; break;
  1895. case GGML_TYPE_Q4_1: ftype = LLAMA_FTYPE_MOSTLY_Q4_1; break;
  1896. case GGML_TYPE_Q5_0: ftype = LLAMA_FTYPE_MOSTLY_Q5_0; break;
  1897. case GGML_TYPE_Q5_1: ftype = LLAMA_FTYPE_MOSTLY_Q5_1; break;
  1898. case GGML_TYPE_Q8_0: ftype = LLAMA_FTYPE_MOSTLY_Q8_0; break;
  1899. case GGML_TYPE_Q2_K: ftype = LLAMA_FTYPE_MOSTLY_Q2_K; break;
  1900. case GGML_TYPE_Q3_K: ftype = LLAMA_FTYPE_MOSTLY_Q3_K_M; break;
  1901. case GGML_TYPE_Q4_K: ftype = LLAMA_FTYPE_MOSTLY_Q4_K_M; break;
  1902. case GGML_TYPE_Q5_K: ftype = LLAMA_FTYPE_MOSTLY_Q5_K_M; break;
  1903. case GGML_TYPE_Q6_K: ftype = LLAMA_FTYPE_MOSTLY_Q6_K; break;
  1904. case GGML_TYPE_IQ2_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ2_XXS; break;
  1905. case GGML_TYPE_IQ2_XS: ftype = LLAMA_FTYPE_MOSTLY_IQ2_XS; break;
  1906. default:
  1907. {
  1908. LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, ggml_type_name(type_max));
  1909. ftype = LLAMA_FTYPE_ALL_F32;
  1910. } break;
  1911. }
  1912. // this is a way to mark that we have "guessed" the file type
  1913. ftype = (llama_ftype) (ftype | LLAMA_FTYPE_GUESSED);
  1914. {
  1915. const int kid = gguf_find_key(ctx_gguf, "general.file_type");
  1916. if (kid >= 0) {
  1917. ftype = (llama_ftype) gguf_get_val_u32(ctx_gguf, kid);
  1918. }
  1919. }
  1920. LLAMA_LOG_INFO("%s: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n", __func__);
  1921. for (int i = 0; i < n_kv; i++) {
  1922. const char * name = gguf_get_key(ctx_gguf, i);
  1923. const enum gguf_type type = gguf_get_kv_type(ctx_gguf, i);
  1924. const std::string type_name =
  1925. type == GGUF_TYPE_ARRAY
  1926. ? format("%s[%s,%d]", gguf_type_name(type), gguf_type_name(gguf_get_arr_type(ctx_gguf, i)), gguf_get_arr_n(ctx_gguf, i))
  1927. : gguf_type_name(type);
  1928. std::string value = gguf_kv_to_str(ctx_gguf, i);
  1929. const size_t MAX_VALUE_LEN = 40;
  1930. if (value.size() > MAX_VALUE_LEN) {
  1931. value = format("%s...", value.substr(0, MAX_VALUE_LEN - 3).c_str());
  1932. }
  1933. replace_all(value, "\n", "\\n");
  1934. LLAMA_LOG_INFO("%s: - kv %3d: %42s %-16s = %s\n", __func__, i, name, type_name.c_str(), value.c_str());
  1935. }
  1936. // print type counts
  1937. for (auto & kv : n_type) {
  1938. if (kv.second == 0) {
  1939. continue;
  1940. }
  1941. LLAMA_LOG_INFO("%s: - type %4s: %4d tensors\n", __func__, ggml_type_name(kv.first), kv.second);
  1942. }
  1943. }
  1944. if (!llama_mmap::SUPPORTED) {
  1945. LLAMA_LOG_WARN("%s: mmap is not supported on this platform\n", __func__);
  1946. use_mmap = false;
  1947. }
  1948. this->use_mmap = use_mmap;
  1949. }
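// Overrides collected in the constructor take precedence over file values inside get_key(); the metadata
// dump above intentionally shows the raw file contents (hence the "KV overrides do not apply" note).
// A hedged sketch of how a caller might build the list passed as param_overrides_p, assuming key is a
// fixed-size char buffer as its use above suggests (the key name is illustrative; the array is terminated
// by an entry whose key is empty):
//
//   struct llama_model_kv_override overrides[2] = {};
//   std::strncpy(overrides[0].key, "llama.expert_used_count", sizeof(overrides[0].key) - 1);
//   overrides[0].tag       = LLAMA_KV_OVERRIDE_INT;
//   overrides[0].int_value = 2;
//   // overrides[1] stays zero-initialized -> its empty key terminates the list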
  1950. ~llama_model_loader() {
  1951. if (ctx_gguf) {
  1952. gguf_free(ctx_gguf);
  1953. }
  1954. if (ctx_meta) {
  1955. ggml_free(ctx_meta);
  1956. }
  1957. }
  1958. template<typename T>
  1959. typename std::enable_if<std::is_integral<T>::value, bool>::type
  1960. get_arr_n(const std::string & key, T & result, const bool required = true) {
  1961. const int kid = gguf_find_key(ctx_gguf, key.c_str());
  1962. if (kid < 0) {
  1963. if (required) {
  1964. throw std::runtime_error(format("key not found in model: %s", key.c_str()));
  1965. }
  1966. return false;
  1967. }
  1968. struct GGUFMeta::ArrayInfo arr_info =
  1969. GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(ctx_gguf, kid);
  1970. result = arr_info.length;
  1971. return true;
  1972. }
  1973. template<typename T>
  1974. typename std::enable_if<std::is_integral<T>::value, bool>::type
  1975. get_arr_n(const enum llm_kv kid, T & result, const bool required = true) {
  1976. return get_arr_n(llm_kv(kid), result, required);
  1977. }
  1978. template<typename T>
  1979. bool get_key(const std::string & key, T & result, const bool required = true) {
  1980. auto it = kv_overrides.find(key);
  1981. const struct llama_model_kv_override * override =
  1982. it != kv_overrides.end() ? &it->second : nullptr;
  1983. const bool found = GGUFMeta::GKV<T>::set(ctx_gguf, key, result, override);
  1984. if (required && !found) {
  1985. throw std::runtime_error(format("key not found in model: %s", key.c_str()));
  1986. }
  1987. return found;
  1988. }
  1989. template<typename T>
  1990. bool get_key(const enum llm_kv kid, T & result, const bool required = true) {
  1991. return get_key(llm_kv(kid), result, required);
  1992. }
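// Typical usage, as in llm_load_hparams() further below: required keys throw if missing, while optional
// keys (required = false) leave the target untouched when absent:
//
//   ml.get_arr_n(LLM_KV_TOKENIZER_LIST, hparams.n_vocab);          // array length of the token list
//   ml.get_key  (LLM_KV_CONTEXT_LENGTH, hparams.n_ctx_train);      // required scalar
//   ml.get_key  (LLM_KV_EXPERT_COUNT,   hparams.n_expert, false);  // optional scalar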
  1993. std::string get_arch_name() const {
  1994. return arch_name;
  1995. }
  1996. enum llm_arch get_arch() const {
  1997. return llm_kv.arch;
  1998. }
  1999. const char * get_tensor_name(int i) const {
  2000. return gguf_get_tensor_name(ctx_gguf, i);
  2001. }
  2002. struct ggml_tensor * get_tensor_meta(const char * name) const {
  2003. return ggml_get_tensor(ctx_meta, name);
  2004. }
  2005. struct ggml_tensor * get_tensor_meta(int i) const {
  2006. return get_tensor_meta(get_tensor_name(i));
  2007. }
  2008. struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, struct ggml_tensor * meta, ggml_backend_type backend) {
  2009. struct ggml_tensor * tensor = ggml_dup_tensor(ctx, meta);
  2010. tensor->backend = backend; // TODO: ggml_set_backend
  2011. ggml_set_name(tensor, ggml_get_name(meta));
  2012. n_created++;
  2013. return tensor;
  2014. }
  2015. struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector<int64_t> & ne, ggml_backend_type backend, bool required = true) {
  2016. struct ggml_tensor * cur = ggml_get_tensor(ctx_meta, name.c_str());
  2017. if (cur == NULL) {
  2018. if (!required) {
  2019. return NULL;
  2020. }
  2021. throw std::runtime_error(format("%s: tensor '%s' not found", __func__, name.c_str()));
  2022. }
  2023. if (backend == GGML_BACKEND_GPU_SPLIT) {
  2024. if (ne.size() == 1) {
  2025. throw std::runtime_error(format("%s: 1-dimensional tensor '%s' cannot be split on the GPU", __func__, name.c_str()));
  2026. }
  2027. }
  2028. {
  2029. bool is_ok = true;
  2030. for (size_t i = 0; i < ne.size(); ++i) {
  2031. if (ne[i] != cur->ne[i]) {
  2032. is_ok = false;
  2033. break;
  2034. }
  2035. }
  2036. if (!is_ok) {
  2037. throw std::runtime_error(
  2038. format("%s: tensor '%s' has wrong shape; expected %s, got %s",
  2039. __func__, name.c_str(),
  2040. llama_format_tensor_shape(ne).c_str(),
  2041. llama_format_tensor_shape(cur).c_str()));
  2042. }
  2043. }
  2044. return create_tensor_for(ctx, cur, backend);
  2045. }
  2046. void done_getting_tensors() const {
  2047. if (n_created != n_tensors) {
  2048. throw std::runtime_error(format("%s: wrong number of tensors; expected %d, got %d", __func__, n_tensors, n_created));
  2049. }
  2050. }
  2051. size_t file_offset(const char * name) const {
  2052. const int idx = gguf_find_tensor(ctx_gguf, name);
  2053. if (idx < 0) {
  2054. throw std::runtime_error(format("%s: tensor '%s' not found in the file", __func__, name));
  2055. }
  2056. return gguf_get_data_offset(ctx_gguf) + gguf_get_tensor_offset(ctx_gguf, idx);
  2057. }
  2058. void init_mapping(bool prefetch = true) {
  2059. /*
  2060. // prefetch only CPU tensors
  2061. if (use_mmap) {
  2062. size_t size_pref = 0; // prefetch
  2063. for (int i = 0; i < gguf_get_n_tensors(ctx_gguf); i++) {
  2064. struct ggml_tensor * cur = ggml_get_tensor(ctx, gguf_get_tensor_name(ctx_gguf, i));
  2065. if (cur->backend == GGML_BACKEND_CPU) {
  2066. size_t tensor_end = gguf_get_tensor_offset(ctx_gguf, i) + ggml_nbytes(cur);
  2067. size_pref = std::max(size_pref, tensor_end);
  2068. }
  2069. }
  2070. mapping.reset(new llama_mmap(&file, gguf_get_data_offset(ctx_gguf) + size_pref, ggml_is_numa()));
  2071. }
  2072. */
  2073. // prefetch the whole file - all the data is needed anyway
  2074. if (use_mmap) {
  2075. mapping.reset(new llama_mmap(&file, prefetch ? -1 : 0, ggml_is_numa()));
  2076. }
  2077. }
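// Note: the whole file is mapped either way; the second argument (-1 when prefetching, 0 otherwise)
// appears to control how much of the mapping llama_mmap asks the OS to read ahead. The commented-out
// block above is an earlier variant that prefetched only the CPU-resident tensors.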
  2078. // for backwards compatibility, does not support ggml-backend
  2079. void load_data_for(struct ggml_tensor * cur) const {
  2080. const size_t offs = file_offset(ggml_get_name(cur));
  2081. if (use_mmap && mapping) {
  2082. GGML_ASSERT(cur->data == nullptr);
  2083. cur->data = (uint8_t *)mapping->addr + offs;
  2084. } else {
  2085. GGML_ASSERT(cur->data != nullptr);
  2086. file.seek(offs, SEEK_SET);
  2087. file.read_raw(cur->data, ggml_nbytes(cur));
  2088. }
  2089. }
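// With mmap the tensor's data pointer is simply aimed into the mapping (no copy, hence the assert that it
// is still null); without mmap the destination buffer must already be allocated and the bytes are read
// from the file at the tensor's offset.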
  2090. // Returns false if cancelled by progress_callback
  2091. bool load_all_data(struct ggml_context * ctx, llama_progress_callback progress_callback, void * progress_callback_user_data, ggml_backend_buffer_t buf_mmap, llama_mlock * lmlock) const {
  2092. size_t size_data = 0;
  2093. for (int i = 0; i < gguf_get_n_tensors(ctx_gguf); i++) {
  2094. struct ggml_tensor * cur = ggml_get_tensor(ctx, gguf_get_tensor_name(ctx_gguf, i));
  2095. size_data += ggml_nbytes(cur);
  2096. }
  2097. if (use_mmap && buf_mmap) {
  2098. if (lmlock) {
  2099. lmlock->init(mapping->addr);
  2100. }
  2101. }
  2102. #if (defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)) || defined(GGML_USE_CLBLAST)
  2103. const bool legacy_offload = true;
  2104. #else
  2105. const bool legacy_offload = false;
  2106. #endif
  2107. std::vector<no_init<uint8_t>> read_buf;
  2108. size_t size_done = 0;
  2109. size_t mmap_first = -1;
  2110. size_t mmap_last = 0;
  2111. for (int i = 0; i < gguf_get_n_tensors(ctx_gguf); i++) {
  2112. struct ggml_tensor * cur = ggml_get_tensor(ctx, gguf_get_tensor_name(ctx_gguf, i));
  2113. GGML_ASSERT(cur); // unused tensors should have been caught by load_data already
  2114. if (progress_callback) {
  2115. if (!progress_callback((float) size_done / size_data, progress_callback_user_data)) {
  2116. return false;
  2117. }
  2118. }
  2119. const size_t offs = file_offset(ggml_get_name(cur));
  2120. if (!legacy_offload || cur->backend == GGML_BACKEND_CPU) {
  2121. if (use_mmap && mapping) {
  2122. if (buf_mmap) {
  2123. ggml_backend_tensor_alloc(buf_mmap, cur, (uint8_t *) mapping->addr + offs);
  2124. if (lmlock) {
  2125. lmlock->grow_to(offs + ggml_nbytes(cur));
  2126. }
  2127. mmap_first = std::min(mmap_first, offs);
  2128. mmap_last = std::max(mmap_last, offs + ggml_nbytes(cur));
  2129. } else {
  2130. ggml_backend_tensor_set(cur, (uint8_t *) mapping->addr + offs, 0, ggml_nbytes(cur));
  2131. }
  2132. } else {
  2133. if (ggml_backend_buffer_is_host(cur->buffer)) {
  2134. file.seek(offs, SEEK_SET);
  2135. file.read_raw(cur->data, ggml_nbytes(cur));
  2136. } else {
  2137. read_buf.resize(ggml_nbytes(cur));
  2138. file.seek(offs, SEEK_SET);
  2139. file.read_raw(read_buf.data(), ggml_nbytes(cur));
  2140. ggml_backend_tensor_set(cur, read_buf.data(), 0, ggml_nbytes(cur));
  2141. }
  2142. }
  2143. } else {
  2144. // HACK: mark tensor as allocated
  2145. cur->data = (void *)(uintptr_t)1;
  2146. void * data;
  2147. if (use_mmap && mapping) {
  2148. data = (uint8_t *) mapping->addr + offs;
  2149. } else {
  2150. read_buf.resize(ggml_nbytes(cur));
  2151. file.seek(offs, SEEK_SET);
  2152. file.read_raw(read_buf.data(), ggml_nbytes(cur));
  2153. data = read_buf.data();
  2154. }
  2155. #if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)
  2156. ggml_cuda_transform_tensor(data, cur);
  2157. #elif defined(GGML_USE_CLBLAST)
  2158. GGML_ASSERT(cur->backend == GGML_BACKEND_GPU);
  2159. ggml_cl_transform_tensor(data, cur);
  2160. #else
  2161. GGML_ASSERT(!"GPU tensor without a GPU backend");
  2162. GGML_UNUSED(data);
  2163. #endif
  2164. }
  2165. size_done += ggml_nbytes(cur);
  2166. }
  2167. // unmap offloaded tensors and metadata
  2168. if (use_mmap && mapping) {
  2169. mapping->unmap_fragment(0, mmap_first);
  2170. mapping->unmap_fragment(mmap_last, mapping->size);
  2171. }
  2172. if (progress_callback) {
  2173. // Even though the model is done loading, we still honor
  2174. // cancellation since we need to free allocations.
  2175. return progress_callback(1.0f, progress_callback_user_data);
  2176. }
  2177. return true;
  2178. }
  2179. };
  2180. //
  2181. // load LLaMA models
  2182. //
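// Rough, illustrative sketch of how the loader and the llm_load_* helpers below fit together (this is
// not the actual call site):
//
//   llama_model_loader ml(fname, use_mmap, kv_overrides);
//   llm_load_arch      (ml, model);
//   llm_load_hparams   (ml, model);
//   llm_load_vocab     (ml, model);
//   llm_load_print_meta(ml, model);
//   llm_load_tensors   (ml, model, n_gpu_layers, main_gpu, tensor_split, use_mlock,
//                       progress_callback, progress_callback_user_data);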
  2183. static std::string llama_model_arch_name(llm_arch arch) {
  2184. auto it = LLM_ARCH_NAMES.find(arch);
  2185. if (it == LLM_ARCH_NAMES.end()) {
  2186. return "unknown";
  2187. }
  2188. return it->second;
  2189. }
  2190. static std::string llama_model_ftype_name(llama_ftype ftype) {
  2191. if (ftype & LLAMA_FTYPE_GUESSED) {
  2192. return llama_model_ftype_name((enum llama_ftype) (ftype & ~LLAMA_FTYPE_GUESSED)) + " (guessed)";
  2193. }
  2194. switch (ftype) {
  2195. case LLAMA_FTYPE_ALL_F32: return "all F32";
  2196. case LLAMA_FTYPE_MOSTLY_F16: return "F16";
  2197. case LLAMA_FTYPE_MOSTLY_Q4_0: return "Q4_0";
  2198. case LLAMA_FTYPE_MOSTLY_Q4_1: return "Q4_1";
  2199. case LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16:
  2200. return "Q4_1, some F16";
  2201. case LLAMA_FTYPE_MOSTLY_Q5_0: return "Q5_0";
  2202. case LLAMA_FTYPE_MOSTLY_Q5_1: return "Q5_1";
  2203. case LLAMA_FTYPE_MOSTLY_Q8_0: return "Q8_0";
  2204. // K-quants
  2205. case LLAMA_FTYPE_MOSTLY_Q2_K: return "Q2_K - Medium";
  2206. case LLAMA_FTYPE_MOSTLY_Q2_K_S: return "Q2_K - Small";
  2207. case LLAMA_FTYPE_MOSTLY_Q3_K_S: return "Q3_K - Small";
  2208. case LLAMA_FTYPE_MOSTLY_Q3_K_M: return "Q3_K - Medium";
  2209. case LLAMA_FTYPE_MOSTLY_Q3_K_L: return "Q3_K - Large";
  2210. case LLAMA_FTYPE_MOSTLY_Q4_K_S: return "Q4_K - Small";
  2211. case LLAMA_FTYPE_MOSTLY_Q4_K_M: return "Q4_K - Medium";
  2212. case LLAMA_FTYPE_MOSTLY_Q5_K_S: return "Q5_K - Small";
  2213. case LLAMA_FTYPE_MOSTLY_Q5_K_M: return "Q5_K - Medium";
  2214. case LLAMA_FTYPE_MOSTLY_Q6_K: return "Q6_K";
2215. case LLAMA_FTYPE_MOSTLY_IQ2_XXS: return "IQ2_XXS - 2.0625 bpw";
  2216. case LLAMA_FTYPE_MOSTLY_IQ2_XS: return "IQ2_XS - 2.3125 bpw";
  2217. default: return "unknown, may not work";
  2218. }
  2219. }
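// Example: a file without a "general.file_type" key whose most common tensor type is Q4_0 is tagged
// LLAMA_FTYPE_MOSTLY_Q4_0 | LLAMA_FTYPE_GUESSED by the loader above, which this function renders as
// "Q4_0 (guessed)".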
  2220. static const char * llama_model_type_name(e_model type) {
  2221. switch (type) {
  2222. case MODEL_1B: return "1B";
  2223. case MODEL_3B: return "3B";
  2224. case MODEL_7B: return "7B";
  2225. case MODEL_8B: return "8B";
  2226. case MODEL_13B: return "13B";
  2227. case MODEL_15B: return "15B";
  2228. case MODEL_30B: return "30B";
  2229. case MODEL_34B: return "34B";
  2230. case MODEL_40B: return "40B";
  2231. case MODEL_65B: return "65B";
  2232. case MODEL_70B: return "70B";
  2233. case MODEL_SMALL: return "0.1B";
  2234. case MODEL_MEDIUM: return "0.4B";
  2235. case MODEL_LARGE: return "0.8B";
  2236. case MODEL_XL: return "1.5B";
  2237. default: return "?B";
  2238. }
  2239. }
  2240. static void llm_load_arch(llama_model_loader & ml, llama_model & model) {
  2241. model.arch = ml.get_arch();
  2242. if (model.arch == LLM_ARCH_UNKNOWN) {
  2243. throw std::runtime_error("unknown model architecture: '" + ml.get_arch_name() + "'");
  2244. }
  2245. }
  2246. static void llm_load_hparams(
  2247. llama_model_loader & ml,
  2248. llama_model & model) {
  2249. auto & hparams = model.hparams;
  2250. const gguf_context * ctx = ml.ctx_gguf;
  2251. // get metadata as string
  2252. for (int i = 0; i < gguf_get_n_kv(ctx); i++) {
  2253. enum gguf_type type = gguf_get_kv_type(ctx, i);
  2254. if (type == GGUF_TYPE_ARRAY) {
  2255. continue;
  2256. }
  2257. const char * name = gguf_get_key(ctx, i);
  2258. const std::string value = gguf_kv_to_str(ctx, i);
  2259. model.gguf_kv.emplace(name, value);
  2260. }
  2261. // get general kv
  2262. ml.get_key(LLM_KV_GENERAL_NAME, model.name, false);
  2263. // get hparams kv
  2264. ml.get_arr_n(LLM_KV_TOKENIZER_LIST, hparams.n_vocab);
  2265. ml.get_key (LLM_KV_CONTEXT_LENGTH, hparams.n_ctx_train);
  2266. ml.get_key (LLM_KV_EMBEDDING_LENGTH, hparams.n_embd);
  2267. ml.get_key (LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff);
  2268. ml.get_key (LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head);
  2269. ml.get_key (LLM_KV_BLOCK_COUNT, hparams.n_layer);
  2270. ml.get_key (LLM_KV_EXPERT_COUNT, hparams.n_expert, false);
  2271. ml.get_key (LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used, false);
  2272. GGML_ASSERT(hparams.n_expert <= LLAMA_MAX_EXPERTS);
  2273. GGML_ASSERT(hparams.n_expert_used <= hparams.n_expert);
  2274. if (hparams.n_expert > 0) {
  2275. GGML_ASSERT(hparams.n_expert_used > 0);
  2276. } else {
  2277. GGML_ASSERT(hparams.n_expert_used == 0);
  2278. }
  2279. // n_head_kv is optional, default to n_head
  2280. hparams.n_head_kv = hparams.n_head;
  2281. ml.get_key(LLM_KV_ATTENTION_HEAD_COUNT_KV, hparams.n_head_kv, false);
  2282. bool rope_finetuned = false;
  2283. ml.get_key(LLM_KV_ROPE_SCALING_FINETUNED, rope_finetuned, false);
  2284. hparams.rope_finetuned = rope_finetuned;
  2285. hparams.n_yarn_orig_ctx = hparams.n_ctx_train;
  2286. ml.get_key(LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, hparams.n_yarn_orig_ctx, false);
  2287. // rope_freq_base (optional)
  2288. hparams.rope_freq_base_train = 10000.0f;
  2289. ml.get_key(LLM_KV_ROPE_FREQ_BASE, hparams.rope_freq_base_train, false);
  2290. std::string rope_scaling("linear");
  2291. ml.get_key(LLM_KV_ROPE_SCALING_TYPE, rope_scaling, false);
  2292. hparams.rope_scaling_type_train = llama_rope_scaling_type_from_string(rope_scaling);
  2293. GGML_ASSERT(hparams.rope_scaling_type_train != LLAMA_ROPE_SCALING_UNSPECIFIED);
  2294. // rope_freq_scale (inverse of the kv) is optional
  2295. float ropescale = 0.0f;
  2296. if (!ml.get_key(LLM_KV_ROPE_SCALING_FACTOR, ropescale, false)) {
  2297. // try the old key name
  2298. ml.get_key(LLM_KV_ROPE_SCALE_LINEAR, ropescale, false);
  2299. }
  2300. hparams.rope_freq_scale_train = ropescale == 0.0f ? 1.0f : 1.0f/ropescale;
  2301. // sanity check for n_rot (optional)
  2302. {
  2303. hparams.n_rot = hparams.n_embd / hparams.n_head;
  2304. ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot, false);
  2305. if (model.arch == LLM_ARCH_LLAMA || model.arch == LLM_ARCH_FALCON) {
  2306. if (hparams.n_rot != hparams.n_embd / hparams.n_head) {
  2307. throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd / hparams.n_head));
  2308. }
  2309. }
  2310. // gpt-neox n_rot = rotary_pct * (n_embd / n_head)
  2311. // gpt-j n_rot = rotary_dim
  2312. }
  2313. hparams.n_embd_head_k = hparams.n_embd / hparams.n_head;
  2314. ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH, hparams.n_embd_head_k, false);
  2315. hparams.n_embd_head_v = hparams.n_embd / hparams.n_head;
  2316. ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH, hparams.n_embd_head_v, false);
  2317. // arch-specific KVs
  2318. switch (model.arch) {
  2319. case LLM_ARCH_LLAMA:
  2320. {
  2321. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  2322. switch (hparams.n_layer) {
  2323. case 22: model.type = e_model::MODEL_1B; break;
  2324. case 26: model.type = e_model::MODEL_3B; break;
  2325. case 32: model.type = e_model::MODEL_7B; break;
  2326. case 40: model.type = e_model::MODEL_13B; break;
  2327. case 48: model.type = e_model::MODEL_34B; break;
  2328. case 60: model.type = e_model::MODEL_30B; break;
  2329. case 80: model.type = hparams.n_head == hparams.n_head_kv ? e_model::MODEL_65B : e_model::MODEL_70B; break;
  2330. default: model.type = e_model::MODEL_UNKNOWN;
  2331. }
  2332. } break;
  2333. case LLM_ARCH_FALCON:
  2334. {
  2335. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  2336. switch (hparams.n_layer) {
  2337. case 32: model.type = e_model::MODEL_7B; break;
  2338. case 60: model.type = e_model::MODEL_40B; break;
  2339. default: model.type = e_model::MODEL_UNKNOWN;
  2340. }
  2341. } break;
  2342. case LLM_ARCH_BAICHUAN:
  2343. {
  2344. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  2345. switch (hparams.n_layer) {
  2346. case 32: model.type = e_model::MODEL_7B; break;
  2347. case 40: model.type = e_model::MODEL_13B; break;
  2348. default: model.type = e_model::MODEL_UNKNOWN;
  2349. }
  2350. } break;
  2351. case LLM_ARCH_STARCODER:
  2352. {
  2353. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  2354. switch (hparams.n_layer) {
  2355. case 24: model.type = e_model::MODEL_1B; break;
  2356. case 36: model.type = e_model::MODEL_3B; break;
  2357. case 42: model.type = e_model::MODEL_7B; break;
  2358. case 40: model.type = e_model::MODEL_15B; break;
  2359. default: model.type = e_model::MODEL_UNKNOWN;
  2360. }
  2361. } break;
  2362. case LLM_ARCH_PERSIMMON:
  2363. {
  2364. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  2365. switch (hparams.n_layer) {
  2366. case 36: model.type = e_model::MODEL_8B; break;
  2367. default: model.type = e_model::MODEL_UNKNOWN;
  2368. }
  2369. } break;
  2370. case LLM_ARCH_REFACT:
  2371. {
  2372. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  2373. switch (hparams.n_layer) {
  2374. case 32: model.type = e_model::MODEL_1B; break;
  2375. default: model.type = e_model::MODEL_UNKNOWN;
  2376. }
  2377. } break;
  2378. case LLM_ARCH_BLOOM:
  2379. {
  2380. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  2381. switch (hparams.n_layer) {
  2382. case 24: model.type = e_model::MODEL_1B; break;
  2383. case 30:
  2384. switch (hparams.n_embd) {
  2385. case 2560: model.type = e_model::MODEL_3B; break;
  2386. case 4096: model.type = e_model::MODEL_7B; break;
  2387. } break;
  2388. }
  2389. } break;
  2390. case LLM_ARCH_MPT:
  2391. {
  2392. hparams.f_clamp_kqv = 0.0f;
  2393. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  2394. ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV, hparams.f_clamp_kqv, false);
  2395. ml.get_key(LLM_KV_ATTENTION_MAX_ALIBI_BIAS, hparams.f_max_alibi_bias);
  2396. switch (hparams.n_layer) {
  2397. case 32: model.type = e_model::MODEL_7B; break;
  2398. case 48: model.type = e_model::MODEL_30B; break;
  2399. default: model.type = e_model::MODEL_UNKNOWN;
  2400. }
  2401. } break;
  2402. case LLM_ARCH_STABLELM:
  2403. {
  2404. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  2405. switch (hparams.n_layer) {
  2406. case 32: model.type = e_model::MODEL_3B; break;
  2407. default: model.type = e_model::MODEL_UNKNOWN;
  2408. }
  2409. } break;
  2410. case LLM_ARCH_QWEN:
  2411. {
  2412. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  2413. switch (hparams.n_layer) {
  2414. case 32: model.type = e_model::MODEL_7B; break;
  2415. case 40: model.type = e_model::MODEL_13B; break;
  2416. default: model.type = e_model::MODEL_UNKNOWN;
  2417. }
  2418. } break;
  2419. case LLM_ARCH_PHI2:
  2420. {
  2421. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  2422. switch (hparams.n_layer) {
  2423. case 24: model.type = e_model::MODEL_1B; break;
  2424. case 32: model.type = e_model::MODEL_3B; break;
  2425. default: model.type = e_model::MODEL_UNKNOWN;
  2426. }
  2427. } break;
  2428. case LLM_ARCH_PLAMO:
  2429. {
  2430. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  2431. switch (hparams.n_layer) {
  2432. case 40: model.type = e_model::MODEL_13B; break;
  2433. default: model.type = e_model::MODEL_UNKNOWN;
  2434. }
  2435. } break;
  2436. case LLM_ARCH_GPT2:
  2437. {
  2438. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  2439. switch (hparams.n_layer) {
  2440. case 12: model.type = e_model::MODEL_SMALL; break;
  2441. case 24: model.type = e_model::MODEL_MEDIUM; break;
  2442. case 36: model.type = e_model::MODEL_LARGE; break;
  2443. case 48: model.type = e_model::MODEL_XL; break;
  2444. default: model.type = e_model::MODEL_UNKNOWN;
  2445. }
  2446. } break;
  2447. default: (void)0;
  2448. }
  2449. model.ftype = ml.ftype;
  2450. }
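// The per-architecture switches above infer the model size class mostly from n_layer (plus n_head vs
// n_head_kv for 80-layer LLaMA models to distinguish 65B from 70B, and n_embd for 30-layer BLOOM);
// unrecognized layer counts fall back to MODEL_UNKNOWN.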
  2451. // TODO: This should probably be in llama.h
  2452. static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab, std::string raw_text, bool bos, bool special = false);
  2453. static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch);
  2454. static void llm_load_vocab(
  2455. llama_model_loader & ml,
  2456. llama_model & model) {
  2457. auto & vocab = model.vocab;
  2458. struct gguf_context * ctx = ml.ctx_gguf;
  2459. const auto kv = LLM_KV(model.arch);
  2460. const int token_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_LIST).c_str());
  2461. if (token_idx == -1) {
  2462. throw std::runtime_error("cannot find tokenizer vocab in model file\n");
  2463. }
  2464. const float * scores = nullptr;
  2465. const int score_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_SCORES).c_str());
  2466. if (score_idx != -1) {
  2467. scores = (const float * ) gguf_get_arr_data(ctx, score_idx);
  2468. }
  2469. const int * toktypes = nullptr;
  2470. const int toktype_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_TOKEN_TYPE).c_str());
  2471. if (toktype_idx != -1) {
  2472. toktypes = (const int * ) gguf_get_arr_data(ctx, toktype_idx);
  2473. }
  2474. // determine vocab type
  2475. {
  2476. std::string tokenizer_name;
  2477. ml.get_key(LLM_KV_TOKENIZER_MODEL, tokenizer_name);
  2478. if (tokenizer_name == "llama") {
  2479. vocab.type = LLAMA_VOCAB_TYPE_SPM;
  2480. // default special tokens
  2481. vocab.special_bos_id = 1;
  2482. vocab.special_eos_id = 2;
  2483. vocab.special_unk_id = 0;
  2484. vocab.special_sep_id = -1;
  2485. vocab.special_pad_id = -1;
  2486. } else if (tokenizer_name == "gpt2") {
  2487. vocab.type = LLAMA_VOCAB_TYPE_BPE;
  2488. // read bpe merges and populate bpe ranks
  2489. const int merges_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_MERGES).c_str());
  2490. if (merges_keyidx == -1) {
  2491. throw std::runtime_error("cannot find tokenizer merges in model file\n");
  2492. }
  2493. const int n_merges = gguf_get_arr_n(ctx, merges_keyidx);
  2494. for (int i = 0; i < n_merges; i++) {
  2495. const std::string word = gguf_get_arr_str(ctx, merges_keyidx, i);
  2496. GGML_ASSERT(codepoints_from_utf8(word).size() > 0);
  2497. std::string first;
  2498. std::string second;
  2499. const size_t pos = word.find(' ', 1);
  2500. if (pos != std::string::npos) {
  2501. first = word.substr(0, pos);
  2502. second = word.substr(pos + 1);
  2503. }
  2504. vocab.bpe_ranks.emplace(std::make_pair(first, second), i);
  2505. }
  2506. // default special tokens
  2507. vocab.special_bos_id = 11;
  2508. vocab.special_eos_id = 11;
  2509. vocab.special_unk_id = -1;
  2510. vocab.special_sep_id = -1;
  2511. vocab.special_pad_id = -1;
  2512. } else {
  2513. LLAMA_LOG_WARN("%s: unknown tokenizer: '%s'", __func__, tokenizer_name.c_str());
  2514. LLAMA_LOG_WARN("%s: using default tokenizer: 'llama'", __func__);
  2515. vocab.type = LLAMA_VOCAB_TYPE_SPM;
  2516. }
  2517. }
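// Note: each BPE merges entry read above is stored as "left right" (the two merge halves separated by a
// single space), which is why the loop splits each string at the first space after position 0 before
// inserting the pair into bpe_ranks.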
  2518. const uint32_t n_vocab = gguf_get_arr_n(ctx, token_idx);
  2519. vocab.id_to_token.resize(n_vocab);
  2520. for (uint32_t i = 0; i < n_vocab; i++) {
  2521. std::string word = gguf_get_arr_str(ctx, token_idx, i);
  2522. GGML_ASSERT(codepoints_from_utf8(word).size() > 0);
  2523. vocab.token_to_id[word] = i;
  2524. auto & token_data = vocab.id_to_token[i];
  2525. token_data.text = std::move(word);
  2526. token_data.score = scores ? scores[i] : 0.0f;
  2527. token_data.type = toktypes ? (llama_token_type) toktypes[i] : LLAMA_TOKEN_TYPE_NORMAL;
  2528. }
  2529. GGML_ASSERT(vocab.id_to_token.size() == vocab.token_to_id.size());
  2530. // determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n'
  2531. if (vocab.type == LLAMA_VOCAB_TYPE_SPM) {
  2532. vocab.linefeed_id = llama_byte_to_token(vocab, '\n');
  2533. } else {
  2534. const std::vector<int> ids = llama_tokenize_internal(vocab, "\u010A", false);
  2535. GGML_ASSERT(!ids.empty() && "model vocab missing newline token");
  2536. vocab.linefeed_id = ids[0];
  2537. }
  2538. // special tokens
  2539. {
  2540. const std::vector<std::pair<enum llm_kv, int32_t &>> special_token_types = {
  2541. { LLM_KV_TOKENIZER_BOS_ID, vocab.special_bos_id },
  2542. { LLM_KV_TOKENIZER_EOS_ID, vocab.special_eos_id },
  2543. { LLM_KV_TOKENIZER_UNK_ID, vocab.special_unk_id },
  2544. { LLM_KV_TOKENIZER_SEP_ID, vocab.special_sep_id },
  2545. { LLM_KV_TOKENIZER_PAD_ID, vocab.special_pad_id },
  2546. };
  2547. for (const auto & it : special_token_types) {
  2548. const std::string & key = kv(std::get<0>(it));
  2549. int32_t & id = std::get<1>(it);
  2550. uint32_t new_id;
  2551. if (!ml.get_key(std::get<0>(it), new_id, false)) {
  2552. continue;
  2553. }
  2554. if (new_id >= vocab.id_to_token.size()) {
  2555. LLAMA_LOG_WARN("%s: bad special token: '%s' = %ud, using default id %d\n",
  2556. __func__, key.c_str(), new_id, id);
  2557. } else {
  2558. id = new_id;
  2559. }
  2560. }
  2561. // Handle add_bos_token and add_eos_token
  2562. {
  2563. bool temp = true;
  2564. if (ml.get_key(LLM_KV_TOKENIZER_ADD_BOS, temp, false)) {
  2565. vocab.special_add_bos = int(temp);
  2566. }
  2567. if (ml.get_key(LLM_KV_TOKENIZER_ADD_EOS, temp, false)) {
  2568. vocab.special_add_eos = int(temp);
  2569. }
  2570. }
  2571. }
  2572. // build special tokens cache
  2573. {
2574. // TODO: It is unclear (to me) at this point whether special tokens are guaranteed to be of a deterministic type,
2575. // and whether they will always be correctly labeled in 'added_tokens.json' etc.
2576. // The assumption is that, since special tokens aren't meant to be exposed to the end user, they are designed
2577. // to be unmatchable by the tokenizer; therefore any tokens in the vocab that the tokenizer cannot match
2578. // are special tokens.
2579. // From testing, this appears to correlate 1:1 with special tokens.
2580. //
2581. // Counting special tokens and verifying in only one direction
2582. // is sufficient to detect a difference between those two sets.
2583. //
  2584. uint32_t special_tokens_count_by_type = 0;
  2585. uint32_t special_tokens_count_from_verification = 0;
  2586. bool special_tokens_definition_mismatch = false;
  2587. for (const auto & t : vocab.token_to_id) {
  2588. const auto & token = t.first;
  2589. const auto & id = t.second;
  2590. // Count all non-normal tokens in the vocab while iterating
  2591. if (vocab.id_to_token[id].type != LLAMA_TOKEN_TYPE_NORMAL) {
  2592. special_tokens_count_by_type++;
  2593. }
  2594. // Skip single character tokens
  2595. if (token.length() > 1) {
  2596. bool is_tokenizable = false;
2597. // Split the token's string representation in two, in all possible ways,
2598. // and check whether both halves can be matched to valid tokens
  2599. for (unsigned i = 1; i < token.length();) {
  2600. const auto left = token.substr(0, i);
  2601. const auto right = token.substr(i);
2602. // check that we didn't split in the middle of a UTF-8 sequence
  2603. auto utf = utf8_len(left.at(left.length() - 1));
  2604. if (utf == 1) {
  2605. if (vocab.token_to_id.find(left) != vocab.token_to_id.end() &&
  2606. vocab.token_to_id.find(right) != vocab.token_to_id.end() ) {
  2607. is_tokenizable = true;
  2608. break;
  2609. }
  2610. i++;
  2611. } else {
2612. // skip over the rest of the multi-byte UTF-8 sequence
  2613. i += utf - 1;
  2614. }
  2615. }
  2616. if (!is_tokenizable) {
2617. // Some tokens are multi-byte, but they are UTF-8 sequences whose rendered text length is 1;
2618. // it's faster to re-filter them here, since there are far fewer candidates now.
2619. // Calculate the total UTF-8 character length of the token's string representation
  2620. size_t utf8_str_len = 0;
  2621. for (unsigned i = 0; i < token.length();) {
  2622. utf8_str_len++;
  2623. i += utf8_len(token.at(i));
  2624. }
  2625. // And skip the ones which are one character
  2626. if (utf8_str_len > 1) {
  2627. // At this point what we have left are special tokens only
  2628. vocab.special_tokens_cache[token] = id;
  2629. // Count manually found special tokens
  2630. special_tokens_count_from_verification++;
  2631. // If this manually found special token is not marked as such, flag a mismatch
  2632. if (vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_NORMAL) {
  2633. special_tokens_definition_mismatch = true;
  2634. }
  2635. }
  2636. }
  2637. }
  2638. }
  2639. if (special_tokens_definition_mismatch || special_tokens_count_from_verification != special_tokens_count_by_type) {
  2640. LLAMA_LOG_WARN("%s: mismatch in special tokens definition ( %u/%zu vs %u/%zu ).\n",
  2641. __func__,
  2642. special_tokens_count_from_verification, vocab.id_to_token.size(),
  2643. special_tokens_count_by_type, vocab.id_to_token.size()
  2644. );
  2645. } else {
  2646. LLAMA_LOG_INFO("%s: special tokens definition check successful ( %u/%zu ).\n",
  2647. __func__,
  2648. special_tokens_count_from_verification, vocab.id_to_token.size()
  2649. );
  2650. }
  2651. }
  2652. }
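// In short, the cache above marks a token as "special" when its text is longer than one UTF-8 character
// and no split into two halves yields two known vocab entries; the resulting count is then cross-checked
// against the token-type labels, and a warning is logged on any mismatch.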
  2653. static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
  2654. const auto & hparams = model.hparams;
  2655. const auto & vocab = model.vocab;
  2656. const auto rope_scaling_type = LLAMA_ROPE_SCALING_TYPES.at(hparams.rope_scaling_type_train);
  2657. // hparams
  2658. LLAMA_LOG_INFO("%s: format = %s\n", __func__, llama_file_version_name(ml.fver));
  2659. LLAMA_LOG_INFO("%s: arch = %s\n", __func__, LLM_ARCH_NAMES.at(model.arch).c_str());
  2660. LLAMA_LOG_INFO("%s: vocab type = %s\n", __func__, vocab.type == LLAMA_VOCAB_TYPE_SPM ? "SPM" : "BPE"); // TODO: fix
  2661. LLAMA_LOG_INFO("%s: n_vocab = %u\n", __func__, hparams.n_vocab);
  2662. LLAMA_LOG_INFO("%s: n_merges = %u\n", __func__, (int) vocab.bpe_ranks.size());
  2663. LLAMA_LOG_INFO("%s: n_ctx_train = %u\n", __func__, hparams.n_ctx_train);
  2664. LLAMA_LOG_INFO("%s: n_embd = %u\n", __func__, hparams.n_embd);
  2665. LLAMA_LOG_INFO("%s: n_head = %u\n", __func__, hparams.n_head);
  2666. LLAMA_LOG_INFO("%s: n_head_kv = %u\n", __func__, hparams.n_head_kv);
  2667. LLAMA_LOG_INFO("%s: n_layer = %u\n", __func__, hparams.n_layer);
  2668. LLAMA_LOG_INFO("%s: n_rot = %u\n", __func__, hparams.n_rot);
  2669. LLAMA_LOG_INFO("%s: n_embd_head_k = %u\n", __func__, hparams.n_embd_head_k);
  2670. LLAMA_LOG_INFO("%s: n_embd_head_v = %u\n", __func__, hparams.n_embd_head_v);
  2671. LLAMA_LOG_INFO("%s: n_gqa = %u\n", __func__, hparams.n_gqa());
  2672. LLAMA_LOG_INFO("%s: n_embd_k_gqa = %u\n", __func__, hparams.n_embd_k_gqa());
  2673. LLAMA_LOG_INFO("%s: n_embd_v_gqa = %u\n", __func__, hparams.n_embd_v_gqa());
  2674. LLAMA_LOG_INFO("%s: f_norm_eps = %.1e\n", __func__, hparams.f_norm_eps);
  2675. LLAMA_LOG_INFO("%s: f_norm_rms_eps = %.1e\n", __func__, hparams.f_norm_rms_eps);
  2676. LLAMA_LOG_INFO("%s: f_clamp_kqv = %.1e\n", __func__, hparams.f_clamp_kqv);
  2677. LLAMA_LOG_INFO("%s: f_max_alibi_bias = %.1e\n", __func__, hparams.f_max_alibi_bias);
  2678. LLAMA_LOG_INFO("%s: n_ff = %u\n", __func__, hparams.n_ff);
  2679. LLAMA_LOG_INFO("%s: n_expert = %u\n", __func__, hparams.n_expert);
  2680. LLAMA_LOG_INFO("%s: n_expert_used = %u\n", __func__, hparams.n_expert_used);
  2681. LLAMA_LOG_INFO("%s: rope scaling = %s\n", __func__, rope_scaling_type.c_str());
  2682. LLAMA_LOG_INFO("%s: freq_base_train = %.1f\n", __func__, hparams.rope_freq_base_train);
  2683. LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train);
  2684. LLAMA_LOG_INFO("%s: n_yarn_orig_ctx = %u\n", __func__, hparams.n_yarn_orig_ctx);
  2685. LLAMA_LOG_INFO("%s: rope_finetuned = %s\n", __func__, hparams.rope_finetuned ? "yes" : "unknown");
  2686. LLAMA_LOG_INFO("%s: model type = %s\n", __func__, llama_model_type_name(model.type));
  2687. LLAMA_LOG_INFO("%s: model ftype = %s\n", __func__, llama_model_ftype_name(model.ftype).c_str());
  2688. if (ml.n_elements >= 1e12) {
  2689. LLAMA_LOG_INFO("%s: model params = %.2f T\n", __func__, ml.n_elements*1e-12);
  2690. } else if (ml.n_elements >= 1e9) {
  2691. LLAMA_LOG_INFO("%s: model params = %.2f B\n", __func__, ml.n_elements*1e-9);
  2692. } else if (ml.n_elements >= 1e6) {
  2693. LLAMA_LOG_INFO("%s: model params = %.2f M\n", __func__, ml.n_elements*1e-6);
  2694. } else {
  2695. LLAMA_LOG_INFO("%s: model params = %.2f K\n", __func__, ml.n_elements*1e-3);
  2696. }
  2697. if (ml.n_bytes < GiB) {
  2698. LLAMA_LOG_INFO("%s: model size = %.2f MiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements);
  2699. } else {
  2700. LLAMA_LOG_INFO("%s: model size = %.2f GiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements);
  2701. }
  2702. // general kv
  2703. LLAMA_LOG_INFO("%s: general.name = %s\n", __func__, model.name.c_str());
  2704. // special tokens
  2705. if (vocab.special_bos_id != -1) { LLAMA_LOG_INFO( "%s: BOS token = %d '%s'\n", __func__, vocab.special_bos_id, vocab.id_to_token[vocab.special_bos_id].text.c_str() ); }
  2706. if (vocab.special_eos_id != -1) { LLAMA_LOG_INFO( "%s: EOS token = %d '%s'\n", __func__, vocab.special_eos_id, vocab.id_to_token[vocab.special_eos_id].text.c_str() ); }
  2707. if (vocab.special_unk_id != -1) { LLAMA_LOG_INFO( "%s: UNK token = %d '%s'\n", __func__, vocab.special_unk_id, vocab.id_to_token[vocab.special_unk_id].text.c_str() ); }
  2708. if (vocab.special_sep_id != -1) { LLAMA_LOG_INFO( "%s: SEP token = %d '%s'\n", __func__, vocab.special_sep_id, vocab.id_to_token[vocab.special_sep_id].text.c_str() ); }
  2709. if (vocab.special_pad_id != -1) { LLAMA_LOG_INFO( "%s: PAD token = %d '%s'\n", __func__, vocab.special_pad_id, vocab.id_to_token[vocab.special_pad_id].text.c_str() ); }
  2710. if (vocab.linefeed_id != -1) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, vocab.linefeed_id, vocab.id_to_token[vocab.linefeed_id].text.c_str() ); }
  2711. }
  2712. // Returns false if cancelled by progress_callback
  2713. static bool llm_load_tensors(
  2714. llama_model_loader & ml,
  2715. llama_model & model,
  2716. int n_gpu_layers,
  2717. int main_gpu,
  2718. const float * tensor_split,
  2719. bool use_mlock,
  2720. llama_progress_callback progress_callback,
  2721. void * progress_callback_user_data) {
  2722. model.t_start_us = ggml_time_us();
  2723. auto & ctx = model.ctx;
  2724. auto & hparams = model.hparams;
  2725. model.n_gpu_layers = n_gpu_layers;
  2726. size_t ctx_size = ggml_tensor_overhead() * ml.n_tensors;
  2727. LLAMA_LOG_INFO("%s: ggml ctx size = %7.2f MiB\n", __func__, ctx_size/1024.0/1024.0);
  2728. // create the ggml context
  2729. {
  2730. struct ggml_init_params params = {
  2731. /*.mem_size =*/ ctx_size,
  2732. /*.mem_buffer =*/ NULL,
  2733. /*.no_alloc =*/ true,
  2734. };
  2735. model.ctx = ggml_init(params);
  2736. if (!model.ctx) {
  2737. throw std::runtime_error(format("ggml_init() failed"));
  2738. }
  2739. }
  2740. (void) main_gpu;
  2741. enum ggml_backend_type llama_backend_offload = GGML_BACKEND_CPU;
  2742. enum ggml_backend_type llama_backend_offload_split = GGML_BACKEND_CPU;
  2743. #if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)
  2744. if (ggml_cublas_loaded()) {
  2745. LLAMA_LOG_INFO("%s: using " GGML_CUDA_NAME " for GPU acceleration\n", __func__);
  2746. ggml_cuda_set_main_device(main_gpu);
  2747. llama_backend_offload = GGML_BACKEND_GPU;
  2748. llama_backend_offload_split = GGML_BACKEND_GPU_SPLIT;
  2749. }
  2750. #elif defined(GGML_USE_CLBLAST)
  2751. LLAMA_LOG_INFO("%s: using OpenCL for GPU acceleration\n", __func__);
  2752. llama_backend_offload = GGML_BACKEND_GPU;
  2753. llama_backend_offload_split = GGML_BACKEND_GPU;
  2754. #endif
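// On the CUDA path the split backend marks matrices that may be distributed across multiple GPUs
// (hence the 1-D check in create_tensor()); the OpenCL path offloads to a single device only, so both
// offload types map to GGML_BACKEND_GPU there.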
  2755. // create tensors for the weights
  2756. {
  2757. const int64_t n_embd = hparams.n_embd;
  2758. const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
  2759. const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();
  2760. const int64_t n_layer = hparams.n_layer;
  2761. const int64_t n_vocab = hparams.n_vocab;
  2762. const auto tn = LLM_TN(model.arch);
  2763. switch (model.arch) {
  2764. case LLM_ARCH_LLAMA:
  2765. case LLM_ARCH_REFACT:
  2766. {
  2767. model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
  2768. // output
  2769. {
  2770. ggml_backend_type backend_norm;
  2771. ggml_backend_type backend_output;
  2772. if (n_gpu_layers > int(n_layer)) {
  2773. backend_norm = llama_backend_offload;
  2774. backend_output = llama_backend_offload_split;
  2775. } else {
  2776. backend_norm = GGML_BACKEND_CPU;
  2777. backend_output = GGML_BACKEND_CPU;
  2778. }
  2779. model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
  2780. model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
  2781. }
  2782. const uint32_t n_ff = hparams.n_ff;
  2783. const int64_t n_embd_gqa = n_embd_v_gqa;
  2784. GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa());
  2785. GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
  2786. const int i_gpu_start = n_layer - n_gpu_layers;
  2787. model.layers.resize(n_layer);
  2788. for (uint32_t i = 0; i < n_layer; ++i) {
  2789. const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
  2790. const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
  2791. auto & layer = model.layers[i];
  2792. layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
  2793. layer.wq = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, backend_split);
  2794. layer.wk = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, backend_split);
  2795. layer.wv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, backend_split);
  2796. layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
  2797. // optional bias tensors
  2798. layer.bq = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, backend, false);
  2799. layer.bk = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, backend, false);
  2800. layer.bv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, backend, false);
  2801. layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend, false);
  2802. layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
  2803. layer.ffn_gate_inp = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd}, backend, false);
  2804. if (layer.ffn_gate_inp == nullptr) {
  2805. GGML_ASSERT(hparams.n_expert == 0);
  2806. GGML_ASSERT(hparams.n_expert_used == 0);
  2807. layer.ffn_gate = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split);
  2808. layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split);
  2809. layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
  2810. } else {
  2811. GGML_ASSERT(hparams.n_expert > 0);
  2812. GGML_ASSERT(hparams.n_expert_used > 0);
  2813. // MoE branch
  2814. for (uint32_t x = 0; x < hparams.n_expert; ++x) {
  2815. layer.ffn_gate_exp[x] = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE_EXP, "weight", i, x), {n_embd, n_ff}, backend_split);
  2816. layer.ffn_down_exp[x] = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN_EXP, "weight", i, x), { n_ff, n_embd}, backend_split);
  2817. layer.ffn_up_exp[x] = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP_EXP, "weight", i, x), {n_embd, n_ff}, backend_split);
  2818. }
  2819. }
  2820. }
  2821. } break;
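// The same per-layer offload pattern repeats for every architecture below: the last n_gpu_layers layers
// (index >= i_gpu_start) are created on the offload backend, and large weight matrices use the split
// backend. In the LLaMA case above, optional tensors (biases, ffn_gate_inp) are requested with
// required = false, so missing ones simply stay null.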
  2822. case LLM_ARCH_BAICHUAN:
  2823. {
  2824. model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
  2825. {
  2826. ggml_backend_type backend_norm;
  2827. ggml_backend_type backend_output;
  2828. if (n_gpu_layers > int(n_layer)) {
  2829. backend_norm = llama_backend_offload;
  2830. backend_output = llama_backend_offload_split;
  2831. } else {
  2832. backend_norm = GGML_BACKEND_CPU;
  2833. backend_output = GGML_BACKEND_CPU;
  2834. }
  2835. model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
  2836. model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
  2837. }
  2838. const uint32_t n_ff = hparams.n_ff;
  2839. const int64_t n_embd_gqa = n_embd_v_gqa;
  2840. GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa());
  2841. GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
  2842. const int i_gpu_start = n_layer - n_gpu_layers;
  2843. model.layers.resize(n_layer);
  2844. for (uint32_t i = 0; i < n_layer; ++i) {
  2845. const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
  2846. const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
  2847. auto & layer = model.layers[i];
  2848. layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
  2849. layer.wq = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, backend_split);
  2850. layer.wk = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, backend_split);
  2851. layer.wv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, backend_split);
  2852. layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
  2853. layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
  2854. layer.ffn_gate = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split);
  2855. layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split);
  2856. layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
  2857. }
  2858. } break;
  2859. case LLM_ARCH_FALCON:
  2860. {
  2861. model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
  2862. // output
  2863. {
  2864. ggml_backend_type backend_norm;
  2865. ggml_backend_type backend_output;
  2866. if (n_gpu_layers > int(n_layer)) {
  2867. backend_norm = llama_backend_offload;
  2868. backend_output = llama_backend_offload_split;
  2869. } else {
  2870. backend_norm = GGML_BACKEND_CPU;
  2871. backend_output = GGML_BACKEND_CPU;
  2872. }
  2873. model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
  2874. model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm);
  2875. model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
  2876. }
  2877. const uint32_t n_ff = hparams.n_ff;
  2878. const int64_t n_embd_gqa = n_embd_v_gqa;
  2879. GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa());
  2880. GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
  2881. const int i_gpu_start = n_layer - n_gpu_layers;
  2882. model.layers.resize(n_layer);
  2883. for (uint32_t i = 0; i < n_layer; ++i) {
  2884. const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
  2885. const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
  2886. auto & layer = model.layers[i];
  2887. layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
  2888. layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend);
  2889. if (gguf_find_tensor(ml.ctx_gguf, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i).c_str()) >= 0) {
  2890. layer.attn_norm_2 = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, backend);
  2891. layer.attn_norm_2_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, backend);
  2892. }
  2893. layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split);
  2894. layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
  2895. layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split);
  2896. layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
  2897. }
  2898. } break;
  2899. case LLM_ARCH_STARCODER:
  2900. {
  2901. model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
  2902. model.pos_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train}, GGML_BACKEND_CPU);
  2903. // output
  2904. {
  2905. ggml_backend_type backend_norm;
  2906. ggml_backend_type backend_output;
  2907. if (n_gpu_layers > int(n_layer)) {
  2908. backend_norm = llama_backend_offload;
  2909. backend_output = llama_backend_offload_split;
  2910. } else {
  2911. backend_norm = GGML_BACKEND_CPU;
  2912. backend_output = GGML_BACKEND_CPU;
  2913. }
  2914. model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
  2915. model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm);
  2916. model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
  2917. }
  2918. const uint32_t n_ff = hparams.n_ff;
  2919. const int64_t n_embd_gqa = n_embd_v_gqa;
  2920. GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa());
  2921. GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
  2922. const int i_gpu_start = n_layer - n_gpu_layers;
  2923. model.layers.resize(n_layer);
  2924. for (uint32_t i = 0; i < n_layer; ++i) {
  2925. const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
  2926. const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
  2927. auto & layer = model.layers[i];
  2928. layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
  2929. layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend);
  2930. layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split);
  2931. layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, backend);
  2932. layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
  2933. layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend);
  2934. layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
  2935. layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, backend);
  2936. layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split);
  2937. layer.ffn_down_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend);
  2938. layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
  2939. layer.ffn_up_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend);
  2940. }
  2941. } break;
  2942. case LLM_ARCH_PERSIMMON:
  2943. {
  2944. model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
  2945. {
  2946. ggml_backend_type backend_norm;
  2947. ggml_backend_type backend_output;
  2948. if (n_gpu_layers > int(n_layer)) {
  2949. backend_norm = llama_backend_offload;
  2950. backend_output = llama_backend_offload_split;
  2951. } else {
  2952. backend_norm = GGML_BACKEND_CPU;
  2953. backend_output = GGML_BACKEND_CPU;
  2954. }
  2955. model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
  2956. model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm);
  2957. model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
  2958. }
  2959. const uint32_t n_ff = hparams.n_ff;
  2960. const int64_t n_embd_gqa = n_embd_v_gqa;
  2961. GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa());
  2962. GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
  2963. const int i_gpu_start = n_layer - n_gpu_layers;
  2964. model.layers.resize(n_layer);
  2965. for (uint32_t i = 0; i < n_layer; ++i) {
  2966. const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload;
  2967. const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split;
  2968. auto & layer = model.layers[i];
  2969. layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
  2970. layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend);
  2971. layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split);
  2972. layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, backend);
  2973. layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
  2974. layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend);
  2975. layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split);
  2976. layer.ffn_down_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend);
  2977. layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
  2978. layer.ffn_up_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend);
  2979. layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
  2980. layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, backend);
  2981. layer.attn_q_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {64}, backend);
  2982. layer.attn_q_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {64}, backend);
  2983. layer.attn_k_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {64}, backend);
  2984. layer.attn_k_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {64}, backend);
  2985. }
  2986. } break;
  2987. case LLM_ARCH_BLOOM:
  2988. {
  2989. model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
  2990. model.tok_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, GGML_BACKEND_CPU);
  2991. model.tok_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}, GGML_BACKEND_CPU);
  2992. // output
  2993. {
  2994. ggml_backend_type backend_norm;
  2995. ggml_backend_type backend_output;
  2996. if (n_gpu_layers > int(n_layer)) {
  2997. backend_norm = llama_backend_offload;
  2998. backend_output = llama_backend_offload_split;
  2999. } else {
  3000. backend_norm = GGML_BACKEND_CPU;
  3001. backend_output = GGML_BACKEND_CPU;
  3002. }
  3003. model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
  3004. model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm);
  3005. model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
  3006. }
  3007. const uint32_t n_ff = hparams.n_ff;
  3008. const int64_t n_embd_gqa = n_embd_v_gqa;
  3009. GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa());
  3010. GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
  3011. const int i_gpu_start = n_layer - n_gpu_layers;
  3012. model.layers.resize(n_layer);
  3013. for (uint32_t i = 0; i < n_layer; ++i) {
  3014. const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
  3015. const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
  3016. auto & layer = model.layers[i];
  3017. layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
  3018. layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend);
  3019. layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split);
  3020. layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, backend);
  3021. layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
  3022. layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend);
  3023. layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
  3024. layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, backend);
  3025. layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split);
  3026. layer.ffn_down_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend);
  3027. layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
  3028. layer.ffn_up_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend);
  3029. }
  3030. } break;
  3031. case LLM_ARCH_MPT:
  3032. {
  3033. model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
  3034. // output
  3035. {
  3036. ggml_backend_type backend_norm;
  3037. ggml_backend_type backend_output;
  3038. if (n_gpu_layers > int(n_layer)) {
  3039. backend_norm = llama_backend_offload;
  3040. backend_output = llama_backend_offload_split;
  3041. } else {
  3042. backend_norm = GGML_BACKEND_CPU;
  3043. backend_output = GGML_BACKEND_CPU;
  3044. }
  3045. model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
  3046. model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
  3047. }
  3048. const uint32_t n_ff = hparams.n_ff;
  3049. const int64_t n_embd_gqa = n_embd_v_gqa;
  3050. GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa());
  3051. GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
  3052. const int i_gpu_start = n_layer - n_gpu_layers;
  3053. model.layers.resize(n_layer);
  3054. for (uint32_t i = 0; i < n_layer; ++i) {
  3055. const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
  3056. const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
  3057. auto & layer = model.layers[i];
  3058. layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
  3059. layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split);
  3060. layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
  3061. layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
  3062. layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split);
  3063. layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
  3064. // AWQ ScaleActivation layer
  3065. layer.ffn_act = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_ACT, "scales", i), {n_ff}, backend, false);
  3066. }
  3067. } break;
  3068. case LLM_ARCH_STABLELM:
  3069. {
  3070. model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
  3071. // output
  3072. {
  3073. ggml_backend_type backend_norm;
  3074. ggml_backend_type backend_output;
  3075. if (n_gpu_layers > int(n_layer)) {
  3076. backend_norm = llama_backend_offload;
  3077. backend_output = llama_backend_offload_split;
  3078. } else {
  3079. backend_norm = GGML_BACKEND_CPU;
  3080. backend_output = GGML_BACKEND_CPU;
  3081. }
  3082. model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm);
  3083. model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
  3084. model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
  3085. }
  3086. const uint32_t n_ff = hparams.n_ff;
  3087. const int64_t n_embd_gqa = n_embd_v_gqa;
  3088. GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa());
  3089. GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
  3090. const int i_gpu_start = n_layer - n_gpu_layers;
  3091. model.layers.resize(n_layer);
  3092. for (uint32_t i = 0; i < n_layer; ++i) {
3093. /*
3094. example shape from the loader log: tensor 4: blk.0.attn_output.weight f16 [ 2560, 2560, 1, 1 ]
3095. */
  3096. const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
  3097. const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
  3098. auto & layer = model.layers[i];
  3099. layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
  3100. layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend);
  3101. layer.wq = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, backend_split);
  3102. layer.wk = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, backend_split);
  3103. layer.wv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, backend_split);
  3104. layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
  3105. layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
  3106. layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, backend);
  3107. layer.ffn_gate = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split);
  3108. layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split);
  3109. layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
  3110. }
  3111. } break;
  3112. case LLM_ARCH_QWEN:
  3113. {
  3114. model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
  3115. {
  3116. ggml_backend_type backend_norm;
  3117. ggml_backend_type backend_output;
  3118. if (n_gpu_layers > int(n_layer)) {
  3119. backend_norm = llama_backend_offload;
  3120. backend_output = llama_backend_offload_split;
  3121. } else {
  3122. backend_norm = GGML_BACKEND_CPU;
  3123. backend_output = GGML_BACKEND_CPU;
  3124. }
  3125. model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
  3126. model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
  3127. }
  3128. const uint32_t n_ff = hparams.n_ff / 2;
  3129. const int i_gpu_start = n_layer - n_gpu_layers;
  3130. model.layers.resize(n_layer);
  3131. for (uint32_t i = 0; i < n_layer; ++i) {
  3132. const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
  3133. const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
  3134. auto & layer = model.layers[i];
  3135. layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
  3136. layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd * 3}, backend_split);
  3137. layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd * 3}, backend);
  3138. layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
  3139. layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
  3140. layer.ffn_gate = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split);
  3141. layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split);
  3142. layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
  3143. }
  3144. } break;
  3145. case LLM_ARCH_PHI2:
  3146. {
  3147. model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
  3148. // output
  3149. {
  3150. ggml_backend_type backend_norm;
  3151. ggml_backend_type backend_output;
  3152. if (n_gpu_layers > int(n_layer)) {
  3153. backend_norm = llama_backend_offload;
  3154. backend_output = llama_backend_offload;
  3155. } else {
  3156. backend_norm = GGML_BACKEND_CPU;
  3157. backend_output = GGML_BACKEND_CPU;
  3158. }
  3159. model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
  3160. model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm);
  3161. model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
  3162. model.output_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "bias"), {n_vocab}, backend_output);
  3163. }
  3164. const uint32_t n_ff = hparams.n_ff;
  3165. const int64_t n_embd_gqa = n_embd_v_gqa;
  3166. GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa());
  3167. GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
  3168. const int i_gpu_start = n_layer - n_gpu_layers;
  3169. model.layers.resize(n_layer);
  3170. for (uint32_t i = 0; i < n_layer; ++i) {
  3171. const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
  3172. const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
  3173. auto & layer = model.layers[i];
  3174. layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
  3175. layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend);
  3176. layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split);
  3177. layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, backend);
  3178. layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
  3179. layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend);
  3180. layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split);
  3181. layer.ffn_down_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend);
  3182. layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
  3183. layer.ffn_up_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend);
  3184. }
  3185. } break;
  3186. case LLM_ARCH_PLAMO:
  3187. {
  3188. model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
  3189. // output
  3190. {
  3191. ggml_backend_type backend_norm;
  3192. ggml_backend_type backend_output;
  3193. if (n_gpu_layers > int(n_layer)) {
  3194. backend_norm = llama_backend_offload;
  3195. backend_output = llama_backend_offload_split;
  3196. } else {
  3197. backend_norm = GGML_BACKEND_CPU;
  3198. backend_output = GGML_BACKEND_CPU;
  3199. }
  3200. model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
  3201. model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
  3202. }
  3203. const uint32_t n_ff = hparams.n_ff;
  3204. const int64_t n_embd_gqa = n_embd_v_gqa;
  3205. GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa());
  3206. GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
  3207. const int i_gpu_start = n_layer - n_gpu_layers;
  3208. model.layers.resize(n_layer);
  3209. for (uint32_t i = 0; i < n_layer; ++i) {
  3210. const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
  3211. const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
  3212. auto & layer = model.layers[i];
  3213. layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
  3214. layer.wq = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, backend_split);
  3215. layer.wk = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, backend_split);
  3216. layer.wv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, backend_split);
  3217. layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
  3218. layer.ffn_gate = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split);
  3219. layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split);
  3220. layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
  3221. }
  3222. } break;
  3223. case LLM_ARCH_GPT2:
  3224. {
  3225. model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
  3226. model.pos_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train}, GGML_BACKEND_CPU);
  3227. // output
  3228. {
  3229. ggml_backend_type backend_norm;
  3230. ggml_backend_type backend_output;
  3231. if (n_gpu_layers > int(n_layer)) {
  3232. backend_norm = llama_backend_offload;
  3233. backend_output = llama_backend_offload_split;
  3234. } else {
  3235. backend_norm = GGML_BACKEND_CPU;
  3236. backend_output = GGML_BACKEND_CPU;
  3237. }
  3238. model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
  3239. model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm);
  3240. model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
  3241. }
  3242. const uint32_t n_ff = hparams.n_ff;
  3243. const int64_t n_embd_gqa = n_embd_v_gqa;
  3244. GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa());
  3245. GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
  3246. const int i_gpu_start = n_layer - n_gpu_layers;
  3247. model.layers.resize(n_layer);
  3248. for (uint32_t i = 0; i < n_layer; ++i) {
  3249. const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
  3250. const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
  3251. auto & layer = model.layers[i];
  3252. layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
  3253. layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend);
  3254. layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split);
  3255. layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, backend);
  3256. layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
  3257. layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend);
  3258. layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
  3259. layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, backend);
  3260. layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split);
  3261. layer.ffn_down_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend);
  3262. layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
  3263. layer.ffn_up_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend);
  3264. }
  3265. } break;
  3266. default:
  3267. throw std::runtime_error("unknown architecture");
  3268. }
  3269. }
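// NOTE (editor): every architecture case above applies the same offload rule:
// layers with index >= i_gpu_start (= n_layer - n_gpu_layers) are created on the
// offload backend, the rest on the CPU, and output_norm/output are only moved off
// the CPU once n_gpu_layers exceeds n_layer. Rough sketch with hypothetical
// numbers (not taken from any real model file):
//   n_layer = 32, n_gpu_layers = 20  ->  i_gpu_start = 12
//   layers  0..11 -> GGML_BACKEND_CPU
//   layers 12..31 -> llama_backend_offload / llama_backend_offload_split
//   output tensors -> CPU, since 20 <= 32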
  3270. ml.done_getting_tensors();
  3271. ml.init_mapping();
  3272. // allocate tensors
  3273. size_t vram_weights = 0;
  3274. size_t buf_size = 0;
  3275. ggml_backend_buffer_type_t buft = llama_default_buffer_type(n_gpu_layers);
  3276. for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) {
  3277. // GGML_BACKEND_GPU tensors are for CUDA and OpenCL only, which are handled separately without ggml-backend
  3278. if (t->backend == GGML_BACKEND_CPU) {
  3279. buf_size += GGML_PAD(ggml_backend_buft_get_alloc_size(buft, t), ggml_backend_buft_get_alignment(buft));
  3280. } else {
  3281. vram_weights += ggml_nbytes(t);
  3282. }
  3283. }
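// NOTE (editor): buf_size counts only the host (GGML_BACKEND_CPU) tensors and pads
// each one to the buffer type's alignment, so it can be slightly larger than the raw
// sum of ggml_nbytes(); vram_weights is an unpadded byte count used only for the
// VRAM log below. Illustrative padding example (a 32-byte alignment is assumed here,
// the real value comes from ggml_backend_buft_get_alignment):
//   GGML_PAD(100, 32) = 128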
  3284. // create backend buffer
  3285. ggml_backend_buffer_t buf_mmap = nullptr;
  3286. #ifdef GGML_USE_METAL
  3287. if (n_gpu_layers > 0) {
  3288. if (ml.use_mmap) {
  3289. const size_t max_size = ggml_get_max_tensor_size(ctx);
  3290. model.buf = ggml_backend_metal_buffer_from_ptr(ml.mapping->addr, ml.mapping->size, max_size);
  3291. buf_mmap = model.buf;
  3292. } else {
  3293. model.buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_metal_buffer_type());
  3294. }
  3295. }
  3296. #elif defined(GGML_USE_CUBLAS) && defined(LLAMA_GGML_BACKEND_CUDA_TEST)
  3297. // for testing only
  3298. if (n_gpu_layers > 0) {
  3299. model.buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_cuda_buffer_type(0));
  3300. }
  3301. #endif
  3302. if (model.buf == nullptr) {
  3303. // CPU backend, and indirectly CUDA and OpenCL
  3304. if (ml.use_mmap) {
  3305. model.buf = ggml_backend_cpu_buffer_from_ptr(ml.mapping->addr, ml.mapping->size);
  3306. buf_mmap = model.buf;
  3307. } else {
  3308. // allocate only CPU tensors
  3309. model.buf = ggml_backend_buft_alloc_buffer(buft, buf_size);
  3310. ggml_tallocr_t alloc = ggml_tallocr_new_from_buffer(model.buf);
  3311. for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) {
  3312. if (t->backend == GGML_BACKEND_CPU) {
  3313. ggml_tallocr_alloc(alloc, t);
  3314. }
  3315. }
  3316. ggml_tallocr_free(alloc);
  3317. }
  3318. }
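// NOTE (editor): with mmap enabled the whole file mapping is wrapped as the backend
// buffer and buf_mmap is set, so load_all_data() can point tensors straight into the
// mapping; otherwise a buffer of buf_size bytes is allocated and only the CPU tensors
// are placed into it through the ggml_tallocr allocator above.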
  3319. if (use_mlock && ggml_backend_buffer_is_host(model.buf)) {
  3320. model.mlock_buf.init (ggml_backend_buffer_get_base(model.buf));
  3321. model.mlock_buf.grow_to(ggml_backend_buffer_get_size(model.buf));
  3322. }
  3323. // print memory requirements
  3324. {
  3325. size_t sys_mem_required = ctx_size + buf_size;
  3326. if (sys_mem_required > 0) {
  3327. LLAMA_LOG_INFO("%s: system memory used = %7.2f MiB\n", __func__, sys_mem_required / 1024.0 / 1024.0);
  3328. }
  3329. if (vram_weights > 0) {
  3330. LLAMA_LOG_INFO("%s: VRAM used = %7.2f MiB\n", __func__, vram_weights / 1024.0 / 1024.0);
  3331. }
  3332. #if (defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)) || defined(GGML_USE_CLBLAST)
  3333. const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
  3334. LLAMA_LOG_INFO("%s: offloading %d repeating layers to GPU\n", __func__, n_gpu);
  3335. if (n_gpu_layers > (int) hparams.n_layer) {
  3336. LLAMA_LOG_INFO("%s: offloading non-repeating layers to GPU\n", __func__);
  3337. }
  3338. const int max_backend_supported_layers = hparams.n_layer + 1;
  3339. const int max_offloadable_layers = hparams.n_layer + 1;
  3340. LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n", __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers);
  3341. #endif // defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
  3342. }
  3343. #if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)
  3344. ggml_cuda_set_tensor_split(tensor_split);
  3345. #else
  3346. GGML_UNUSED(tensor_split);
  3347. #endif // GGML_USE_CUBLAS
  3348. // populate tensors_by_name
  3349. for (int i = 0; i < ml.n_tensors; ++i) {
  3350. struct ggml_tensor * cur = ggml_get_tensor(ctx, ml.get_tensor_name(i));
  3351. model.tensors_by_name.emplace_back(ggml_get_name(cur), cur);
  3352. }
  3353. if (!ml.load_all_data(ctx, progress_callback, progress_callback_user_data, buf_mmap, use_mlock ? &model.mlock_mmap : NULL)) {
  3354. return false;
  3355. }
  3356. model.mapping = std::move(ml.mapping);
3357. // loading time will be recalculated after the first eval, so
3358. // we take page faults deferred by mmap() into consideration
  3359. model.t_load_us = ggml_time_us() - model.t_start_us;
  3360. return true;
  3361. }
  3362. // Returns 0 on success, -1 on error, and -2 on cancellation via llama_progress_callback
  3363. static int llama_model_load(const std::string & fname, llama_model & model, const llama_model_params & params) {
  3364. try {
  3365. llama_model_loader ml(fname, params.use_mmap, params.kv_overrides);
  3366. model.hparams.vocab_only = params.vocab_only;
  3367. llm_load_arch (ml, model);
  3368. llm_load_hparams(ml, model);
  3369. llm_load_vocab (ml, model);
  3370. llm_load_print_meta(ml, model);
  3371. if (model.hparams.n_vocab != model.vocab.id_to_token.size()) {
  3372. throw std::runtime_error("vocab size mismatch");
  3373. }
  3374. if (params.vocab_only) {
  3375. LLAMA_LOG_INFO("%s: vocab only - skipping tensors\n", __func__);
  3376. return 0;
  3377. }
  3378. if (!llm_load_tensors(
  3379. ml, model, params.n_gpu_layers, params.main_gpu, params.tensor_split, params.use_mlock,
  3380. params.progress_callback, params.progress_callback_user_data
  3381. )) {
  3382. return -2;
  3383. }
  3384. } catch (const std::exception & err) {
  3385. LLAMA_LOG_ERROR("error loading model: %s\n", err.what());
  3386. return -1;
  3387. }
  3388. return 0;
  3389. }
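// NOTE (editor): a minimal sketch of how the return codes above could be consumed
// (0 = ok, -1 = load error, -2 = cancelled through llama_progress_callback); the
// actual consumer in this file is llama_load_model_from_file:
//
//   // const int status = llama_model_load(fname, *model, params);
//   // if (status == -2) { /* user cancelled the load via the progress callback */ }
//   // else if (status < 0) { /* hard failure - the model cannot be used */ }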
  3390. //
  3391. // llm_build
  3392. //
  3393. using llm_build_cb = std::function<void(struct ggml_tensor * cur, const char * name, int nl)>;
  3394. enum llm_rope_type {
  3395. LLM_ROPE,
  3396. LLM_ROPE_NEOX,
  3397. LLM_ROPE_GLM,
  3398. };
  3399. enum llm_ffn_op_type {
  3400. LLM_FFN_SILU,
  3401. LLM_FFN_GELU,
  3402. LLM_FFN_RELU,
  3403. LLM_FFN_RELU_SQR,
  3404. };
  3405. enum llm_ffn_gate_type {
  3406. LLM_FFN_SEQ,
  3407. LLM_FFN_PAR, // ffn_gate is parallel to ffn_up
  3408. };
  3409. enum llm_norm_type {
  3410. LLM_NORM,
  3411. LLM_NORM_RMS,
  3412. };
  3413. static struct ggml_tensor * llm_build_inp_embd(
  3414. struct ggml_context * ctx,
  3415. const llama_hparams & hparams,
  3416. const llama_batch & batch,
  3417. struct ggml_tensor * tok_embd,
  3418. const llm_build_cb & cb) {
  3419. const int64_t n_embd = hparams.n_embd;
  3420. struct ggml_tensor * inpL;
  3421. if (batch.token) {
  3422. struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, batch.n_tokens);
  3423. cb(inp_tokens, "inp_tokens", -1);
  3424. inpL = ggml_get_rows(ctx, tok_embd, inp_tokens);
  3425. } else {
  3426. #ifdef GGML_USE_MPI
  3427. GGML_ASSERT(false && "not implemented");
  3428. #endif
  3429. inpL = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, batch.n_tokens);
  3430. }
  3431. return inpL;
  3432. }
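// NOTE (editor): if batch.token is set the graph gathers rows of tok_embd for each
// token id; otherwise an [n_embd, n_tokens] F32 input tensor is created and the
// caller is expected to copy precomputed embeddings (batch.embd) into it.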
  3433. // Persimmon: n_rot = n_embd_head_k/2
  3434. // Other: n_rot = n_embd_head_k
  3435. static void llm_build_k_shift(
  3436. struct ggml_context * ctx,
  3437. const llama_hparams & hparams,
  3438. const llama_cparams & cparams,
  3439. const llama_kv_cache & kv,
  3440. struct ggml_cgraph * graph,
  3441. llm_rope_type type,
  3442. int64_t n_ctx,
  3443. float freq_base,
  3444. float freq_scale,
  3445. const llm_build_cb & cb) {
  3446. const int64_t n_layer = hparams.n_layer;
  3447. const int64_t n_head_kv = hparams.n_head_kv;
  3448. const int64_t n_embd_head_k = hparams.n_embd_head_k;
  3449. const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
  3450. const int32_t n_rot = hparams.n_rot;
  3451. const int32_t n_orig_ctx = cparams.n_yarn_orig_ctx;
  3452. const float ext_factor = cparams.yarn_ext_factor;
  3453. const float attn_factor = cparams.yarn_attn_factor;
  3454. const float beta_fast = cparams.yarn_beta_fast;
  3455. const float beta_slow = cparams.yarn_beta_slow;
  3456. struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_ctx);
  3457. cb(K_shift, "K_shift", -1);
  3458. int rope_type = 0;
  3459. switch (type) {
  3460. case LLM_ROPE: rope_type = 0; break;
  3461. case LLM_ROPE_NEOX: rope_type = 2; break;
  3462. case LLM_ROPE_GLM: rope_type = 4; break;
  3463. }
  3464. for (int il = 0; il < n_layer; ++il) {
  3465. struct ggml_tensor * tmp =
  3466. // we rotate only the first n_rot dimensions
  3467. ggml_rope_custom_inplace(ctx,
  3468. ggml_view_3d(ctx, kv.k_l[il],
  3469. n_embd_head_k, n_head_kv, n_ctx,
  3470. ggml_row_size(kv.k_l[il]->type, n_embd_head_k),
  3471. ggml_row_size(kv.k_l[il]->type, n_embd_k_gqa),
  3472. 0),
  3473. K_shift, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
  3474. ext_factor, attn_factor, beta_fast, beta_slow);
  3475. cb(tmp, "K_shifted", il);
  3476. ggml_build_forward_expand(graph, tmp);
  3477. }
  3478. }
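// NOTE (editor): llm_build_k_shift applies an extra rotation, given by the positions
// stored in K_shift, to every cached K head so the cache stays consistent after a
// sequence shift; the rope_type values match the ggml rope modes used elsewhere in
// this file (0 = normal, 2 = NeoX, 4 = GLM).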
  3479. static void llm_build_kv_store(
  3480. struct ggml_context * ctx,
  3481. const llama_hparams & hparams,
  3482. const llama_kv_cache & kv,
  3483. struct ggml_cgraph * graph,
  3484. struct ggml_tensor * k_cur,
  3485. struct ggml_tensor * v_cur,
  3486. int64_t n_ctx,
  3487. int32_t n_tokens,
  3488. int32_t kv_head,
  3489. const llm_build_cb & cb,
  3490. int64_t il) {
  3491. const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
  3492. const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();
3493. // compute the transposed [n_tokens, n_embd_v_gqa] V matrix
  3494. struct ggml_tensor * v_cur_t = ggml_transpose(ctx, ggml_reshape_2d(ctx, v_cur, n_embd_v_gqa, n_tokens));
  3495. //struct ggml_tensor * v_cur_t = ggml_transpose(ctx, v_cur); // TODO: reshape above is likely not needed
  3496. cb(v_cur_t, "v_cur_t", il);
  3497. struct ggml_tensor * k_cache_view = ggml_view_1d(ctx, kv.k_l[il], n_tokens*n_embd_k_gqa,
  3498. (ggml_row_size(kv.k_l[il]->type, n_embd_k_gqa))*kv_head);
  3499. cb(k_cache_view, "k_cache_view", il);
  3500. struct ggml_tensor * v_cache_view = ggml_view_2d(ctx, kv.v_l[il], n_tokens, n_embd_v_gqa,
  3501. ( n_ctx)*ggml_element_size(kv.v_l[il]),
  3502. (kv_head)*ggml_element_size(kv.v_l[il]));
  3503. cb(v_cache_view, "v_cache_view", il);
  3504. // important: storing RoPE-ed version of K in the KV cache!
  3505. ggml_build_forward_expand(graph, ggml_cpy(ctx, k_cur, k_cache_view));
  3506. ggml_build_forward_expand(graph, ggml_cpy(ctx, v_cur_t, v_cache_view));
  3507. }
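// NOTE (editor): K is cached row-major per head (already RoPE-ed, see the note above)
// while V is cached transposed with an n_ctx stride, so llm_build_kqv below can take
// views of both without any extra copies at attention time.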
  3508. static struct ggml_tensor * llm_build_norm(
  3509. struct ggml_context * ctx,
  3510. struct ggml_tensor * cur,
  3511. const llama_hparams & hparams,
  3512. struct ggml_tensor * mw,
  3513. struct ggml_tensor * mb,
  3514. llm_norm_type type,
  3515. const llm_build_cb & cb,
  3516. int il) {
  3517. switch (type) {
  3518. case LLM_NORM: cur = ggml_norm (ctx, cur, hparams.f_norm_eps); break;
  3519. case LLM_NORM_RMS: cur = ggml_rms_norm(ctx, cur, hparams.f_norm_rms_eps); break;
  3520. }
  3521. if (mw || mb) {
  3522. cb(cur, "norm", il);
  3523. }
  3524. if (mw) {
  3525. cur = ggml_mul(ctx, cur, mw);
  3526. if (mb) {
  3527. cb(cur, "norm_w", il);
  3528. }
  3529. }
  3530. if (mb) {
  3531. cur = ggml_add(ctx, cur, mb);
  3532. }
  3533. return cur;
  3534. }
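// NOTE (editor): the helper above amounts to  cur = norm(cur) * mw + mb , where the
// norm is ggml_norm (LLM_NORM) or ggml_rms_norm (LLM_NORM_RMS) and mw/mb are optional.
// Typical call from the graph builders below:
//
//   // cur = llm_build_norm(ctx0, cur, hparams,
//   //           model.layers[il].attn_norm, model.layers[il].attn_norm_b,
//   //           LLM_NORM, cb, il);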
  3535. static struct ggml_tensor * llm_build_ffn(
  3536. struct ggml_context * ctx,
  3537. struct ggml_tensor * cur,
  3538. struct ggml_tensor * up,
  3539. struct ggml_tensor * up_b,
  3540. struct ggml_tensor * gate,
  3541. struct ggml_tensor * gate_b,
  3542. struct ggml_tensor * down,
  3543. struct ggml_tensor * down_b,
  3544. struct ggml_tensor * act_scales,
  3545. llm_ffn_op_type type_op,
  3546. llm_ffn_gate_type type_gate,
  3547. const llm_build_cb & cb,
  3548. int il) {
  3549. struct ggml_tensor * tmp = ggml_mul_mat(ctx, up, cur);
  3550. cb(tmp, "ffn_up", il);
  3551. if (up_b) {
  3552. tmp = ggml_add(ctx, tmp, up_b);
  3553. cb(tmp, "ffn_up_b", il);
  3554. }
  3555. if (gate) {
  3556. switch (type_gate) {
  3557. case LLM_FFN_SEQ:
  3558. {
  3559. cur = ggml_mul_mat(ctx, gate, tmp);
  3560. cb(cur, "ffn_gate", il);
  3561. } break;
  3562. case LLM_FFN_PAR:
  3563. {
  3564. cur = ggml_mul_mat(ctx, gate, cur);
  3565. cb(cur, "ffn_gate", il);
  3566. } break;
  3567. }
  3568. if (gate_b) {
  3569. cur = ggml_add(ctx, cur, gate_b);
  3570. cb(cur, "ffn_gate_b", il);
  3571. }
  3572. } else {
  3573. cur = tmp;
  3574. }
  3575. switch (type_op) {
  3576. case LLM_FFN_SILU:
  3577. {
  3578. cur = ggml_silu(ctx, cur);
  3579. cb(cur, "ffn_silu", il);
  3580. } break;
  3581. case LLM_FFN_GELU:
  3582. {
  3583. cur = ggml_gelu(ctx, cur);
  3584. cb(cur, "ffn_gelu", il);
  3585. if (act_scales != NULL) {
  3586. cur = ggml_div(ctx, cur, act_scales);
  3587. cb(cur, "ffn_act", il);
  3588. }
  3589. } break;
  3590. case LLM_FFN_RELU:
  3591. {
  3592. cur = ggml_relu(ctx, cur);
  3593. cb(cur, "ffn_relu", il);
  3594. } break;
  3595. case LLM_FFN_RELU_SQR:
  3596. {
  3597. cur = ggml_relu(ctx, cur);
  3598. cb(cur, "ffn_relu", il);
  3599. cur = ggml_sqr(ctx, cur);
  3600. cb(cur, "ffn_sqr(relu)", il);
  3601. } break;
  3602. }
  3603. if (type_gate == LLM_FFN_PAR) {
  3604. cur = ggml_mul(ctx, cur, tmp);
  3605. cb(cur, "ffn_gate_par", il);
  3606. }
3607. cur = ggml_mul_mat(ctx, down, cur);
3608. if (down_b) {
3609. cb(cur, "ffn_down", il);
3612. cur = ggml_add(ctx, cur, down_b);
3613. }
  3614. return cur;
  3615. }
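// NOTE (editor): with type_gate == LLM_FFN_PAR and type_op == LLM_FFN_SILU the helper
// above computes the SwiGLU-style block used by build_llama below, i.e.
//   ffn_out = down @ ( silu(gate @ x) * (up @ x) )   (+ optional biases)
// where @ is a matmul and * is element-wise; LLM_FFN_SEQ instead applies the gate to
// the up projection (gate @ (up @ x)) before the activation.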
  3616. // if max_alibi_bias > 0 then apply ALiBi
  3617. static struct ggml_tensor * llm_build_kqv(
  3618. struct ggml_context * ctx,
  3619. const llama_model & model,
  3620. const llama_hparams & hparams,
  3621. const llama_kv_cache & kv,
  3622. struct ggml_tensor * wo,
  3623. struct ggml_tensor * wo_b,
  3624. struct ggml_tensor * q_cur,
  3625. struct ggml_tensor * kq_mask,
  3626. int64_t n_ctx,
  3627. int32_t n_tokens,
  3628. int32_t n_kv,
  3629. float max_alibi_bias,
  3630. float kq_scale,
  3631. const llm_build_cb & cb,
  3632. int il) {
  3633. const int64_t n_head = hparams.n_head;
  3634. const int64_t n_head_kv = hparams.n_head_kv;
  3635. const int64_t n_embd_head_k = hparams.n_embd_head_k;
  3636. const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
  3637. const int64_t n_embd_head_v = hparams.n_embd_head_v;
  3638. struct ggml_tensor * q = ggml_permute(ctx, q_cur, 0, 2, 1, 3);
  3639. cb(q, "q", il);
  3640. struct ggml_tensor * k =
  3641. ggml_view_3d(ctx, kv.k_l[il],
  3642. n_embd_head_k, n_kv, n_head_kv,
  3643. ggml_row_size(kv.k_l[il]->type, n_embd_k_gqa),
  3644. ggml_row_size(kv.k_l[il]->type, n_embd_head_k),
  3645. 0);
  3646. cb(k, "k", il);
  3647. struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q);
  3648. cb(kq, "kq", il);
  3649. if (model.arch == LLM_ARCH_PHI2) {
  3650. // for this arch, we need to perform the KQ multiplication with F32 precision, otherwise we get NaNs
  3651. // ref: https://github.com/ggerganov/llama.cpp/pull/4490#issuecomment-1859055847
  3652. ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
  3653. }
  3654. if (max_alibi_bias > 0.0f) {
  3655. // temporary branch until we figure out how to handle ggml_alibi through ggml_add
  3656. kq = ggml_scale(ctx, kq, kq_scale);
  3657. cb(kq, "kq_scaled", il);
3659. // TODO: n_head or n_head_kv
3660. // TODO: K-shift is likely not working
3661. // TODO: change to ggml_add
3662. kq = ggml_alibi(ctx, kq, /*n_past*/ 0, n_head, max_alibi_bias);
3663. cb(kq, "kq_scaled_alibi", il);
  3665. kq = ggml_add(ctx, kq, kq_mask);
  3666. cb(kq, "kq_masked", il);
  3667. kq = ggml_soft_max(ctx, kq);
  3668. cb(kq, "kq_soft_max", il);
  3669. } else {
  3670. kq = ggml_soft_max_ext(ctx, kq, kq_mask, kq_scale);
  3671. cb(kq, "kq_soft_max_ext", il);
  3672. }
  3673. // split cached v into n_head heads
  3674. struct ggml_tensor * v =
  3675. ggml_view_3d(ctx, kv.v_l[il],
  3676. n_kv, n_embd_head_v, n_head_kv,
  3677. ggml_element_size(kv.v_l[il])*n_ctx,
  3678. ggml_element_size(kv.v_l[il])*n_ctx*n_embd_head_v,
  3679. 0);
  3680. cb(v, "v", il);
  3681. struct ggml_tensor * kqv = ggml_mul_mat(ctx, v, kq);
  3682. cb(kqv, "kqv", il);
  3683. struct ggml_tensor * kqv_merged = ggml_permute(ctx, kqv, 0, 2, 1, 3);
  3684. cb(kqv_merged, "kqv_merged", il);
  3685. struct ggml_tensor * cur = ggml_cont_2d(ctx, kqv_merged, n_embd_head_k*n_head, n_tokens);
  3686. cb(cur, "kqv_merged_cont", il);
  3687. cur = ggml_mul_mat(ctx, wo, cur);
3688. if (wo_b) {
3689. cb(cur, "kqv_wo", il);
3692. cur = ggml_add(ctx, cur, wo_b);
3693. }
  3694. return cur;
  3695. }
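// NOTE (editor): standard scaled dot-product attention over the cached K/V. The
// callers pass kq_scale = 1/sqrt(n_embd_head); the KQ mask is applied before the
// softmax (fused into ggml_soft_max_ext on the non-ALiBi path), and the ALiBi path
// falls back to an explicit scale + alibi + mask + softmax chain.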
  3696. struct llm_build_context {
  3697. const llama_model & model;
  3698. const llama_hparams & hparams;
  3699. const llama_cparams & cparams;
  3700. const llama_batch & batch;
  3701. const llama_kv_cache & kv_self;
  3702. const int64_t n_embd;
  3703. const int64_t n_layer;
  3704. const int64_t n_ctx; // user-specified context size (can be different from n_ctx_train)
  3705. const int64_t n_head;
  3706. const int64_t n_head_kv;
  3707. const int64_t n_embd_head_k;
  3708. const int64_t n_embd_k_gqa;
  3709. const int64_t n_embd_head_v;
  3710. const int64_t n_embd_v_gqa;
  3711. const int64_t n_expert;
  3712. const int64_t n_expert_used;
  3713. const float freq_base;
  3714. const float freq_scale;
  3715. const float ext_factor;
  3716. const float attn_factor;
  3717. const float beta_fast;
  3718. const float beta_slow;
  3719. const float norm_eps;
  3720. const float norm_rms_eps;
  3721. const int32_t n_tokens;
  3722. const int32_t n_kv; // size of KV cache to consider (n_kv <= n_ctx)
  3723. const int32_t kv_head; // index of where we store new KV data in the cache
  3724. const int32_t n_orig_ctx;
  3725. const bool do_rope_shift;
  3726. const llm_build_cb & cb;
  3727. std::vector<uint8_t> & buf_compute_meta;
  3728. struct ggml_context * ctx0 = nullptr;
  3729. // TODO: consider making the entire interface noexcept
  3730. llm_build_context(
  3731. llama_context & lctx,
  3732. const llama_batch & batch,
  3733. const llm_build_cb & cb,
  3734. bool worst_case) :
  3735. model (lctx.model),
  3736. hparams (model.hparams),
  3737. cparams (lctx.cparams),
  3738. batch (batch),
  3739. kv_self (lctx.kv_self),
  3740. n_embd (hparams.n_embd),
  3741. n_layer (hparams.n_layer),
  3742. n_ctx (cparams.n_ctx),
  3743. n_head (hparams.n_head),
  3744. n_head_kv (hparams.n_head_kv),
  3745. n_embd_head_k (hparams.n_embd_head_k),
  3746. n_embd_k_gqa (hparams.n_embd_k_gqa()),
  3747. n_embd_head_v (hparams.n_embd_head_v),
  3748. n_embd_v_gqa (hparams.n_embd_v_gqa()),
  3749. n_expert (hparams.n_expert),
  3750. n_expert_used (hparams.n_expert_used),
  3751. freq_base (cparams.rope_freq_base),
  3752. freq_scale (cparams.rope_freq_scale),
  3753. ext_factor (cparams.yarn_ext_factor),
  3754. attn_factor (cparams.yarn_attn_factor),
  3755. beta_fast (cparams.yarn_beta_fast),
  3756. beta_slow (cparams.yarn_beta_slow),
  3757. norm_eps (hparams.f_norm_eps),
  3758. norm_rms_eps (hparams.f_norm_rms_eps),
  3759. n_tokens (batch.n_tokens),
  3760. n_kv (worst_case ? n_ctx : kv_self.n),
  3761. kv_head (worst_case ? n_ctx - n_tokens : kv_self.head),
  3762. n_orig_ctx (cparams.n_yarn_orig_ctx),
  3763. do_rope_shift (worst_case || kv_self.has_shift),
  3764. cb (cb),
  3765. buf_compute_meta (lctx.buf_compute_meta) {
  3766. GGML_ASSERT(!!kv_self.ctx);
  3767. // all initializations should be done in init()
  3768. }
  3769. void init() {
  3770. struct ggml_init_params params = {
  3771. /*.mem_size =*/ buf_compute_meta.size(),
  3772. /*.mem_buffer =*/ buf_compute_meta.data(),
  3773. /*.no_alloc =*/ true,
  3774. };
  3775. ctx0 = ggml_init(params);
  3776. }
  3777. void free() {
  3778. if (ctx0) {
  3779. ggml_free(ctx0);
  3780. ctx0 = nullptr;
  3781. }
  3782. }
  3783. struct ggml_cgraph * build_llama() {
  3784. struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
  3785. const int64_t n_embd_head = hparams.n_embd_head_v;
  3786. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  3787. GGML_ASSERT(n_embd_head == hparams.n_rot);
  3788. struct ggml_tensor * cur;
  3789. struct ggml_tensor * inpL;
  3790. inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
  3791. cb(inpL, "inp_embd", -1);
  3792. // inp_pos - contains the positions
  3793. struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
  3794. cb(inp_pos, "inp_pos", -1);
  3795. // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
  3796. struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
  3797. cb(KQ_mask, "KQ_mask", -1);
  3798. // shift the entire K-cache if needed
  3799. if (do_rope_shift) {
  3800. llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
  3801. }
  3802. for (int il = 0; il < n_layer; ++il) {
  3803. struct ggml_tensor * inpSA = inpL;
  3804. // norm
  3805. cur = llm_build_norm(ctx0, inpL, hparams,
  3806. model.layers[il].attn_norm, NULL,
  3807. LLM_NORM_RMS, cb, il);
  3808. cb(cur, "attn_norm", il);
  3809. // self-attention
  3810. {
  3811. // compute Q and K and RoPE them
  3812. struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
  3813. cb(Qcur, "Qcur", il);
  3814. if (model.layers[il].bq) {
  3815. Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
  3816. cb(Qcur, "Qcur", il);
  3817. }
  3818. struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
  3819. cb(Kcur, "Kcur", il);
  3820. if (model.layers[il].bk) {
  3821. Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
  3822. cb(Kcur, "Kcur", il);
  3823. }
  3824. struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
  3825. cb(Vcur, "Vcur", il);
  3826. if (model.layers[il].bv) {
  3827. Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
  3828. cb(Vcur, "Vcur", il);
  3829. }
  3830. Qcur = ggml_rope_custom(
  3831. ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
  3832. hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale,
  3833. ext_factor, attn_factor, beta_fast, beta_slow
  3834. );
  3835. cb(Qcur, "Qcur", il);
  3836. Kcur = ggml_rope_custom(
  3837. ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
  3838. hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale,
  3839. ext_factor, attn_factor, beta_fast, beta_slow
  3840. );
  3841. cb(Kcur, "Kcur", il);
  3842. llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
  3843. cur = llm_build_kqv(ctx0, model, hparams, kv_self,
  3844. model.layers[il].wo, model.layers[il].bo,
  3845. Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  3846. cb(cur, "kqv_out", il);
  3847. }
  3848. struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  3849. cb(ffn_inp, "ffn_inp", il);
  3850. // feed-forward network
  3851. if (model.layers[il].ffn_gate_inp == nullptr) {
  3852. cur = llm_build_norm(ctx0, ffn_inp, hparams,
  3853. model.layers[il].ffn_norm, NULL,
  3854. LLM_NORM_RMS, cb, il);
  3855. cb(cur, "ffn_norm", il);
  3856. cur = llm_build_ffn(ctx0, cur,
  3857. model.layers[il].ffn_up, NULL,
  3858. model.layers[il].ffn_gate, NULL,
  3859. model.layers[il].ffn_down, NULL,
  3860. NULL,
  3861. LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
  3862. cb(cur, "ffn_out", il);
  3863. } else {
  3864. // MoE branch
  3865. cur = llm_build_norm(ctx0, ffn_inp, hparams,
  3866. model.layers[il].ffn_norm, NULL,
  3867. LLM_NORM_RMS, cb, il);
  3868. cb(cur, "ffn_norm", il);
  3869. ggml_tensor * logits = ggml_mul_mat(ctx0, model.layers[il].ffn_gate_inp, cur); // [n_tokens, num_experts]
  3870. cb(logits, "ffn_moe_logits", il);
  3871. ggml_tensor * probs = ggml_soft_max(ctx0, logits); // [n_tokens, num_experts]
  3872. cb(probs, "ffn_moe_probs", il);
  3873. // select experts
  3874. ggml_tensor * selected_experts = ggml_top_k(ctx0, probs, n_expert_used); // [n_tokens, num_experts_per_tok]
  3875. cb(selected_experts->src[0], "ffn_moe_argsort", il);
  3876. ggml_tensor * weights = ggml_get_rows(ctx0,
  3877. ggml_reshape_3d(ctx0, probs, 1, n_expert, n_tokens), selected_experts);
  3878. cb(weights, "ffn_moe_weights", il);
  3879. weights = ggml_reshape_2d(ctx0, weights, n_expert_used, n_tokens); // [n_tokens, num_experts_per_tok]
  3880. ggml_tensor * weights_sum = ggml_sum_rows(ctx0, weights);
  3881. cb(weights_sum, "ffn_moe_weights_sum", il);
  3882. weights = ggml_div(ctx0, weights, weights_sum); // [n_tokens, num_experts_per_tok]
  3883. cb(weights, "ffn_moe_weights_norm", il);
  3884. // compute expert outputs
  3885. ggml_tensor * moe_out = nullptr;
  3886. for (int i = 0; i < n_expert_used; ++i) {
  3887. ggml_tensor * cur_expert;
  3888. ggml_tensor * cur_up = ggml_mul_mat_id(ctx0, model.layers[il].ffn_up_exp, n_expert, selected_experts, i, cur);
  3889. cb(cur_up, "ffn_moe_up", il);
  3890. ggml_tensor * cur_gate = ggml_mul_mat_id(ctx0, model.layers[il].ffn_gate_exp, n_expert, selected_experts, i, cur);
  3891. cb(cur_gate, "ffn_moe_gate", il);
  3892. cur_gate = ggml_silu(ctx0, cur_gate);
  3893. cb(cur_gate, "ffn_moe_silu", il);
  3894. cur_expert = ggml_mul(ctx0, cur_up, cur_gate); // [n_tokens, n_embd]
  3895. cb(cur_expert, "ffn_moe_gate_par", il);
  3896. cur_expert = ggml_mul_mat_id(ctx0, model.layers[il].ffn_down_exp, n_expert, selected_experts, i, cur_expert); // [n_tokens, n_embd]
  3897. cb(cur_expert, "ffn_moe_down", il);
  3898. cur_expert = ggml_mul(ctx0, cur_expert,
  3899. ggml_view_2d(ctx0, weights, 1, n_tokens, weights->nb[1], i*weights->nb[0]));
  3900. cb(cur_expert, "ffn_moe_weighted", il);
  3901. if (i == 0) {
  3902. moe_out = cur_expert;
  3903. } else {
  3904. moe_out = ggml_add(ctx0, moe_out, cur_expert);
  3905. cb(moe_out, "ffn_moe_out", il);
  3906. }
  3907. }
  3908. cur = moe_out;
  3909. }
  3910. cur = ggml_add(ctx0, cur, ffn_inp);
  3911. cb(cur, "l_out", il);
  3912. // input for next layer
  3913. inpL = cur;
  3914. }
  3915. cur = inpL;
  3916. cur = llm_build_norm(ctx0, cur, hparams,
  3917. model.output_norm, NULL,
  3918. LLM_NORM_RMS, cb, -1);
  3919. cb(cur, "result_norm", -1);
  3920. // lm_head
  3921. cur = ggml_mul_mat(ctx0, model.output, cur);
  3922. cb(cur, "result_output", -1);
  3923. ggml_build_forward_expand(gf, cur);
  3924. return gf;
  3925. }
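// NOTE (editor): the MoE branch in build_llama (taken when ffn_gate_inp is present,
// e.g. Mixtral-style checkpoints) routes each token through n_expert_used of the
// n_expert experts: softmax over the router logits, ggml_top_k to select the experts,
// renormalize the selected weights to sum to 1, then accumulate
// weight_i * expert_i(x) into moe_out.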
  3926. struct ggml_cgraph * build_baichuan() {
  3927. struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
  3928. const int64_t n_embd_head = hparams.n_embd_head_v;
  3929. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  3930. GGML_ASSERT(n_embd_head == hparams.n_rot);
  3931. struct ggml_tensor * cur;
  3932. struct ggml_tensor * inpL;
  3933. inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
  3934. cb(inpL, "inp_embd", -1);
  3935. // inp_pos - contains the positions
  3936. struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
  3937. cb(inp_pos, "inp_pos", -1);
  3938. // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
  3939. struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
  3940. cb(KQ_mask, "KQ_mask", -1);
  3941. // shift the entire K-cache if needed
  3942. if (do_rope_shift) {
  3943. llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
  3944. }
  3945. for (int il = 0; il < n_layer; ++il) {
  3946. struct ggml_tensor * inpSA = inpL;
  3947. cur = llm_build_norm(ctx0, inpL, hparams,
  3948. model.layers[il].attn_norm, NULL,
  3949. LLM_NORM_RMS, cb, il);
  3950. cb(cur, "attn_norm", il);
  3951. // self-attention
  3952. {
  3953. struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
  3954. cb(Qcur, "Qcur", il);
  3955. struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
  3956. cb(Kcur, "Kcur", il);
  3957. struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
  3958. cb(Vcur, "Vcur", il);
  3959. switch (model.type) {
  3960. case MODEL_7B:
  3961. Qcur = ggml_rope_custom(
  3962. ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
  3963. hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale,
  3964. ext_factor, attn_factor, beta_fast, beta_slow
  3965. );
  3966. Kcur = ggml_rope_custom(
  3967. ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
  3968. hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale,
  3969. ext_factor, attn_factor, beta_fast, beta_slow
  3970. );
  3971. break;
  3972. case MODEL_13B:
  3973. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd/n_head, n_head, n_tokens);
  3974. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd/n_head, n_head, n_tokens);
  3975. break;
  3976. default:
  3977. GGML_ASSERT(false);
  3978. }
  3979. cb(Qcur, "Qcur", il);
  3980. cb(Kcur, "Kcur", il);
  3981. llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
  3982. // apply ALiBi for 13B model
  3983. const float max_alibi_bias = model.type == MODEL_13B ? 8.0f : -1.0f;
  3984. cur = llm_build_kqv(ctx0, model, hparams, kv_self,
  3985. model.layers[il].wo, NULL,
  3986. Qcur, KQ_mask, n_ctx, n_tokens, n_kv, max_alibi_bias, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  3987. cb(cur, "kqv_out", il);
  3988. }
  3989. struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  3990. cb(ffn_inp, "ffn_inp", il);
  3991. // feed-forward network
  3992. {
  3993. cur = llm_build_norm(ctx0, ffn_inp, hparams,
  3994. model.layers[il].ffn_norm, NULL,
  3995. LLM_NORM_RMS, cb, il);
  3996. cb(cur, "ffn_norm", il);
  3997. cur = llm_build_ffn(ctx0, cur,
  3998. model.layers[il].ffn_up, NULL,
  3999. model.layers[il].ffn_gate, NULL,
  4000. model.layers[il].ffn_down, NULL,
  4001. NULL,
  4002. LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
  4003. cb(cur, "ffn_out", il);
  4004. }
  4005. cur = ggml_add(ctx0, cur, ffn_inp);
  4006. cb(cur, "l_out", il);
  4007. // input for next layer
  4008. inpL = cur;
  4009. }
  4010. cur = inpL;
  4011. cur = llm_build_norm(ctx0, cur, hparams,
  4012. model.output_norm, NULL,
  4013. LLM_NORM_RMS, cb, -1);
  4014. cb(cur, "result_norm", -1);
  4015. // lm_head
  4016. cur = ggml_mul_mat(ctx0, model.output, cur);
  4017. cb(cur, "result_output", -1);
  4018. ggml_build_forward_expand(gf, cur);
  4019. return gf;
  4020. }
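// NOTE (editor): Baichuan-7B uses RoPE exactly like llama, while the 13B variant skips
// RoPE and relies on ALiBi instead (max_alibi_bias = 8.0f passed to llm_build_kqv);
// any other model type hits the GGML_ASSERT(false) above.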
  4021. struct ggml_cgraph * build_falcon() {
  4022. struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
  4023. const int64_t n_embd_head = hparams.n_embd_head_v;
  4024. const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
  4025. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  4026. GGML_ASSERT(n_embd_head == hparams.n_rot);
  4027. struct ggml_tensor * cur;
  4028. struct ggml_tensor * inpL;
  4029. inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
  4030. cb(inpL, "inp_embd", -1);
  4031. // inp_pos - contains the positions
  4032. struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
  4033. cb(inp_pos, "inp_pos", -1);
  4034. // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
  4035. struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
  4036. cb(KQ_mask, "KQ_mask", -1);
  4037. // shift the entire K-cache if needed
  4038. if (do_rope_shift) {
  4039. llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
  4040. }
  4041. for (int il = 0; il < n_layer; ++il) {
  4042. struct ggml_tensor * attn_norm;
  4043. attn_norm = llm_build_norm(ctx0, inpL, hparams,
  4044. model.layers[il].attn_norm,
  4045. model.layers[il].attn_norm_b,
  4046. LLM_NORM, cb, il);
  4047. cb(attn_norm, "attn_norm", il);
  4048. // self-attention
  4049. {
  4050. if (model.layers[il].attn_norm_2) {
  4051. // Falcon-40B
  4052. cur = llm_build_norm(ctx0, inpL, hparams,
  4053. model.layers[il].attn_norm_2,
  4054. model.layers[il].attn_norm_2_b,
  4055. LLM_NORM, cb, il);
  4056. cb(cur, "attn_norm_2", il);
  4057. } else {
  4058. cur = attn_norm;
  4059. }
  4060. cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
  4061. cb(cur, "wqkv", il);
  4062. struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
  4063. struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
  4064. struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
  4065. cb(Qcur, "Qcur", il);
  4066. cb(Kcur, "Kcur", il);
  4067. cb(Vcur, "Vcur", il);
  4068. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  4069. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  4070. // using mode = 2 for neox mode
  4071. Qcur = ggml_rope_custom(
  4072. ctx0, Qcur, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx,
  4073. freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
  4074. );
  4075. cb(Qcur, "Qcur", il);
  4076. Kcur = ggml_rope_custom(
  4077. ctx0, Kcur, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx,
  4078. freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
  4079. );
  4080. cb(Kcur, "Kcur", il);
  4081. llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
  4082. cur = llm_build_kqv(ctx0, model, hparams, kv_self,
  4083. model.layers[il].wo, NULL,
  4084. Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  4085. cb(cur, "kqv_out", il);
  4086. }
  4087. struct ggml_tensor * ffn_inp = cur;
  4088. // feed forward
  4089. {
  4090. cur = llm_build_ffn(ctx0, attn_norm, // !! use the attn norm, not the result
  4091. model.layers[il].ffn_up, NULL,
  4092. NULL, NULL,
  4093. model.layers[il].ffn_down, NULL,
  4094. NULL,
  4095. LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
  4096. cb(cur, "ffn_out", il);
  4097. }
  4098. cur = ggml_add(ctx0, cur, ffn_inp);
  4099. cb(cur, "l_out", il);
  4100. cur = ggml_add(ctx0, cur, inpL);
  4101. cb(cur, "l_out", il);
  4102. // input for next layer
  4103. inpL = cur;
  4104. }
  4105. cur = inpL;
  4106. // norm
  4107. cur = llm_build_norm(ctx0, cur, hparams,
  4108. model.output_norm,
  4109. model.output_norm_b,
  4110. LLM_NORM, cb, -1);
  4111. cb(cur, "result_norm", -1);
  4112. cur = ggml_mul_mat(ctx0, model.output, cur);
  4113. cb(cur, "result_output", -1);
  4114. ggml_build_forward_expand(gf, cur);
  4115. return gf;
  4116. }
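// NOTE (editor): Falcon uses a parallel attention/FFN layout: the FFN reads attn_norm
// (not the attention output), and both the attention result (ffn_inp) and the FFN
// result are added back onto the layer input inpL.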
  4117. struct ggml_cgraph * build_starcoder() {
  4118. struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
  4119. const int64_t n_embd_head = hparams.n_embd_head_v;
  4120. const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
  4121. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  4122. struct ggml_tensor * cur;
  4123. struct ggml_tensor * pos;
  4124. struct ggml_tensor * inpL;
  4125. inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
  4126. cb(inpL, "inp_embd", -1);
  4127. // inp_pos - contains the positions
  4128. struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
  4129. cb(inp_pos, "inp_pos", -1);
  4130. // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
  4131. struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
  4132. cb(KQ_mask, "KQ_mask", -1);
  4133. pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos);
  4134. cb(pos, "pos_embd", -1);
  4135. inpL = ggml_add(ctx0, inpL, pos);
  4136. cb(inpL, "inpL", -1);
  4137. for (int il = 0; il < n_layer; ++il) {
  4138. cur = llm_build_norm(ctx0, inpL, hparams,
  4139. model.layers[il].attn_norm,
  4140. model.layers[il].attn_norm_b,
  4141. LLM_NORM, cb, il);
  4142. cb(cur, "attn_norm", il);
  4143. // self-attention
  4144. {
  4145. cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
  4146. cb(cur, "wqkv", il);
  4147. cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
  4148. cb(cur, "bqkv", il);
  4149. struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
  4150. struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
  4151. struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
  4152. cb(Qcur, "Qcur", il);
  4153. cb(Kcur, "Kcur", il);
  4154. cb(Vcur, "Vcur", il);
  4155. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  4156. llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
  4157. cur = llm_build_kqv(ctx0, model, hparams, kv_self,
  4158. model.layers[il].wo, model.layers[il].bo,
  4159. Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  4160. cb(cur, "kqv_out", il);
  4161. }
  4162. // add the input
  4163. struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
  4164. cb(ffn_inp, "ffn_inp", il);
  4165. // FF
  4166. {
  4167. cur = llm_build_norm(ctx0, ffn_inp, hparams,
  4168. model.layers[il].ffn_norm,
  4169. model.layers[il].ffn_norm_b,
  4170. LLM_NORM, cb, il);
  4171. cb(cur, "ffn_norm", il);
  4172. cur = llm_build_ffn(ctx0, cur,
  4173. model.layers[il].ffn_up, model.layers[il].ffn_up_b,
  4174. NULL, NULL,
  4175. model.layers[il].ffn_down, model.layers[il].ffn_down_b,
  4176. NULL,
  4177. LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
  4178. cb(cur, "ffn_out", il);
  4179. }
  4180. inpL = ggml_add(ctx0, cur, ffn_inp);
  4181. cb(inpL, "l_out", il);
  4182. }
  4183. cur = llm_build_norm(ctx0, inpL, hparams,
  4184. model.output_norm,
  4185. model.output_norm_b,
  4186. LLM_NORM, cb, -1);
  4187. cb(cur, "result_norm", -1);
  4188. cur = ggml_mul_mat(ctx0, model.output, cur);
  4189. cb(cur, "result_output", -1);
  4190. ggml_build_forward_expand(gf, cur);
  4191. return gf;
  4192. }
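// build_persimmon: the fused QKV output is split per head, Q and K get their own LayerNorm, and only the
// first n_rot dims of each head (n_embd_head/2) are rotated with RoPE; the rotated and pass-through halves
// are concatenated (via permutes, since ggml_concat works on dim 2) before attention. FFN uses squared ReLU.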
  4193. struct ggml_cgraph * build_persimmon() {
  4194. struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
  4195. const int64_t n_embd_head = hparams.n_embd_head_v;
  4196. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  4197. GGML_ASSERT(n_embd_head/2 == hparams.n_rot);
  4198. struct ggml_tensor * cur;
  4199. struct ggml_tensor * inpL;
  4200. inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
  4201. cb(inpL, "inp_embd", -1);
  4202. // inp_pos - contains the positions
  4203. struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
  4204. cb(inp_pos, "inp_pos", -1);
  4205. // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
  4206. struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
  4207. cb(KQ_mask, "KQ_mask", -1);
  4208. if (do_rope_shift) {
  4209. llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
  4210. }
  4211. for (int il = 0; il < n_layer; ++il) {
  4212. struct ggml_tensor * residual = inpL;
  4213. cur = llm_build_norm(ctx0, inpL, hparams,
  4214. model.layers[il].attn_norm,
  4215. model.layers[il].attn_norm_b,
  4216. LLM_NORM, cb, il);
  4217. cb(cur, "attn_norm", il);
4218. // self-attention
  4219. {
  4220. cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
  4221. cb(cur, "wqkv", il);
  4222. cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
  4223. cb(cur, "bqkv", il);
  4224. // split qkv
  4225. GGML_ASSERT(n_head_kv == n_head);
  4226. struct ggml_tensor * tmpqkv = ggml_reshape_4d(ctx0, cur, n_embd_head, 3, n_head, n_tokens);
  4227. cb(tmpqkv, "tmpqkv", il);
  4228. struct ggml_tensor * tmpqkv_perm = ggml_cont(ctx0, ggml_permute(ctx0, tmpqkv, 0, 3, 1, 2));
  4229. cb(tmpqkv_perm, "tmpqkv", il);
  4230. struct ggml_tensor * tmpq = ggml_view_3d(
  4231. ctx0, tmpqkv_perm, n_embd_head, n_head, n_tokens,
  4232. ggml_element_size(tmpqkv_perm) * n_embd_head,
  4233. ggml_element_size(tmpqkv_perm) * n_embd_head * n_head,
  4234. 0
  4235. );
  4236. cb(tmpq, "tmpq", il);
  4237. struct ggml_tensor * tmpk = ggml_view_3d(
  4238. ctx0, tmpqkv_perm, n_embd_head, n_head, n_tokens,
  4239. ggml_element_size(tmpqkv_perm) * n_embd_head,
  4240. ggml_element_size(tmpqkv_perm) * n_embd_head * n_head,
  4241. ggml_element_size(tmpqkv_perm) * n_embd_head * n_head * n_tokens
  4242. );
  4243. cb(tmpk, "tmpk", il);
  4244. // Q/K Layernorm
  4245. tmpq = llm_build_norm(ctx0, tmpq, hparams,
  4246. model.layers[il].attn_q_norm,
  4247. model.layers[il].attn_q_norm_b,
  4248. LLM_NORM, cb, il);
  4249. cb(tmpq, "tmpq", il);
  4250. tmpk = llm_build_norm(ctx0, tmpk, hparams,
  4251. model.layers[il].attn_k_norm,
  4252. model.layers[il].attn_k_norm_b,
  4253. LLM_NORM, cb, il);
  4254. cb(tmpk, "tmpk", il);
  4255. // RoPE the first n_rot of q/k, pass the other half, and concat.
  4256. struct ggml_tensor * qrot = ggml_view_3d(
  4257. ctx0, tmpq, hparams.n_rot, n_head, n_tokens,
  4258. ggml_element_size(tmpq) * n_embd_head,
  4259. ggml_element_size(tmpq) * n_embd_head * n_head,
  4260. 0
  4261. );
  4262. cb(qrot, "qrot", il);
  4263. struct ggml_tensor * krot = ggml_view_3d(
  4264. ctx0, tmpk, hparams.n_rot, n_head, n_tokens,
  4265. ggml_element_size(tmpk) * n_embd_head,
  4266. ggml_element_size(tmpk) * n_embd_head * n_head,
  4267. 0
  4268. );
  4269. cb(krot, "krot", il);
4270. // get the second half of tmpq, i.e. tmpq[n_rot:, :, :]
  4271. struct ggml_tensor * qpass = ggml_view_3d(
  4272. ctx0, tmpq, hparams.n_rot, n_head, n_tokens,
  4273. ggml_element_size(tmpq) * n_embd_head,
  4274. ggml_element_size(tmpq) * n_embd_head * n_head,
  4275. ggml_element_size(tmpq) * hparams.n_rot
  4276. );
  4277. cb(qpass, "qpass", il);
  4278. struct ggml_tensor * kpass = ggml_view_3d(
  4279. ctx0, tmpk, hparams.n_rot, n_head, n_tokens,
  4280. ggml_element_size(tmpk) * n_embd_head,
  4281. ggml_element_size(tmpk) * n_embd_head * n_head,
  4282. ggml_element_size(tmpk) * hparams.n_rot
  4283. );
  4284. cb(kpass, "kpass", il);
  4285. struct ggml_tensor * qrotated = ggml_rope_custom(
  4286. ctx0, qrot, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx,
  4287. freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
  4288. );
  4289. cb(qrotated, "qrotated", il);
  4290. struct ggml_tensor * krotated = ggml_rope_custom(
  4291. ctx0, krot, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx,
  4292. freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
  4293. );
  4294. cb(krotated, "krotated", il);
  4295. // ggml currently only supports concatenation on dim=2
  4296. // so we need to permute qrot, qpass, concat, then permute back.
  4297. qrotated = ggml_cont(ctx0, ggml_permute(ctx0, qrotated, 2, 1, 0, 3));
  4298. cb(qrotated, "qrotated", il);
  4299. krotated = ggml_cont(ctx0, ggml_permute(ctx0, krotated, 2, 1, 0, 3));
  4300. cb(krotated, "krotated", il);
  4301. qpass = ggml_cont(ctx0, ggml_permute(ctx0, qpass, 2, 1, 0, 3));
  4302. cb(qpass, "qpass", il);
  4303. kpass = ggml_cont(ctx0, ggml_permute(ctx0, kpass, 2, 1, 0, 3));
  4304. cb(kpass, "kpass", il);
  4305. struct ggml_tensor * Qcur = ggml_concat(ctx0, qrotated, qpass);
  4306. cb(Qcur, "Qcur", il);
  4307. struct ggml_tensor * Kcur = ggml_concat(ctx0, krotated, kpass);
  4308. cb(Kcur, "Kcur", il);
  4309. struct ggml_tensor * Q = ggml_cont(ctx0, ggml_permute(ctx0, Qcur, 2, 1, 0, 3));
  4310. cb(Q, "Q", il);
  4311. Kcur = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 2, 1, 0, 3));
  4312. cb(Kcur, "Kcur", il);
  4313. struct ggml_tensor * Vcur = ggml_view_3d(
  4314. ctx0, tmpqkv_perm, n_embd_head, n_head, n_tokens,
  4315. ggml_element_size(tmpqkv_perm) * n_embd_head,
  4316. ggml_element_size(tmpqkv_perm) * n_embd_head * n_head,
  4317. ggml_element_size(tmpqkv_perm) * n_embd_head * n_head * n_tokens * 2
  4318. );
  4319. cb(Vcur, "Vcur", il);
  4320. llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
  4321. // TODO: not tested, could be broken
  4322. cur = llm_build_kqv(ctx0, model, hparams, kv_self,
  4323. model.layers[il].wo, model.layers[il].bo,
  4324. Q, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  4325. cb(cur, "kqv_out", il);
  4326. }
  4327. struct ggml_tensor * ffn_inp = ggml_add(ctx0, residual, cur);
  4328. cb(ffn_inp, "ffn_inp", il);
  4329. // feed-forward network
  4330. {
  4331. cur = llm_build_norm(ctx0, ffn_inp, hparams,
  4332. model.layers[il].ffn_norm,
  4333. model.layers[il].ffn_norm_b,
  4334. LLM_NORM, cb, il);
  4335. cb(cur, "ffn_norm", il);
  4336. cur = llm_build_ffn(ctx0, cur,
  4337. model.layers[il].ffn_up, model.layers[il].ffn_up_b,
  4338. NULL, NULL,
  4339. model.layers[il].ffn_down, model.layers[il].ffn_down_b,
  4340. NULL,
  4341. LLM_FFN_RELU_SQR, LLM_FFN_SEQ, cb, il);
  4342. cb(cur, "ffn_out", il);
  4343. }
  4344. cur = ggml_add(ctx0, cur, ffn_inp);
  4345. cb(cur, "l_out", il);
  4346. inpL = cur;
  4347. }
  4348. cur = inpL;
  4349. cur = llm_build_norm(ctx0, cur, hparams,
  4350. model.output_norm,
  4351. model.output_norm_b,
  4352. LLM_NORM, cb, -1);
  4353. cb(cur, "result_norm", -1);
  4354. cur = ggml_mul_mat(ctx0, model.output, cur);
  4355. cb(cur, "result_output", -1);
  4356. ggml_build_forward_expand(gf, cur);
  4357. return gf;
  4358. }
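// build_refact: RMSNorm pre-norm with separate Q/K/V projections and no RoPE; positions are handled via the
// ALiBi-style bias (the 8.0f argument to llm_build_kqv), and the FFN is SiLU-gated (parallel up/gate).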
  4359. struct ggml_cgraph * build_refact() {
  4360. struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
  4361. const int64_t n_embd_head = hparams.n_embd_head_v;
  4362. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  4363. struct ggml_tensor * cur;
  4364. struct ggml_tensor * inpL;
  4365. inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
  4366. cb(inpL, "inp_embd", -1);
  4367. // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
  4368. struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
  4369. cb(KQ_mask, "KQ_mask", -1);
  4370. for (int il = 0; il < n_layer; ++il) {
  4371. struct ggml_tensor * inpSA = inpL;
  4372. cur = llm_build_norm(ctx0, inpL, hparams,
  4373. model.layers[il].attn_norm, NULL,
  4374. LLM_NORM_RMS, cb, il);
  4375. cb(cur, "attn_norm", il);
  4376. // self-attention
  4377. {
  4378. struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
  4379. cb(Qcur, "Qcur", il);
  4380. struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
  4381. cb(Kcur, "Kcur", il);
  4382. struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
  4383. cb(Vcur, "Vcur", il);
  4384. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  4385. cb(Kcur, "Kcur", il);
  4386. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  4387. cb(Qcur, "Qcur", il);
  4388. llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
  4389. cur = llm_build_kqv(ctx0, model, hparams, kv_self,
  4390. model.layers[il].wo, NULL,
  4391. Qcur, KQ_mask, n_ctx, n_tokens, n_kv, 8.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  4392. cb(cur, "kqv_out", il);
  4393. }
  4394. struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  4395. cb(ffn_inp, "ffn_inp", il);
  4396. // feed-forward network
  4397. {
  4398. cur = llm_build_norm(ctx0, ffn_inp, hparams,
  4399. model.layers[il].ffn_norm, NULL,
  4400. LLM_NORM_RMS, cb, il);
  4401. cb(cur, "ffn_norm", il);
  4402. cur = llm_build_ffn(ctx0, cur,
  4403. model.layers[il].ffn_up, NULL,
  4404. model.layers[il].ffn_gate, NULL,
  4405. model.layers[il].ffn_down, NULL,
  4406. NULL,
  4407. LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
  4408. cb(cur, "ffn_out", il);
  4409. }
  4410. cur = ggml_add(ctx0, cur, ffn_inp);
  4411. cb(cur, "l_out", il);
  4412. // input for next layer
  4413. inpL = cur;
  4414. }
  4415. cur = inpL;
  4416. cur = llm_build_norm(ctx0, cur, hparams,
  4417. model.output_norm, NULL,
  4418. LLM_NORM_RMS, cb, -1);
  4419. cb(cur, "result_norm", -1);
  4420. // lm_head
  4421. cur = ggml_mul_mat(ctx0, model.output, cur);
  4422. cb(cur, "result_output", -1);
  4423. ggml_build_forward_expand(gf, cur);
  4424. return gf;
  4425. }
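// build_bloom: the token embeddings are normalized once (tok_norm) before the first block; attention uses a
// fused QKV projection with bias and the ALiBi-style bias (8.0f) instead of positional inputs; GELU FFN.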
  4426. struct ggml_cgraph * build_bloom() {
  4427. struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
  4428. const int64_t n_embd_head = hparams.n_embd_head_v;
  4429. const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
  4430. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  4431. struct ggml_tensor * cur;
  4432. struct ggml_tensor * inpL;
  4433. inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
  4434. cb(inpL, "inp_embd", -1);
  4435. // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
  4436. struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
  4437. cb(KQ_mask, "KQ_mask", -1);
  4438. inpL = llm_build_norm(ctx0, inpL, hparams,
  4439. model.tok_norm,
  4440. model.tok_norm_b,
  4441. LLM_NORM, cb, -1);
  4442. cb(inpL, "inp_norm", -1);
  4443. for (int il = 0; il < n_layer; ++il) {
  4444. cur = llm_build_norm(ctx0, inpL, hparams,
  4445. model.layers[il].attn_norm,
  4446. model.layers[il].attn_norm_b,
  4447. LLM_NORM, cb, il);
  4448. cb(cur, "attn_norm", il);
  4449. // self-attention
  4450. {
  4451. cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
  4452. cb(cur, "wqkv", il);
  4453. cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
  4454. cb(cur, "bqkv", il);
  4455. struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
  4456. struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
  4457. struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
  4458. cb(Qcur, "Qcur", il);
  4459. cb(Kcur, "Kcur", il);
  4460. cb(Vcur, "Vcur", il);
  4461. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  4462. llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
  4463. cur = llm_build_kqv(ctx0, model, hparams, kv_self,
  4464. model.layers[il].wo, model.layers[il].bo,
  4465. Qcur, KQ_mask, n_ctx, n_tokens, n_kv, 8.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  4466. cb(cur, "kqv_out", il);
  4467. }
  4468. // Add the input
  4469. struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
  4470. cb(ffn_inp, "ffn_inp", il);
  4471. // FF
  4472. {
  4473. cur = llm_build_norm(ctx0, ffn_inp, hparams,
  4474. model.layers[il].ffn_norm,
  4475. model.layers[il].ffn_norm_b,
  4476. LLM_NORM, cb, il);
  4477. cb(cur, "ffn_norm", il);
  4478. cur = llm_build_ffn(ctx0, cur,
  4479. model.layers[il].ffn_up, model.layers[il].ffn_up_b,
  4480. NULL, NULL,
  4481. model.layers[il].ffn_down, model.layers[il].ffn_down_b,
  4482. NULL,
  4483. LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
  4484. cb(cur, "ffn_out", il);
  4485. }
  4486. inpL = ggml_add(ctx0, cur, ffn_inp);
  4487. cb(inpL, "l_out", il);
  4488. }
  4489. cur = llm_build_norm(ctx0, inpL, hparams,
  4490. model.output_norm,
  4491. model.output_norm_b,
  4492. LLM_NORM, cb, -1);
  4493. cb(cur, "result_norm", -1);
  4494. cur = ggml_mul_mat(ctx0, model.output, cur);
  4495. cb(cur, "result_output", -1);
  4496. ggml_build_forward_expand(gf, cur);
  4497. return gf;
  4498. }
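// build_mpt: bias-free LayerNorm, fused QKV with optional clamping (f_clamp_kqv), ALiBi with a model-defined
// maximum bias (f_max_alibi_bias) instead of positional inputs, and a sequential GELU feed-forward block.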
  4499. struct ggml_cgraph * build_mpt() {
  4500. struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
  4501. const int64_t n_embd_head = hparams.n_embd_head_v;
  4502. const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
  4503. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  4504. struct ggml_tensor * cur;
  4505. struct ggml_tensor * inpL;
  4506. inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
  4507. cb(inpL, "inp_embd", -1);
  4508. // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
  4509. struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
  4510. cb(KQ_mask, "KQ_mask", -1);
  4511. for (int il = 0; il < n_layer; ++il) {
  4512. struct ggml_tensor * attn_norm;
  4513. attn_norm = llm_build_norm(ctx0, inpL, hparams,
  4514. model.layers[il].attn_norm,
  4515. NULL,
  4516. LLM_NORM, cb, il);
  4517. cb(attn_norm, "attn_norm", il);
  4518. // self-attention
  4519. {
  4520. cur = attn_norm;
  4521. cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
  4522. cb(cur, "wqkv", il);
  4523. if (hparams.f_clamp_kqv > 0.0f) {
  4524. cur = ggml_clamp(ctx0, cur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
  4525. cb(cur, "wqkv_clamped", il);
  4526. }
  4527. struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
  4528. struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
  4529. struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
  4530. cb(Qcur, "Qcur", il);
  4531. cb(Kcur, "Kcur", il);
  4532. cb(Vcur, "Vcur", il);
  4533. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  4534. llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
  4535. cur = llm_build_kqv(ctx0, model, hparams, kv_self,
  4536. model.layers[il].wo, NULL,
  4537. Qcur, KQ_mask, n_ctx, n_tokens, n_kv, hparams.f_max_alibi_bias, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  4538. cb(cur, "kqv_out", il);
  4539. }
  4540. // Add the input
  4541. struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
  4542. cb(ffn_inp, "ffn_inp", il);
  4543. // feed forward
  4544. {
  4545. cur = llm_build_norm(ctx0, ffn_inp, hparams,
  4546. model.layers[il].ffn_norm,
  4547. NULL,
  4548. LLM_NORM, cb, il);
  4549. cb(cur, "ffn_norm", il);
  4550. cur = llm_build_ffn(ctx0, cur,
  4551. model.layers[il].ffn_up, NULL,
  4552. NULL, NULL,
  4553. model.layers[il].ffn_down, NULL,
  4554. model.layers[il].ffn_act,
  4555. LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
  4556. cb(cur, "ffn_out", il);
  4557. }
  4558. cur = ggml_add(ctx0, cur, ffn_inp);
  4559. cb(cur, "l_out", il);
  4560. // input for next layer
  4561. inpL = cur;
  4562. }
  4563. cur = inpL;
  4564. cur = llm_build_norm(ctx0, cur, hparams,
  4565. model.output_norm,
  4566. NULL,
  4567. LLM_NORM, cb, -1);
  4568. cb(cur, "result_norm", -1);
  4569. cur = ggml_mul_mat(ctx0, model.output, cur);
  4570. cb(cur, "result_output", -1);
  4571. ggml_build_forward_expand(gf, cur);
  4572. return gf;
  4573. }
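// build_stablelm: biased LayerNorm, full-head RoPE on Q and K (n_rot == n_embd_head, NeoX mode), and a
// SiLU-gated feed-forward block; the K-cache is rope-shifted when needed (do_rope_shift).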
  4574. struct ggml_cgraph * build_stablelm() {
  4575. struct ggml_cgraph * gf = ggml_new_graph(ctx0);
  4576. const int64_t n_embd_head = hparams.n_embd_head_v;
  4577. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  4578. GGML_ASSERT(n_embd_head == hparams.n_rot);
  4579. struct ggml_tensor * cur;
  4580. struct ggml_tensor * inpL;
  4581. inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
  4582. cb(inpL, "inp_embd", -1);
  4583. // inp_pos - contains the positions
  4584. struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
  4585. cb(inp_pos, "inp_pos", -1);
  4586. // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
  4587. struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
  4588. cb(KQ_mask, "KQ_mask", -1);
  4589. // shift the entire K-cache if needed
  4590. if (do_rope_shift) {
  4591. llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
  4592. }
  4593. for (int il = 0; il < n_layer; ++il) {
  4594. struct ggml_tensor * inpSA = inpL;
  4595. // norm
  4596. cur = llm_build_norm(ctx0, inpL, hparams,
  4597. model.layers[il].attn_norm,
  4598. model.layers[il].attn_norm_b,
  4599. LLM_NORM, cb, il);
  4600. cb(cur, "attn_norm", il);
  4601. // self-attention
  4602. {
  4603. // compute Q and K and RoPE them
  4604. struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
  4605. cb(Qcur, "Qcur", il);
  4606. struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
  4607. cb(Kcur, "Kcur", il);
  4608. struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
  4609. cb(Vcur, "Vcur", il);
  4610. Qcur = ggml_rope_custom(
  4611. ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
  4612. hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
  4613. ext_factor, attn_factor, beta_fast, beta_slow
  4614. );
  4615. cb(Qcur, "Qcur", il);
  4616. Kcur = ggml_rope_custom(
  4617. ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
  4618. hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
  4619. ext_factor, attn_factor, beta_fast, beta_slow
  4620. );
  4621. cb(Kcur, "Kcur", il);
  4622. llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
  4623. cur = llm_build_kqv(ctx0, model, hparams, kv_self,
  4624. model.layers[il].wo, NULL,
  4625. Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  4626. cb(cur, "kqv_out", il);
  4627. }
  4628. struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  4629. cb(ffn_inp, "ffn_inp", il);
  4630. // feed-forward network
  4631. {
  4632. cur = llm_build_norm(ctx0, ffn_inp, hparams,
  4633. model.layers[il].ffn_norm,
  4634. model.layers[il].ffn_norm_b,
  4635. LLM_NORM, cb, il);
  4636. cb(cur, "ffn_norm", il);
  4637. cur = llm_build_ffn(ctx0, cur,
  4638. model.layers[il].ffn_up, NULL,
  4639. model.layers[il].ffn_gate, NULL,
  4640. model.layers[il].ffn_down, NULL,
  4641. NULL,
  4642. LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
  4643. cb(cur, "ffn_out", il);
  4644. }
  4645. cur = ggml_add(ctx0, cur, ffn_inp);
  4646. cb(cur, "l_out", il);
  4647. // input for next layer
  4648. inpL = cur;
  4649. }
  4650. cur = inpL;
  4651. cur = llm_build_norm(ctx0, cur, hparams,
  4652. model.output_norm,
  4653. model.output_norm_b,
  4654. LLM_NORM, cb, -1);
  4655. cb(cur, "result_norm", -1);
  4656. // lm_head
  4657. cur = ggml_mul_mat(ctx0, model.output, cur);
  4658. cb(cur, "result_output", -1);
  4659. ggml_build_forward_expand(gf, cur);
  4660. return gf;
  4661. }
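// build_qwen: RMSNorm, fused QKV with bias split into three n_embd-wide views (Q, K and V all use n_head
// heads here), NeoX-style RoPE on Q and K, and a SiLU-gated feed-forward block.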
  4662. struct ggml_cgraph * build_qwen() {
  4663. struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
  4664. const int64_t n_embd_head = hparams.n_embd_head_v;
  4665. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  4666. struct ggml_tensor * cur;
  4667. struct ggml_tensor * inpL;
  4668. inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
  4669. cb(inpL, "inp_embd", -1);
  4670. // inp_pos - contains the positions
  4671. struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
  4672. cb(inp_pos, "inp_pos", -1);
  4673. // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
  4674. struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
  4675. cb(KQ_mask, "KQ_mask", -1);
  4676. // shift the entire K-cache if needed
  4677. if (do_rope_shift) {
  4678. llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
  4679. }
  4680. for (int il = 0; il < n_layer; ++il) {
  4681. struct ggml_tensor * inpSA = inpL;
  4682. cur = llm_build_norm(ctx0, inpL, hparams,
  4683. model.layers[il].attn_norm, NULL,
  4684. LLM_NORM_RMS, cb, il);
  4685. cb(cur, "attn_norm", il);
  4686. // self-attention
  4687. {
  4688. cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
  4689. cb(cur, "wqkv", il);
  4690. cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
  4691. cb(cur, "bqkv", il);
  4692. struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
  4693. struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
  4694. struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 2*sizeof(float)*(n_embd)));
  4695. cb(Qcur, "Qcur", il);
  4696. cb(Kcur, "Kcur", il);
  4697. cb(Vcur, "Vcur", il);
  4698. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  4699. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
4700. // apply RoPE with mode = 2 (NeoX style)
  4701. Qcur = ggml_rope_custom(
  4702. ctx0, Qcur, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx,
  4703. freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
  4704. );
  4705. cb(Qcur, "Qcur", il);
  4706. Kcur = ggml_rope_custom(
  4707. ctx0, Kcur, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx,
  4708. freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
  4709. );
  4710. cb(Kcur, "Kcur", il);
  4711. llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
  4712. cur = llm_build_kqv(ctx0, model, hparams, kv_self,
  4713. model.layers[il].wo, NULL,
  4714. Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  4715. cb(cur, "kqv_out", il);
  4716. }
  4717. struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  4718. cb(ffn_inp, "ffn_inp", il);
4719. // feed-forward network
  4720. {
  4721. cur = llm_build_norm(ctx0, ffn_inp, hparams,
  4722. model.layers[il].ffn_norm, NULL,
  4723. LLM_NORM_RMS, cb, il);
  4724. cb(cur, "ffn_norm", il);
  4725. cur = llm_build_ffn(ctx0, cur,
  4726. model.layers[il].ffn_up, NULL,
  4727. model.layers[il].ffn_gate, NULL,
  4728. model.layers[il].ffn_down, NULL,
  4729. NULL,
  4730. LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
  4731. cb(cur, "ffn_out", il);
  4732. }
  4733. cur = ggml_add(ctx0, cur, ffn_inp);
  4734. cb(cur, "l_out", il);
  4735. // input for next layer
  4736. inpL = cur;
  4737. }
  4738. cur = inpL;
  4739. cur = llm_build_norm(ctx0, cur, hparams,
  4740. model.output_norm, NULL,
  4741. LLM_NORM_RMS, cb, -1);
  4742. cb(cur, "result_norm", -1);
  4743. // lm_head
  4744. cur = ggml_mul_mat(ctx0, model.output, cur);
  4745. cb(cur, "result_output", -1);
  4746. ggml_build_forward_expand(gf, cur);
  4747. return gf;
  4748. }
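// build_phi2: attention and FFN both consume the same attn_norm output and their results are summed with the
// residual (parallel layer structure); Q is pre-scaled by 1/sqrt(n_embd_head), so llm_build_kqv is called with
// a KQ scale of 1.0f, and the output head carries a bias (model.output_b).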
  4749. struct ggml_cgraph * build_phi2() {
  4750. struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
  4751. const int64_t n_embd_head = hparams.n_embd_head_v;
  4752. const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
  4753. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  4754. struct ggml_tensor * cur;
  4755. struct ggml_tensor * attn_norm_output;
  4756. struct ggml_tensor * ffn_output;
  4757. struct ggml_tensor * inpL;
  4758. inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
  4759. cb(inpL, "inp_embd", -1);
  4760. // inp_pos - contains the positions
  4761. struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
  4762. cb(inp_pos, "inp_pos", -1);
  4763. // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
  4764. struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
  4765. cb(KQ_mask, "KQ_mask", -1);
  4766. // shift the entire K-cache if needed
  4767. if (do_rope_shift) {
  4768. llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
  4769. }
  4770. for (int il = 0; il < n_layer; ++il) {
  4771. attn_norm_output = llm_build_norm(ctx0, inpL, hparams,
  4772. model.layers[il].attn_norm,
  4773. model.layers[il].attn_norm_b,
  4774. LLM_NORM, cb, il);
  4775. cb(attn_norm_output, "attn_norm", il);
  4776. // self-attention
  4777. {
  4778. cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, attn_norm_output);
  4779. cb(cur, "wqkv", il);
  4780. cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
  4781. cb(cur, "bqkv", il);
  4782. struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
  4783. struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
  4784. struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
  4785. cb(Qcur, "Qcur", il);
  4786. cb(Kcur, "Kcur", il);
  4787. cb(Vcur, "Vcur", il);
  4788. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  4789. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  4790. Qcur = ggml_rope_custom(
  4791. ctx0, Qcur, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx,
  4792. freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
  4793. );
  4794. cb(Qcur, "Qcur", il);
  4795. // with phi2, we scale the Q to avoid precision issues
  4796. // ref: https://github.com/ml-explore/mlx-examples/blob/08e862336ade809bc37d1035f94b359e7d1a5152/phi2/phi2.py#L64-L66
  4797. Qcur = ggml_scale(ctx0, Qcur, 1.0f/sqrtf(float(n_embd_head)));
  4798. cb(Qcur, "Qcur", il);
  4799. Kcur = ggml_rope_custom(
  4800. ctx0, Kcur, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx,
  4801. freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
  4802. );
  4803. cb(Kcur, "Kcur", il);
  4804. llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
  4805. cur = llm_build_kqv(ctx0, model, hparams, kv_self,
  4806. model.layers[il].wo, model.layers[il].bo,
  4807. Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f, cb, il);
  4808. cb(cur, "kqv_out", il);
  4809. }
  4810. // FF
  4811. {
  4812. ffn_output = llm_build_ffn(ctx0, attn_norm_output,
  4813. model.layers[il].ffn_up, model.layers[il].ffn_up_b,
  4814. NULL, NULL,
  4815. model.layers[il].ffn_down, model.layers[il].ffn_down_b,
  4816. NULL,
  4817. LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
  4818. cb(ffn_output, "ffn_out", il);
  4819. }
  4820. cur = ggml_add(ctx0, cur, ffn_output);
  4821. cb(cur, "l_out", il);
  4822. cur = ggml_add(ctx0, cur, inpL);
  4823. cb(cur, "l_out", il);
  4824. inpL = cur;
  4825. }
  4826. cur = llm_build_norm(ctx0, inpL, hparams,
  4827. model.output_norm,
  4828. model.output_norm_b,
  4829. LLM_NORM, cb, -1);
  4830. cb(cur, "result_norm", -1);
  4831. cur = ggml_mul_mat(ctx0, model.output, cur);
  4832. cb(cur, "result_output_no_bias", -1);
  4833. cur = ggml_add(ctx0, cur, model.output_b);
  4834. cb(cur, "result_output", -1);
  4835. ggml_build_forward_expand(gf, cur);
  4836. return gf;
  4837. }
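// build_plamo: RMSNorm; the SiLU-gated FFN is computed from the same normalized input as self-attention and
// both results are added together with the layer input (parallel residual structure).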
  4838. struct ggml_cgraph * build_plamo() {
  4839. struct ggml_cgraph * gf = ggml_new_graph(ctx0);
  4840. const int64_t n_embd_head = hparams.n_embd_head_v;
  4841. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  4842. GGML_ASSERT(n_embd_head == hparams.n_rot);
  4843. struct ggml_tensor * cur;
  4844. struct ggml_tensor * inpL;
  4845. inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
  4846. cb(inpL, "inp_embd", -1);
  4847. // inp_pos - contains the positions
  4848. struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
  4849. cb(inp_pos, "inp_pos", -1);
  4850. // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
  4851. struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
  4852. cb(KQ_mask, "KQ_mask", -1);
  4853. // shift the entire K-cache if needed
  4854. if (do_rope_shift) {
  4855. llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
  4856. }
  4857. for (int il = 0; il < n_layer; ++il) {
  4858. // norm
  4859. cur = llm_build_norm(ctx0, inpL, hparams,
  4860. model.layers[il].attn_norm, NULL,
  4861. LLM_NORM_RMS, cb, il);
  4862. cb(cur, "attn_norm", il);
  4863. struct ggml_tensor * attention_norm = cur;
  4864. // self-attention
  4865. {
  4866. // compute Q and K and RoPE them
  4867. struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
  4868. cb(Qcur, "Qcur", il);
  4869. struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
  4870. cb(Kcur, "Kcur", il);
  4871. struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
  4872. cb(Vcur, "Vcur", il);
  4873. Qcur = ggml_rope_custom(
  4874. ctx0, ggml_reshape_3d(ctx0, Qcur, hparams.n_rot, n_head, n_tokens), inp_pos,
  4875. n_embd_head, 2, 0, n_orig_ctx, freq_base, freq_scale,
  4876. ext_factor, attn_factor, beta_fast, beta_slow);
  4877. cb(Qcur, "Qcur", il);
  4878. Kcur = ggml_rope_custom(
  4879. ctx0, ggml_reshape_3d(ctx0, Kcur, hparams.n_rot, n_head_kv, n_tokens), inp_pos,
  4880. n_embd_head, 2, 0, n_orig_ctx, freq_base, freq_scale,
  4881. ext_factor, attn_factor, beta_fast, beta_slow);
  4882. cb(Kcur, "Kcur", il);
  4883. llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
  4884. cur = llm_build_kqv(ctx0, model, hparams, kv_self,
  4885. model.layers[il].wo, NULL,
  4886. Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  4887. cb(cur, "kqv_out", il);
  4888. }
  4889. struct ggml_tensor * sa_out = cur;
  4890. cur = attention_norm;
  4891. // feed-forward network
  4892. {
  4893. cur = llm_build_ffn(ctx0, cur,
  4894. model.layers[il].ffn_up, NULL,
  4895. model.layers[il].ffn_gate, NULL,
  4896. model.layers[il].ffn_down, NULL,
  4897. NULL,
  4898. LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
  4899. cb(cur, "ffn_out", il);
  4900. }
  4901. cur = ggml_add(ctx0, cur, sa_out);
  4902. cb(cur, "l_out", il);
  4903. cur = ggml_add(ctx0, cur, inpL);
  4904. cb(cur, "l_out", il);
  4905. // input for next layer
  4906. inpL = cur;
  4907. }
  4908. cur = inpL;
  4909. cur = llm_build_norm(ctx0, cur, hparams,
  4910. model.output_norm, NULL,
  4911. LLM_NORM_RMS, cb, -1);
  4912. cb(cur, "result_norm", -1);
  4913. // lm_head
  4914. cur = ggml_mul_mat(ctx0, model.output, cur);
  4915. cb(cur, "result_output", -1);
  4916. ggml_build_forward_expand(gf, cur);
  4917. return gf;
  4918. }
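// build_gpt2: learned absolute position embeddings added to the token embeddings, fused QKV projection with
// bias, biased LayerNorm, and a sequential GELU feed-forward block; no RoPE.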
  4919. struct ggml_cgraph * build_gpt2() {
  4920. struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
  4921. const int64_t n_embd_head = hparams.n_embd_head_v;
  4922. const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
  4923. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  4924. struct ggml_tensor * cur;
  4925. struct ggml_tensor * pos;
  4926. struct ggml_tensor * inpL;
  4927. inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
  4928. cb(inpL, "inp_embd", -1);
  4929. // inp_pos - contains the positions
  4930. struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
  4931. cb(inp_pos, "inp_pos", -1);
  4932. // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
  4933. struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
  4934. cb(KQ_mask, "KQ_mask", -1);
  4935. pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos);
  4936. cb(pos, "pos_embd", -1);
  4937. inpL = ggml_add(ctx0, inpL, pos);
  4938. cb(inpL, "inpL", -1);
  4939. for (int il = 0; il < n_layer; ++il) {
  4940. cur = llm_build_norm(ctx0, inpL, hparams,
  4941. model.layers[il].attn_norm,
  4942. model.layers[il].attn_norm_b,
  4943. LLM_NORM, cb, il);
  4944. cb(cur, "attn_norm", il);
  4945. // self-attention
  4946. {
  4947. cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
  4948. cb(cur, "wqkv", il);
  4949. cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
  4950. cb(cur, "bqkv", il);
  4951. struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
  4952. struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
  4953. struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
  4954. cb(Qcur, "Qcur", il);
  4955. cb(Kcur, "Kcur", il);
  4956. cb(Vcur, "Vcur", il);
  4957. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  4958. llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
  4959. cur = llm_build_kqv(ctx0, model, hparams, kv_self,
  4960. model.layers[il].wo, model.layers[il].bo,
  4961. Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  4962. cb(cur, "kqv_out", il);
  4963. }
  4964. // add the input
  4965. struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
  4966. cb(ffn_inp, "ffn_inp", il);
  4967. // FF
  4968. {
  4969. cur = llm_build_norm(ctx0, ffn_inp, hparams,
  4970. model.layers[il].ffn_norm,
  4971. model.layers[il].ffn_norm_b,
  4972. LLM_NORM, cb, il);
  4973. cb(cur, "ffn_norm", il);
  4974. cur = llm_build_ffn(ctx0, cur,
  4975. model.layers[il].ffn_up, model.layers[il].ffn_up_b,
  4976. NULL, NULL,
  4977. model.layers[il].ffn_down, model.layers[il].ffn_down_b,
  4978. NULL,
  4979. LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
  4980. cb(cur, "ffn_out", il);
  4981. }
  4982. inpL = ggml_add(ctx0, cur, ffn_inp);
  4983. cb(inpL, "l_out", il);
  4984. }
  4985. cur = llm_build_norm(ctx0, inpL, hparams,
  4986. model.output_norm,
  4987. model.output_norm_b,
  4988. LLM_NORM, cb, -1);
  4989. cb(cur, "result_norm", -1);
  4990. cur = ggml_mul_mat(ctx0, model.output, cur);
  4991. cb(cur, "result_output", -1);
  4992. ggml_build_forward_expand(gf, cur);
  4993. return gf;
  4994. }
  4995. };
  4996. //
  4997. // tensor offloading helpers
  4998. //
  4999. // TODO: will be removed with backend v2
  5000. enum llm_offload_func_e {
  5001. OFFLOAD_FUNC_NOP,
  5002. OFFLOAD_FUNC,
  5003. OFFLOAD_FUNC_FRC, // force offload
  5004. OFFLOAD_FUNC_KQV,
  5005. OFFLOAD_FUNC_NR,
  5006. OFFLOAD_FUNC_EMB, // embeddings
  5007. OFFLOAD_FUNC_OUT,
  5008. };
  5009. // TODO: will be removed with backend v2
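// byte-indexed trie mapping a tensor base name (e.g. "Qcur") to its offload function; lookups walk one child
// node per character and fall back to OFFLOAD_FUNC_NOP for names that are not in the map.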
  5010. struct llm_offload_trie {
  5011. struct node {
  5012. ~node() {
  5013. for (int i = 0; i < 256; ++i) {
  5014. if (children[i]) {
  5015. delete children[i];
  5016. }
  5017. }
  5018. }
  5019. node * children[256] = { nullptr };
  5020. llm_offload_func_e func = OFFLOAD_FUNC_NOP;
  5021. };
  5022. llm_offload_trie() {
  5023. root = new node;
  5024. }
  5025. llm_offload_trie(const std::unordered_map<const char *, llm_offload_func_e> & map) {
  5026. root = new node;
  5027. for (const auto & kv : map) {
  5028. add(kv.first, kv.second);
  5029. }
  5030. }
  5031. ~llm_offload_trie() {
  5032. delete root;
  5033. }
  5034. void add(const char * name, llm_offload_func_e func) {
  5035. node * cur = root;
  5036. for (int i = 0; ; ++i) {
  5037. const uint8_t c = name[i];
  5038. if (!c) {
  5039. break;
  5040. }
  5041. if (!cur->children[c]) {
  5042. cur->children[c] = new node;
  5043. }
  5044. cur = cur->children[c];
  5045. }
  5046. cur->func = func;
  5047. }
  5048. llm_offload_func_e find(const char * name) const {
  5049. const node * cur = root;
  5050. for (int i = 0; ; ++i) {
  5051. const uint8_t c = name[i];
  5052. if (!c) {
  5053. break;
  5054. }
  5055. if (!cur->children[c]) {
  5056. return OFFLOAD_FUNC_NOP;
  5057. }
  5058. cur = cur->children[c];
  5059. }
  5060. return cur->func;
  5061. }
  5062. node * root = nullptr;
  5063. };
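// usage sketch (illustrative only - the single instance actually used below is k_offload_func_trie):
//   llm_offload_trie trie(k_offload_map);
//   llm_offload_func_e f = trie.find("Qcur");     // -> OFFLOAD_FUNC_KQV
//   llm_offload_func_e g = trie.find("unknown");  // -> OFFLOAD_FUNC_NOP (not offloaded)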
  5064. // TODO: will be removed with backend v2
  5065. static const std::unordered_map<const char *, llm_offload_func_e> k_offload_map = {
  5066. //{ "inp_tokens", OFFLOAD_FUNC_NR }, // TODO: missing K-quants get_rows kernel
  5067. //{ "inp_embd", OFFLOAD_FUNC_NR }, // TODO: missing K-quants get_rows kernel
  5068. { "pos_embd", OFFLOAD_FUNC_NR },
  5069. { "inp_pos", OFFLOAD_FUNC_FRC }, // this is often used for KQ ops (e.g. rope)
  5070. { "KQ_mask", OFFLOAD_FUNC_FRC },
  5071. { "K_shift", OFFLOAD_FUNC_FRC },
  5072. { "K_shifted", OFFLOAD_FUNC },
  5073. { "inp_norm", OFFLOAD_FUNC_NR },
  5074. { "inp_norm_w", OFFLOAD_FUNC_NR },
  5075. { "inp_norm_wb", OFFLOAD_FUNC_NR },
  5076. { "norm", OFFLOAD_FUNC },
  5077. { "norm_w", OFFLOAD_FUNC },
  5078. { "norm_wb", OFFLOAD_FUNC },
  5079. { "attn_norm", OFFLOAD_FUNC },
  5080. { "attn_norm_2", OFFLOAD_FUNC },
  5081. { "wqkv", OFFLOAD_FUNC_KQV },
  5082. { "bqkv", OFFLOAD_FUNC_KQV },
  5083. { "wqkv_clamped", OFFLOAD_FUNC_KQV },
  5084. { "tmpk", OFFLOAD_FUNC_KQV },
  5085. { "tmpq", OFFLOAD_FUNC_KQV },
  5086. { "tmpv", OFFLOAD_FUNC_KQV },
  5087. { "Kcur", OFFLOAD_FUNC_KQV },
  5088. { "Qcur", OFFLOAD_FUNC_KQV },
  5089. { "Vcur", OFFLOAD_FUNC_KQV },
  5090. { "krot", OFFLOAD_FUNC_KQV },
  5091. { "qrot", OFFLOAD_FUNC_KQV },
  5092. { "kpass", OFFLOAD_FUNC_KQV },
  5093. { "qpass", OFFLOAD_FUNC_KQV },
  5094. { "krotated", OFFLOAD_FUNC_KQV },
  5095. { "qrotated", OFFLOAD_FUNC_KQV },
  5096. { "q", OFFLOAD_FUNC_KQV },
  5097. { "k", OFFLOAD_FUNC_KQV },
  5098. { "kq", OFFLOAD_FUNC_KQV },
  5099. { "kq_scaled", OFFLOAD_FUNC_KQV },
  5100. { "kq_scaled_alibi", OFFLOAD_FUNC_KQV },
  5101. { "kq_masked", OFFLOAD_FUNC_KQV },
  5102. { "kq_soft_max", OFFLOAD_FUNC_KQV },
  5103. { "kq_soft_max_ext", OFFLOAD_FUNC_KQV },
  5104. { "v", OFFLOAD_FUNC_KQV },
  5105. { "kqv", OFFLOAD_FUNC_KQV },
  5106. { "kqv_merged", OFFLOAD_FUNC_KQV },
  5107. { "kqv_merged_cont", OFFLOAD_FUNC_KQV },
  5108. { "kqv_wo", OFFLOAD_FUNC_KQV },
  5109. { "kqv_out", OFFLOAD_FUNC_KQV },
  5110. { "ffn_inp", OFFLOAD_FUNC },
  5111. { "ffn_norm", OFFLOAD_FUNC },
  5112. { "ffn_up", OFFLOAD_FUNC },
  5113. { "ffn_up_b", OFFLOAD_FUNC },
  5114. { "ffn_gate", OFFLOAD_FUNC },
  5115. { "ffn_gate_b", OFFLOAD_FUNC },
  5116. { "ffn_gate_par", OFFLOAD_FUNC },
  5117. { "ffn_act", OFFLOAD_FUNC },
  5118. { "ffn_down", OFFLOAD_FUNC },
  5119. { "ffn_down_b", OFFLOAD_FUNC },
  5120. { "ffn_out", OFFLOAD_FUNC },
  5121. { "ffn_silu", OFFLOAD_FUNC },
  5122. { "ffn_gelu", OFFLOAD_FUNC },
  5123. { "ffn_relu", OFFLOAD_FUNC },
  5124. { "ffn_sqr(relu)", OFFLOAD_FUNC },
  5125. { "ffn_moe_logits", OFFLOAD_FUNC },
  5126. { "ffn_moe_probs", OFFLOAD_FUNC },
  5127. { "ffn_moe_argsort", OFFLOAD_FUNC },
  5128. { "ffn_moe_weights", OFFLOAD_FUNC },
  5129. { "ffn_moe_weights_sum", OFFLOAD_FUNC },
  5130. { "ffn_moe_weights_norm", OFFLOAD_FUNC },
  5131. { "ffn_moe_weighted", OFFLOAD_FUNC },
  5132. { "ffn_moe_up", OFFLOAD_FUNC },
  5133. { "ffn_moe_gate", OFFLOAD_FUNC },
  5134. { "ffn_moe_silu", OFFLOAD_FUNC },
  5135. { "ffn_moe_gate_par", OFFLOAD_FUNC },
  5136. { "ffn_moe_down", OFFLOAD_FUNC },
  5137. { "ffn_moe_out", OFFLOAD_FUNC },
  5138. { "l_out", OFFLOAD_FUNC },
  5139. { "result_norm", OFFLOAD_FUNC_EMB },
  5140. { "result_output_no_bias", OFFLOAD_FUNC_EMB },
  5141. { "result_output", OFFLOAD_FUNC_OUT },
  5142. };
  5143. static llm_offload_trie k_offload_func_trie(k_offload_map);
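// build the compute graph for a batch: dispatch to the architecture-specific build_* function above, while the
// callback below names every tensor, allocates and fills the input tensors (inp_tokens, inp_embd, inp_pos,
// KQ_mask, K_shift), and picks a per-tensor offload function based on its name.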
  5144. static struct ggml_cgraph * llama_build_graph(
  5145. llama_context & lctx,
  5146. const llama_batch & batch) {
  5147. const auto & model = lctx.model;
  5148. // check if we should build the worst-case graph (for memory measurement)
  5149. const bool worst_case = ggml_allocr_is_measure(lctx.alloc);
  5150. // keep track of the input that has already been allocated
  5151. bool alloc_inp_tokens = false;
  5152. bool alloc_inp_embd = false;
  5153. bool alloc_inp_pos = false;
  5154. bool alloc_inp_KQ_mask = false;
  5155. bool alloc_inp_K_shift = false;
  5156. #if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)
  5157. const bool do_offload = true;
  5158. #else
  5159. const bool do_offload = true; // TODO: set to false after finishing refactoring
  5160. #endif
  5161. int n_non_view = 0; // number of non-view tensors that have been processed by the callback
  5162. // this callback allows us to apply custom logic to each tensor (e.g. ggml-alloc, offloading, etc.)
  5163. // TODO: will be removed with backend v2
  5164. llm_build_cb cb = [&](struct ggml_tensor * cur, const char * name, int il) {
  5165. if (il >= 0) {
  5166. ggml_format_name(cur, "%s-%d", name, il);
  5167. } else {
  5168. ggml_set_name(cur, name);
  5169. }
  5170. //
  5171. // allocate input tensors and set input data
  5172. //
  5173. // TODO: will be removed with backend v2
  5174. if (!alloc_inp_tokens && strcmp(name, "inp_tokens") == 0) {
  5175. ggml_allocr_alloc(lctx.alloc, cur);
  5176. if (!ggml_allocr_is_measure(lctx.alloc) && batch.token) {
  5177. const int64_t n_tokens = cur->ne[0];
  5178. ggml_backend_tensor_set(cur, batch.token, 0, n_tokens*ggml_element_size(cur));
  5179. }
  5180. alloc_inp_tokens = true;
  5181. }
  5182. if (!alloc_inp_embd && strcmp(name, "inp_embd") == 0) {
  5183. ggml_allocr_alloc(lctx.alloc, cur);
  5184. if (!ggml_allocr_is_measure(lctx.alloc) && batch.embd) {
  5185. const int64_t n_embd = cur->ne[0];
  5186. const int64_t n_tokens = cur->ne[1];
  5187. ggml_backend_tensor_set(cur, batch.embd, 0, n_tokens*n_embd*ggml_element_size(cur));
  5188. }
  5189. alloc_inp_embd = true;
  5190. }
  5191. if (!alloc_inp_pos && strcmp(name, "inp_pos") == 0) {
  5192. ggml_allocr_alloc(lctx.alloc, cur);
  5193. if (!ggml_allocr_is_measure(lctx.alloc) && batch.pos) {
  5194. const int64_t n_tokens = cur->ne[0];
  5195. static_assert(std::is_same<llama_pos, int32_t>::value, "llama_pos must be int32_t");
  5196. ggml_backend_tensor_set(cur, batch.pos, 0, n_tokens*ggml_element_size(cur));
  5197. }
  5198. alloc_inp_pos = true;
  5199. }
  5200. if (!alloc_inp_KQ_mask && strcmp(name, "KQ_mask") == 0) {
  5201. ggml_allocr_alloc(lctx.alloc, cur);
  5202. if (!ggml_allocr_is_measure(lctx.alloc)) {
  5203. const int64_t n_kv = cur->ne[0];
  5204. const int64_t n_tokens = cur->ne[1];
  5205. float * data;
  5206. if (ggml_backend_buffer_is_host(cur->buffer)) {
  5207. data = (float *) cur->data;
  5208. } else {
  5209. lctx.buf_copy.resize(ggml_nbytes(cur));
  5210. data = (float *) lctx.buf_copy.data();
  5211. }
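// fill the causal attention mask: entry (j, i) is 0.0f when KV cell i belongs to the sequence of token j and
// its position does not exceed the token's position, and -INFINITY otherwise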
  5212. for (int h = 0; h < 1; ++h) {
  5213. for (int j = 0; j < n_tokens; ++j) {
  5214. const llama_pos pos = batch.pos[j];
  5215. const llama_seq_id seq_id = batch.seq_id[j][0];
  5216. for (int i = 0; i < n_kv; ++i) {
  5217. float f;
  5218. if (!lctx.kv_self.cells[i].has_seq_id(seq_id) || lctx.kv_self.cells[i].pos > pos) {
  5219. f = -INFINITY;
  5220. } else {
  5221. f = 0;
  5222. }
  5223. data[h*(n_kv*n_tokens) + j*n_kv + i] = f;
  5224. }
  5225. }
  5226. }
  5227. if (data != cur->data) {
  5228. ggml_backend_tensor_set(cur, data, 0, ggml_nbytes(cur));
  5229. }
  5230. }
  5231. alloc_inp_KQ_mask = true;
  5232. }
  5233. if (!alloc_inp_K_shift && strcmp(name, "K_shift") == 0) {
  5234. ggml_allocr_alloc(lctx.alloc, cur);
  5235. if (!ggml_allocr_is_measure(lctx.alloc)) {
  5236. const int64_t n_ctx = cur->ne[0];
  5237. int32_t * data;
  5238. if (ggml_backend_buffer_is_host(cur->buffer)) {
  5239. data = (int32_t *) cur->data;
  5240. } else {
  5241. lctx.buf_copy.resize(ggml_nbytes(cur));
  5242. data = (int32_t *) lctx.buf_copy.data();
  5243. }
  5244. for (int i = 0; i < n_ctx; ++i) {
  5245. data[i] = lctx.kv_self.cells[i].delta;
  5246. }
  5247. if (data != cur->data) {
  5248. ggml_backend_tensor_set(cur, data, 0, ggml_nbytes(cur));
  5249. }
  5250. }
  5251. alloc_inp_K_shift = true;
  5252. }
  5253. // view tensors are not processed further
  5254. if (cur->view_src != nullptr) {
  5255. return;
  5256. }
  5257. if (cur->op != GGML_OP_NONE) {
  5258. n_non_view++;
  5259. }
  5260. //
  5261. // offload layers
  5262. //
  5263. // TODO: will be removed with backend v2
  5264. //#define LLAMA_OFFLOAD_DEBUG
  5265. if (!do_offload) {
  5266. return;
  5267. }
  5268. const int n_layer = model.hparams.n_layer;
  5269. const int n_gpu_layers = model.n_gpu_layers;
  5270. const int i_gpu_start = n_layer - n_gpu_layers;
  5271. // should we offload the final norm? yes if we are not computing embeddings
  5272. const bool offload_emb = lctx.embedding.empty();
  5273. static const std::unordered_map<llm_offload_func_e, std::string, std::hash<int>> k_offload_func_name = {
  5274. { OFFLOAD_FUNC_NOP, "CPU" },
  5275. { OFFLOAD_FUNC_OUT, "CPU" },
  5276. #if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)
  5277. { OFFLOAD_FUNC, "GPU (CUDA)" },
  5278. { OFFLOAD_FUNC_FRC, "GPU (CUDA) FRC" },
  5279. { OFFLOAD_FUNC_KQV, "GPU (CUDA) KQV" },
  5280. { OFFLOAD_FUNC_NR, "GPU (CUDA) NR" },
  5281. { OFFLOAD_FUNC_EMB, "GPU (CUDA) EMB" },
  5282. #else
  5283. { OFFLOAD_FUNC, "CPU" },
  5284. { OFFLOAD_FUNC_FRC, "CPU" },
  5285. { OFFLOAD_FUNC_KQV, "CPU" },
  5286. { OFFLOAD_FUNC_NR, "CPU" },
  5287. { OFFLOAD_FUNC_EMB, "CPU" },
  5288. #endif // GGML_USE_CUBLAS
  5289. };
  5290. // check the global map for what offload function to use for this tensor
  5291. llm_offload_func_e func_e = k_offload_func_trie.find(name);
  5292. if (func_e == OFFLOAD_FUNC_NOP) {
  5293. #ifdef LLAMA_OFFLOAD_DEBUG
  5294. // if a tensor hasn't been offloaded, we warn the user
  5295. if (worst_case) {
  5296. LLAMA_LOG_WARN("%s: %32s: not offloaded (ref: %s)\n", __func__,
  5297. cur->name, "https://github.com/ggerganov/llama.cpp/pull/3837");
  5298. }
  5299. #endif
  5300. return;
  5301. }
  5302. // count the number of layers and respect the provided n_gpu_layers
  5303. switch (func_e) {
  5304. case OFFLOAD_FUNC_NOP:
  5305. case OFFLOAD_FUNC_OUT:
  5306. break;
  5307. case OFFLOAD_FUNC:
  5308. if (n_gpu_layers < n_layer) {
  5309. if (il < i_gpu_start) {
  5310. func_e = OFFLOAD_FUNC_NOP;
  5311. }
  5312. }
  5313. break;
  5314. case OFFLOAD_FUNC_FRC:
  5315. if (!lctx.cparams.offload_kqv) {
  5316. func_e = OFFLOAD_FUNC_NOP;
  5317. } break;
  5318. case OFFLOAD_FUNC_KQV:
  5319. if (!lctx.cparams.offload_kqv) {
  5320. func_e = OFFLOAD_FUNC_NOP;
  5321. } else {
  5322. if (n_gpu_layers < n_layer) {
  5323. if (il < i_gpu_start) {
  5324. func_e = OFFLOAD_FUNC_NOP;
  5325. }
  5326. }
  5327. }
  5328. break;
  5329. case OFFLOAD_FUNC_NR:
  5330. if (n_gpu_layers <= n_layer + 0) {
  5331. func_e = OFFLOAD_FUNC_NOP;
  5332. }
  5333. break;
  5334. case OFFLOAD_FUNC_EMB:
  5335. if (!offload_emb || n_gpu_layers < n_layer) {
  5336. func_e = OFFLOAD_FUNC_NOP;
  5337. }
  5338. break;
  5339. default: GGML_ASSERT(false);
  5340. }
  5341. offload_func_t func = ggml_offload_nop;
  5342. // this is needed for compatibility with Metal for example
  5343. #if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)
  5344. static offload_func_t ggml_offload_gpu = ggml_cuda_assign_buffers_no_alloc;
  5345. #else
  5346. static offload_func_t ggml_offload_gpu = ggml_offload_nop;
  5347. #endif
  5348. switch (func_e) {
  5349. case OFFLOAD_FUNC_NOP:
  5350. case OFFLOAD_FUNC_OUT: func = ggml_offload_nop; break;
  5351. case OFFLOAD_FUNC:
  5352. case OFFLOAD_FUNC_KQV:
  5353. case OFFLOAD_FUNC_FRC:
  5354. case OFFLOAD_FUNC_NR:
  5355. case OFFLOAD_FUNC_EMB: func = ggml_offload_gpu; break;
  5356. default: GGML_ASSERT(false);
  5357. }
  5358. // apply offload function to the tensor
  5359. func(cur);
  5360. #ifdef LLAMA_OFFLOAD_DEBUG
  5361. if (worst_case) {
  5362. LLAMA_LOG_INFO("%s: %32s: %s\n", __func__, cur->name, k_offload_func_name.at(func_e).c_str());
  5363. }
  5364. #endif
  5365. };
  5366. struct ggml_cgraph * result = NULL;
  5367. struct llm_build_context llm(lctx, batch, cb, worst_case);
  5368. llm.init();
  5369. switch (model.arch) {
  5370. case LLM_ARCH_LLAMA:
  5371. {
  5372. result = llm.build_llama();
  5373. } break;
  5374. case LLM_ARCH_BAICHUAN:
  5375. {
  5376. result = llm.build_baichuan();
  5377. } break;
  5378. case LLM_ARCH_FALCON:
  5379. {
  5380. result = llm.build_falcon();
  5381. } break;
  5382. case LLM_ARCH_STARCODER:
  5383. {
  5384. result = llm.build_starcoder();
  5385. } break;
  5386. case LLM_ARCH_PERSIMMON:
  5387. {
  5388. result = llm.build_persimmon();
  5389. } break;
  5390. case LLM_ARCH_REFACT:
  5391. {
  5392. result = llm.build_refact();
  5393. } break;
  5394. case LLM_ARCH_BLOOM:
  5395. {
  5396. result = llm.build_bloom();
  5397. } break;
  5398. case LLM_ARCH_MPT:
  5399. {
  5400. result = llm.build_mpt();
  5401. } break;
  5402. case LLM_ARCH_STABLELM:
  5403. {
  5404. result = llm.build_stablelm();
  5405. } break;
  5406. case LLM_ARCH_QWEN:
  5407. {
  5408. result = llm.build_qwen();
  5409. } break;
  5410. case LLM_ARCH_PHI2:
  5411. {
  5412. result = llm.build_phi2();
  5413. } break;
  5414. case LLM_ARCH_PLAMO:
  5415. {
  5416. result = llm.build_plamo();
  5417. } break;
  5418. case LLM_ARCH_GPT2:
  5419. {
  5420. result = llm.build_gpt2();
  5421. } break;
  5422. default:
  5423. GGML_ASSERT(false);
  5424. }
  5425. llm.free();
  5426. if (worst_case) {
  5427. int n_non_view_total = 0;
  5428. for (int i = 0; i < result->n_nodes; ++i) {
  5429. if (result->nodes[i]->view_src == nullptr) {
  5430. n_non_view_total++;
  5431. }
  5432. }
  5433. LLAMA_LOG_INFO("%s: non-view tensors processed: %d/%d\n", __func__, n_non_view, n_non_view_total);
  5434. if (n_non_view != n_non_view_total) {
  5435. LLAMA_LOG_WARN("%s: ****************************************************************\n", __func__);
  5436. LLAMA_LOG_WARN("%s: not all non-view tensors have been processed with a callback\n", __func__);
  5437. LLAMA_LOG_WARN("%s: this can indicate an inefficiency in the graph implementation\n", __func__);
  5438. LLAMA_LOG_WARN("%s: build with LLAMA_OFFLOAD_DEBUG for more info\n", __func__);
  5439. LLAMA_LOG_WARN("%s: ref: https://github.com/ggerganov/llama.cpp/pull/3837\n", __func__);
  5440. LLAMA_LOG_WARN("%s: ****************************************************************\n", __func__);
  5441. }
  5442. }
  5443. return result;
  5444. }
  5445. // decode a batch of tokens by evaluating the transformer
  5446. //
  5447. // - lctx: llama context
  5448. // - batch: batch to evaluate
  5449. //
  5450. // return 0 on success
  5451. // return positive int on warning
  5452. // return negative int on error
  5453. //
  5454. static int llama_decode_internal(
  5455. llama_context & lctx,
  5456. llama_batch batch) {
  5457. const uint32_t n_tokens = batch.n_tokens;
  5458. if (n_tokens == 0) {
5459. LLAMA_LOG_ERROR("%s: n_tokens == 0\n", __func__);
  5460. return -1;
  5461. }
  5462. const auto & model = lctx.model;
  5463. const auto & hparams = model.hparams;
  5464. const auto & cparams = lctx.cparams;
  5465. const auto n_batch = cparams.n_batch;
  5466. GGML_ASSERT(n_tokens <= n_batch);
  5467. int n_threads = n_tokens == 1 ? cparams.n_threads : cparams.n_threads_batch;
  5468. GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT
  5469. const int64_t t_start_us = ggml_time_us();
  5470. #ifdef GGML_USE_MPI
  5471. // TODO: needs fix after #3228
  5472. GGML_ASSERT(false && "not implemented");
  5473. //ggml_mpi_eval_init(lctx.ctx_mpi, &n_tokens, &n_past, &n_threads);
  5474. #endif
  5475. GGML_ASSERT(n_threads > 0);
  5476. auto & kv_self = lctx.kv_self;
  5477. GGML_ASSERT(!!kv_self.ctx);
  5478. const int64_t n_embd = hparams.n_embd;
  5479. const int64_t n_vocab = hparams.n_vocab;
  5480. // helpers for smoother batch API transition
  5481. // after deprecating the llama_eval calls, these will be removed
  5482. std::vector<llama_pos> pos;
  5483. std::vector<int32_t> n_seq_id;
  5484. std::vector<llama_seq_id *> seq_id_arr;
  5485. std::vector<std::vector<llama_seq_id>> seq_id;
  5486. if (batch.pos == nullptr) {
  5487. pos.resize(n_tokens);
  5488. for (uint32_t i = 0; i < n_tokens; i++) {
  5489. pos[i] = batch.all_pos_0 + i*batch.all_pos_1;
  5490. }
  5491. batch.pos = pos.data();
  5492. }
  5493. if (batch.seq_id == nullptr) {
  5494. n_seq_id.resize(n_tokens);
  5495. seq_id.resize(n_tokens);
  5496. seq_id_arr.resize(n_tokens);
  5497. for (uint32_t i = 0; i < n_tokens; i++) {
  5498. n_seq_id[i] = 1;
  5499. seq_id[i].resize(1);
  5500. seq_id[i][0] = batch.all_seq_id;
  5501. seq_id_arr[i] = seq_id[i].data();
  5502. }
  5503. batch.n_seq_id = n_seq_id.data();
  5504. batch.seq_id = seq_id_arr.data();
  5505. }
  5506. // if we have enough unused cells before the current head ->
  5507. // better to start searching from the beginning of the cache, hoping to fill it
  5508. if (kv_self.head > kv_self.used + 2*n_tokens) {
  5509. kv_self.head = 0;
  5510. }
  5511. if (!llama_kv_cache_find_slot(kv_self, batch)) {
  5512. return 1;
  5513. }
  5514. // a heuristic, to avoid attending the full cache if it is not yet utilized
  5515. // after enough generations, the benefit from this heuristic disappears
  5516. // if we start defragmenting the cache, the benefit from this will be more important
  5517. kv_self.n = std::min((int32_t) cparams.n_ctx, std::max(32, GGML_PAD(llama_kv_cache_cell_max(kv_self), 32)));
  5518. //kv_self.n = llama_kv_cache_cell_max(kv_self);
  5519. //printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self.n, kv_self.used, kv_self.head);
  5520. ggml_allocr_reset(lctx.alloc);
  5521. ggml_cgraph * gf = llama_build_graph(lctx, batch);
  5522. ggml_allocr_alloc_graph(lctx.alloc, gf);
  5523. // the output is always the last tensor in the graph
  5524. struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
  5525. GGML_ASSERT(strcmp(res->name, "result_output") == 0);
  5526. // the embeddings could be the second to last tensor, or the third to last tensor
  5527. struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 2];
  5528. if (strcmp(embeddings->name, "result_norm") != 0) {
  5529. embeddings = gf->nodes[gf->n_nodes - 3];
  5530. GGML_ASSERT(strcmp(embeddings->name, "result_norm") == 0);
  5531. }
  5532. #if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)
  5533. char * buf_alloc_base = (char *)ggml_backend_buffer_get_base(lctx.buf_alloc);
  5534. for (int i = 0; i < gf->n_leafs; i++) {
  5535. ggml_tensor * node = gf->leafs[i];
  5536. if (node->backend == GGML_BACKEND_GPU && node->extra == NULL) {
  5537. ggml_cuda_assign_scratch_offset(node, (char *)node->data - buf_alloc_base);
  5538. ggml_cuda_copy_to_device(node);
  5539. }
  5540. }
  5541. for (int i = 0; i < gf->n_nodes; i++) {
  5542. ggml_tensor * node = gf->nodes[i];
  5543. if (node->backend == GGML_BACKEND_GPU && node->extra == NULL) {
  5544. ggml_cuda_assign_scratch_offset(node, (char *)node->data - buf_alloc_base);
  5545. }
  5546. }
  5547. // HACK: ggml-alloc may change the tensor backend when reusing a parent, so force output to be on the CPU here if needed
  5548. if (!lctx.embedding.empty()) {
  5549. embeddings->backend = GGML_BACKEND_CPU;
  5550. }
  5551. res->backend = GGML_BACKEND_CPU;
  5552. #endif
  5553. // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
5554. // for big prompts, if BLAS is enabled, it is better to use only a few threads
5555. // otherwise, the threads spin-lock waiting for the BLAS calls and degrade performance
5556. // TODO: this is mostly important for Apple Silicon where CBLAS still performs very well
5557. // we still need some threads to process all the non-mul_mat ops, but not too many, to avoid interfering
5558. // with the BLAS calls. need a better solution
  5559. if (n_tokens >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas()) {
  5560. n_threads = std::min(4, n_threads);
  5561. }
  5562. const bool fully_offloaded = model.n_gpu_layers >= (int) hparams.n_layer + 1;
  5563. if (ggml_cpu_has_cublas() && fully_offloaded) {
  5564. n_threads = 1;
  5565. }
  5566. #ifdef GGML_USE_MPI
  5567. const int64_t n_layer = hparams.n_layer;
  5568. ggml_mpi_graph_compute_pre(lctx.ctx_mpi, gf, n_layer);
  5569. #endif
  5570. #ifdef GGML_USE_METAL
  5571. if (ggml_backend_is_metal(lctx.backend)) {
  5572. ggml_backend_metal_set_n_cb(lctx.backend, n_threads);
  5573. }
  5574. #endif
  5575. if (ggml_backend_is_cpu(lctx.backend)) {
  5576. ggml_backend_cpu_set_n_threads(lctx.backend, n_threads);
  5577. }
  5578. ggml_backend_graph_compute(lctx.backend, gf);
  5579. #ifdef GGML_USE_MPI
  5580. ggml_mpi_graph_compute_post(lctx.ctx_mpi, gf, n_layer);
  5581. #endif
  5582. // update the kv ring buffer
  5583. {
  5584. if (kv_self.has_shift) {
  5585. kv_self.has_shift = false;
  5586. for (uint32_t i = 0; i < kv_self.size; ++i) {
  5587. kv_self.cells[i].delta = 0;
  5588. }
  5589. }
  5590. kv_self.head += n_tokens;
  5591. // Ensure kv cache head points to a valid index.
  5592. if (kv_self.head >= kv_self.size) {
  5593. kv_self.head = 0;
  5594. }
  5595. }
  5596. #ifdef GGML_PERF
  5597. // print timing information per ggml operation (for debugging purposes)
  5598. // requires GGML_PERF to be defined
  5599. ggml_graph_print(gf);
  5600. #endif
  5601. // plot the computation graph in dot format (for debugging purposes)
  5602. //if (n_past%100 == 0) {
  5603. // ggml_graph_dump_dot(gf, NULL, "llama.dot");
  5604. //}
  5605. // extract logits
  5606. // TODO: do not compute and extract logits if only embeddings are needed
  5607. // need to update the graphs to skip "result_output"
  5608. {
  5609. auto & logits_out = lctx.logits;
  5610. #ifndef NDEBUG
  5611. auto & logits_valid = lctx.logits_valid;
  5612. logits_valid.clear();
  5613. logits_valid.resize(n_tokens);
  5614. logits_out.clear();
  5615. #endif
  5616. if (batch.logits) {
  5617. logits_out.resize(n_vocab * n_tokens);
  5618. for (uint32_t i = 0; i < n_tokens; i++) {
  5619. if (batch.logits[i] == 0) {
  5620. continue;
  5621. }
  5622. ggml_backend_tensor_get(res, logits_out.data() + (n_vocab*i), (n_vocab*i)*sizeof(float), n_vocab*sizeof(float));
  5623. #ifndef NDEBUG
  5624. logits_valid[i] = true;
  5625. #endif
  5626. }
  5627. } else if (lctx.logits_all) {
  5628. logits_out.resize(n_vocab * n_tokens);
  5629. ggml_backend_tensor_get(res, logits_out.data(), 0, n_vocab*n_tokens*sizeof(float));
  5630. #ifndef NDEBUG
  5631. std::fill(logits_valid.begin(), logits_valid.end(), true);
  5632. #endif
  5633. } else {
  5634. logits_out.resize(n_vocab);
  5635. ggml_backend_tensor_get(res, logits_out.data(), (n_vocab*(n_tokens - 1))*sizeof(float), n_vocab*sizeof(float));
  5636. #ifndef NDEBUG
  5637. logits_valid[0] = true;
  5638. #endif
  5639. }
  5640. }
  5641. // extract embeddings
  5642. if (!lctx.embedding.empty()) {
  5643. auto & embedding_out = lctx.embedding;
  5644. embedding_out.resize(n_embd);
  5645. ggml_backend_tensor_get(embeddings, embedding_out.data(), (n_embd*(n_tokens - 1))*sizeof(float), n_embd*sizeof(float));
  5646. }
  5647. // measure the performance only for the single-token evals
  5648. if (n_tokens == 1) {
  5649. lctx.t_eval_us += ggml_time_us() - t_start_us;
  5650. lctx.n_eval++;
  5651. }
  5652. else if (n_tokens > 1) {
  5653. lctx.t_p_eval_us += ggml_time_us() - t_start_us;
  5654. lctx.n_p_eval += n_tokens;
  5655. }
  5656. // get a more accurate load time, upon first eval
  5657. // TODO: fix this
  5658. if (!lctx.has_evaluated_once) {
  5659. lctx.t_load_us = ggml_time_us() - lctx.t_start_us;
  5660. lctx.has_evaluated_once = true;
  5661. }
  5662. return 0;
  5663. }
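// A minimal caller-side sketch (illustrative only, not part of the build) of how the return-code
// convention documented above is typically consumed through the public llama_decode() wrapper:
// 0 means success, a positive value is a recoverable warning (here: no KV cache slot was found,
// so the caller may retry with a smaller batch), and a negative value is an error.
#if 0
static bool example_decode_prompt(struct llama_context * ctx, std::vector<llama_token> & tokens) {
    // evaluate the whole prompt as a single batch starting at position 0, sequence 0
    llama_batch batch = llama_batch_get_one(tokens.data(), (int32_t) tokens.size(), 0, 0);

    const int ret = llama_decode(ctx, batch);
    if (ret < 0) {
        LLAMA_LOG_ERROR("%s: llama_decode failed with %d\n", __func__, ret);
        return false;
    }
    if (ret > 0) {
        LLAMA_LOG_WARN("%s: llama_decode returned %d - no KV cache slot found, try a smaller batch\n", __func__, ret);
        return false;
    }
    return true;
}
#endif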
  5664. //
  5665. // tokenizer
  5666. //
  5667. static enum llama_vocab_type llama_vocab_get_type(const llama_vocab & vocab) {
  5668. return vocab.type;
  5669. }
  5670. static bool llama_is_normal_token(const llama_vocab & vocab, llama_token id) {
  5671. return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_NORMAL;
  5672. }
  5673. static bool llama_is_unknown_token(const llama_vocab & vocab, llama_token id) {
  5674. return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_UNKNOWN;
  5675. }
  5676. static bool llama_is_control_token(const llama_vocab & vocab, llama_token id) {
  5677. return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_CONTROL;
  5678. }
  5679. static bool llama_is_byte_token(const llama_vocab & vocab, llama_token id) {
  5680. return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_BYTE;
  5681. }
  5682. static bool llama_is_user_defined_token(const llama_vocab& vocab, llama_token id) {
  5683. return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_USER_DEFINED;
  5684. }
  5685. static uint8_t llama_token_to_byte(const llama_vocab& vocab, llama_token id) {
  5686. GGML_ASSERT(llama_is_byte_token(vocab, id));
  5687. const auto& token_data = vocab.id_to_token.at(id);
  5688. switch (llama_vocab_get_type(vocab)) {
  5689. case LLAMA_VOCAB_TYPE_SPM: {
  5690. auto buf = token_data.text.substr(3, 2);
  5691. return strtol(buf.c_str(), NULL, 16);
  5692. }
  5693. case LLAMA_VOCAB_TYPE_BPE: {
  5694. GGML_ASSERT(false);
  5695. return unicode_to_bytes_bpe(token_data.text);
  5696. }
  5697. default:
  5698. GGML_ASSERT(false);
  5699. }
  5700. }
  5701. static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch) {
  5702. static const char * hex = "0123456789ABCDEF";
  5703. switch (llama_vocab_get_type(vocab)) {
  5704. case LLAMA_VOCAB_TYPE_SPM: {
  5705. const char buf[7] = { '<', '0', 'x', hex[ch >> 4], hex[ch & 15], '>', 0 };
  5706. return vocab.token_to_id.at(buf);
  5707. }
  5708. case LLAMA_VOCAB_TYPE_BPE: {
  5709. return vocab.token_to_id.at(bytes_to_unicode_bpe(ch));
  5710. }
  5711. default:
  5712. GGML_ASSERT(false);
  5713. }
  5714. }
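// Illustrative sketch (not compiled): for SPM vocabularies the byte-fallback tokens are stored as
// literal strings of the form "<0xXX>", so llama_byte_to_token() and llama_token_to_byte() are
// inverses of each other, assuming the byte-fallback token is present in the vocab.
#if 0
static void example_byte_token_roundtrip(const llama_vocab & vocab) {
    const uint8_t ch = 0x41; // 'A'
    const llama_token id = llama_byte_to_token(vocab, ch); // SPM: looks up the token "<0x41>"
    GGML_ASSERT(llama_token_to_byte(vocab, id) == ch);
}
#endif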
  5715. static void llama_escape_whitespace(std::string & text) {
  5716. replace_all(text, " ", "\xe2\x96\x81");
  5717. }
  5718. static void llama_unescape_whitespace(std::string & word) {
  5719. replace_all(word, "\xe2\x96\x81", " ");
  5720. }
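// Illustrative note (not compiled): SentencePiece represents spaces with U+2581 (LOWER ONE EIGHTH
// BLOCK, UTF-8 bytes 0xE2 0x96 0x81), so escaping and unescaping are plain replace_all calls.
#if 0
static void example_whitespace_escape() {
    std::string s = "Hello world";
    llama_escape_whitespace(s);   // "Hello\xe2\x96\x81world"
    llama_unescape_whitespace(s); // back to "Hello world"
}
#endif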
  5721. struct llm_symbol {
  5722. using index = int;
  5723. index prev;
  5724. index next;
  5725. const char * text;
  5726. size_t n;
  5727. };
  5728. static_assert(std::is_trivially_copyable<llm_symbol>::value, "llm_symbol is not trivially copyable");
  5729. // SPM tokenizer
  5730. // original implementation:
  5731. // https://github.com/ggerganov/llama.cpp/commit/074bea2eb1f1349a0118239c4152914aecaa1be4
  5732. struct llm_bigram_spm {
  5733. struct comparator {
  5734. bool operator()(llm_bigram_spm & l, llm_bigram_spm & r) {
  5735. return (l.score < r.score) || (l.score == r.score && l.left > r.left);
  5736. }
  5737. };
  5738. using queue_storage = std::vector<llm_bigram_spm>;
  5739. using queue = std::priority_queue<llm_bigram_spm, queue_storage, comparator>;
  5740. llm_symbol::index left;
  5741. llm_symbol::index right;
  5742. float score;
  5743. size_t size;
  5744. };
  5745. struct llm_tokenizer_spm {
  5746. llm_tokenizer_spm(const llama_vocab & vocab): vocab(vocab) {}
  5747. void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
  5748. // split string into utf8 chars
  5749. int index = 0;
  5750. size_t offs = 0;
  5751. while (offs < text.size()) {
  5752. llm_symbol sym;
  5753. size_t len = utf8_len(text[offs]);
  5754. sym.text = text.c_str() + offs;
  5755. sym.n = std::min(len, text.size() - offs);
  5756. offs += sym.n;
  5757. sym.prev = index - 1;
  5758. sym.next = offs == text.size() ? -1 : index + 1;
  5759. index++;
  5760. symbols.emplace_back(sym);
  5761. }
  5762. // seed the work queue with all possible 2-character tokens.
  5763. for (size_t i = 1; i < symbols.size(); ++i) {
  5764. try_add_bigram(i - 1, i);
  5765. }
5766. // keep merging the highest scoring pairs for as long as we can.
  5767. while (!work_queue.empty()) {
  5768. auto bigram = work_queue.top();
  5769. work_queue.pop();
  5770. auto & left_sym = symbols[bigram.left];
  5771. auto & right_sym = symbols[bigram.right];
  5772. // if one of the symbols already got merged, skip it.
  5773. if (left_sym.n == 0 || right_sym.n == 0 ||
  5774. left_sym.n + right_sym.n != bigram.size) {
  5775. continue;
  5776. }
  5777. // merge the right sym into the left one
  5778. left_sym.n += right_sym.n;
  5779. right_sym.n = 0;
  5780. //LLAMA_LOG_INFO("left = '%*s' size = %zu\n", (int) left_sym.n, left_sym.text, bigram.size);
  5781. // remove the right sym from the chain
  5782. left_sym.next = right_sym.next;
  5783. if (right_sym.next >= 0) {
  5784. symbols[right_sym.next].prev = bigram.left;
  5785. }
  5786. // find more substitutions
  5787. try_add_bigram(left_sym.prev, bigram.left);
  5788. try_add_bigram(bigram.left, left_sym.next);
  5789. }
  5790. for (int i = 0; i != -1; i = symbols[i].next) {
  5791. auto & symbol = symbols[i];
  5792. resegment(symbol, output);
  5793. }
  5794. }
  5795. private:
  5796. void resegment(llm_symbol & symbol, std::vector<llama_vocab::id> & output) {
  5797. auto text = std::string(symbol.text, symbol.n);
  5798. auto token = vocab.token_to_id.find(text);
  5799. // Do we need to support is_unused?
  5800. if (token != vocab.token_to_id.end()) {
  5801. output.push_back((*token).second);
  5802. return;
  5803. }
  5804. const auto p = rev_merge.find(text);
  5805. if (p == rev_merge.end()) {
  5806. // output any symbols that did not form tokens as bytes.
  5807. for (int j = 0; j < (int)symbol.n; ++j) {
  5808. llama_vocab::id token_id = llama_byte_to_token(vocab, symbol.text[j]);
  5809. output.push_back(token_id);
  5810. }
  5811. return;
  5812. }
  5813. resegment(symbols[p->second.first], output);
  5814. resegment(symbols[p->second.second], output);
  5815. }
  5816. void try_add_bigram(int left, int right) {
  5817. if (left == -1 || right == -1) {
  5818. return;
  5819. }
  5820. const std::string text = std::string(symbols[left].text, symbols[left].n + symbols[right].n);
  5821. auto token = vocab.token_to_id.find(text);
  5822. if (token == vocab.token_to_id.end()) {
  5823. return;
  5824. }
  5825. if (static_cast<size_t>((*token).second) >= vocab.id_to_token.size()) {
  5826. return;
  5827. }
  5828. const auto & tok_data = vocab.id_to_token[(*token).second];
  5829. llm_bigram_spm bigram;
  5830. bigram.left = left;
  5831. bigram.right = right;
  5832. bigram.score = tok_data.score;
  5833. bigram.size = text.size();
  5834. work_queue.push(bigram);
  5835. // Do we need to support is_unused?
  5836. rev_merge[text] = std::make_pair(left, right);
  5837. }
  5838. const llama_vocab & vocab;
  5839. std::vector<llm_symbol> symbols;
  5840. llm_bigram_spm::queue work_queue;
  5841. std::map<std::string, std::pair<int, int>> rev_merge;
  5842. };
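// Minimal usage sketch (not compiled): the SPM tokenizer expects whitespace to already be escaped
// to U+2581, which is what llama_tokenize_internal() further below does before calling it.
#if 0
static std::vector<llama_vocab::id> example_spm_tokenize(const llama_vocab & vocab, std::string text) {
    std::vector<llama_vocab::id> output;
    llama_escape_whitespace(text);       // " " -> "\xe2\x96\x81"
    llm_tokenizer_spm tokenizer(vocab);
    tokenizer.tokenize(text, output);    // greedy bigram merges driven by token scores
    return output;
}
#endif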
  5843. // BPE tokenizer
  5844. // adapted from https://github.com/cmp-nct/ggllm.cpp [MIT License]
  5845. // tried to simplify unicode stuff, so most likely does not work 100% correctly!
  5846. // TODO: there are a lot of common parts between spm and bpe tokenizers, should be refactored and reused
  5847. struct llm_bigram_bpe {
  5848. struct comparator {
  5849. bool operator()(const llm_bigram_bpe & l, const llm_bigram_bpe & r) const {
  5850. return l.rank > r.rank || (l.rank == r.rank && l.left > r.left);
  5851. }
  5852. };
  5853. using queue_storage = std::vector<llm_bigram_bpe>;
  5854. using queue = std::priority_queue<llm_bigram_bpe, queue_storage, comparator>;
  5855. llm_symbol::index left;
  5856. llm_symbol::index right;
  5857. std::string text;
  5858. int rank;
  5859. size_t size;
  5860. };
  5861. struct llm_tokenizer_bpe {
  5862. llm_tokenizer_bpe(const llama_vocab & vocab): vocab(vocab) {}
  5863. void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
  5864. int final_prev_index = -1;
  5865. auto word_collection = bpe_gpt2_preprocess(text);
  5866. symbols_final.clear();
  5867. for (auto & word : word_collection) {
  5868. work_queue = llm_bigram_bpe::queue();
  5869. symbols.clear();
  5870. int index = 0;
  5871. size_t offset = 0;
  5872. while (offset < word.size()) {
  5873. llm_symbol sym;
  5874. size_t char_len = std::min(word.size() - offset, (size_t) ::utf8_len(word[offset]));
  5875. sym.text = word.c_str() + offset;
  5876. sym.n = char_len;
  5877. offset += sym.n;
  5878. sym.prev = index - 1;
  5879. sym.next = offset == word.size() ? -1 : index + 1;
  5880. index++;
  5881. symbols.emplace_back(sym);
  5882. }
  5883. for (size_t i = 1; i < symbols.size(); ++i) {
  5884. add_new_bigram(i - 1, i);
  5885. }
  5886. // build token(s)
  5887. while (!work_queue.empty()) {
  5888. auto bigram = work_queue.top();
  5889. work_queue.pop();
  5890. auto & left_symbol = symbols[bigram.left];
  5891. auto & right_symbol = symbols[bigram.right];
  5892. if (left_symbol.n == 0 || right_symbol.n == 0) {
  5893. continue;
  5894. }
  5895. std::string left_token = std::string(left_symbol.text, left_symbol.n);
  5896. std::string right_token = std::string(right_symbol.text, right_symbol.n);
  5897. if (left_token + right_token != bigram.text) {
  5898. continue; // Skip this bigram if it's outdated
  5899. }
  5900. // merge the right sym into the left one
  5901. left_symbol.n += right_symbol.n;
  5902. right_symbol.n = 0;
  5903. // remove the right sym from the chain
  5904. left_symbol.next = right_symbol.next;
  5905. if (right_symbol.next >= 0) {
  5906. symbols[right_symbol.next].prev = bigram.left;
  5907. }
  5908. add_new_bigram(left_symbol.prev, bigram.left); // left side of current symbol
  5909. add_new_bigram(bigram.left, left_symbol.next); // right side of current symbol
  5910. }
5911. // add the finished tokens to the final list, keeping the correct order of next and prev
  5912. for (auto & sym : symbols) {
  5913. if (sym.n > 0) {
  5914. sym.prev = final_prev_index;
  5915. sym.next = -1;
  5916. if (final_prev_index != -1) {
  5917. symbols_final[final_prev_index].next = symbols_final.size();
  5918. }
  5919. symbols_final.emplace_back(sym);
  5920. final_prev_index = symbols_final.size() - 1;
  5921. }
  5922. }
  5923. }
  5924. symbols = symbols_final;
  5925. if (!symbols.empty()) {
  5926. for (int i = 0; i != -1; i = symbols[i].next) {
  5927. auto & symbol = symbols[i];
  5928. if (symbol.n == 0) {
  5929. continue;
  5930. }
  5931. const std::string str = std::string(symbol.text, symbol.n);
  5932. const auto token = vocab.token_to_id.find(str);
  5933. if (token == vocab.token_to_id.end()) {
  5934. for (auto j = str.begin(); j != str.end(); ++j) {
  5935. std::string byte_str(1, *j);
  5936. auto token_multibyte = vocab.token_to_id.find(byte_str);
  5937. if (token_multibyte == vocab.token_to_id.end()) {
  5938. throw std::runtime_error("ERROR: byte not found in vocab");
  5939. }
  5940. output.push_back((*token_multibyte).second);
  5941. }
  5942. } else {
  5943. output.push_back((*token).second);
  5944. }
  5945. }
  5946. }
  5947. }
  5948. private:
  5949. void add_new_bigram(int left, int right) {
  5950. if (left == -1 || right == -1) {
  5951. return;
  5952. }
  5953. std::string left_token = std::string(symbols[left].text, symbols[left].n);
  5954. std::string right_token = std::string(symbols[right].text, symbols[right].n);
  5955. int rank_found = -1;
  5956. rank_found = vocab.find_bpe_rank(left_token, right_token);
  5957. if (rank_found < 0) {
  5958. return;
  5959. }
  5960. llm_bigram_bpe bigram;
  5961. bigram.left = left;
  5962. bigram.right = right;
  5963. bigram.text = left_token + right_token;
  5964. bigram.size = left_token.size() + right_token.size();
  5965. bigram.rank = rank_found;
  5966. work_queue.push(bigram);
  5967. }
  5968. std::vector<std::string> bpe_gpt2_preprocess(const std::string & text) {
  5969. std::vector<std::string> bpe_words;
  5970. std::vector<std::string> bpe_encoded_words;
  5971. std::string token = "";
  5972. // GPT2 system regex: 's|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+
  5973. bool collecting_numeric = false;
  5974. bool collecting_letter = false;
  5975. bool collecting_special = false;
  5976. bool collecting_whitespace_lookahead = false;
  5977. bool collecting = false;
  5978. std::vector<std::string> text_utf;
  5979. text_utf.reserve(text.size());
  5980. bpe_words.reserve(text.size());
  5981. bpe_encoded_words.reserve(text.size());
  5982. auto cps = codepoints_from_utf8(text);
  5983. for (size_t i = 0; i < cps.size(); ++i)
  5984. text_utf.emplace_back(codepoint_to_utf8(cps[i]));
  5985. for (int i = 0; i < (int)text_utf.size(); i++) {
  5986. const std::string & utf_char = text_utf[i];
  5987. bool split_condition = false;
  5988. int bytes_remain = text_utf.size() - i;
  5989. // forward backward lookups
  5990. const std::string & utf_char_next = (i + 1 < (int)text_utf.size()) ? text_utf[i + 1] : "";
  5991. const std::string & utf_char_next_next = (i + 2 < (int)text_utf.size()) ? text_utf[i + 2] : "";
  5992. // handling contractions
  5993. if (!split_condition && bytes_remain >= 2) {
  5994. // 's|'t|'m|'d
  5995. if (utf_char == "\'" && (utf_char_next == "s" || utf_char_next == "t" || utf_char_next == "m" || utf_char_next == "d")) {
  5996. split_condition = true;
  5997. }
  5998. if (split_condition) {
  5999. if (token.size()) {
  6000. bpe_words.emplace_back(token); // push previous content as token
  6001. }
  6002. token = utf_char + utf_char_next;
  6003. bpe_words.emplace_back(token);
  6004. token = "";
  6005. i++;
  6006. continue;
  6007. }
  6008. }
  6009. if (!split_condition && bytes_remain >= 3) {
  6010. // 're|'ve|'ll
  6011. if (utf_char == "\'" && (
  6012. (utf_char_next == "r" && utf_char_next_next == "e") ||
  6013. (utf_char_next == "v" && utf_char_next_next == "e") ||
  6014. (utf_char_next == "l" && utf_char_next_next == "l"))
  6015. ) {
  6016. split_condition = true;
  6017. }
  6018. if (split_condition) {
  6019. // current token + next token can be defined
  6020. if (token.size()) {
  6021. bpe_words.emplace_back(token); // push previous content as token
  6022. }
  6023. token = utf_char + utf_char_next + utf_char_next_next;
  6024. bpe_words.emplace_back(token); // the contraction
  6025. token = "";
  6026. i += 2;
  6027. continue;
  6028. }
  6029. }
  6030. if (!split_condition && !collecting) {
  6031. if (codepoint_type(utf_char) == CODEPOINT_TYPE_LETTER || (!token.size() && utf_char == " " && codepoint_type(utf_char_next) == CODEPOINT_TYPE_LETTER)) {
  6032. collecting_letter = true;
  6033. collecting = true;
  6034. }
  6035. else if (codepoint_type(utf_char) == CODEPOINT_TYPE_DIGIT || (!token.size() && utf_char == " " && codepoint_type(utf_char_next) == CODEPOINT_TYPE_DIGIT)) {
  6036. collecting_numeric = true;
  6037. collecting = true;
  6038. }
  6039. else if (
  6040. ((codepoint_type(utf_char) != CODEPOINT_TYPE_LETTER && codepoint_type(utf_char) != CODEPOINT_TYPE_DIGIT) && (codepoint_type(utf_char) != CODEPOINT_TYPE_WHITESPACE)) ||
  6041. (!token.size() && utf_char == " " && codepoint_type(utf_char_next) != CODEPOINT_TYPE_LETTER && codepoint_type(utf_char_next) != CODEPOINT_TYPE_DIGIT && codepoint_type(utf_char_next) != CODEPOINT_TYPE_WHITESPACE)
  6042. ) {
  6043. collecting_special = true;
  6044. collecting = true;
  6045. }
  6046. else if (codepoint_type(utf_char) == CODEPOINT_TYPE_WHITESPACE && codepoint_type(utf_char_next) == CODEPOINT_TYPE_WHITESPACE) {
  6047. collecting_whitespace_lookahead = true;
  6048. collecting = true;
  6049. }
  6050. else if (codepoint_type(utf_char) == CODEPOINT_TYPE_WHITESPACE) {
  6051. split_condition = true;
  6052. }
  6053. }
  6054. else if (!split_condition && collecting) {
  6055. if (collecting_letter && codepoint_type(utf_char) != CODEPOINT_TYPE_LETTER) {
  6056. split_condition = true;
  6057. }
  6058. else if (collecting_numeric && codepoint_type(utf_char) != CODEPOINT_TYPE_DIGIT) {
  6059. split_condition = true;
  6060. }
  6061. else if (collecting_special && (codepoint_type(utf_char) == CODEPOINT_TYPE_LETTER || codepoint_type(utf_char) == CODEPOINT_TYPE_DIGIT || codepoint_type(utf_char) == CODEPOINT_TYPE_WHITESPACE)) {
  6062. split_condition = true;
  6063. }
  6064. else if (collecting_whitespace_lookahead && (codepoint_type(utf_char_next) == CODEPOINT_TYPE_LETTER || codepoint_type(utf_char_next) == CODEPOINT_TYPE_DIGIT)) {
  6065. split_condition = true;
  6066. }
  6067. }
  6068. if (utf_char_next == "") {
  6069. split_condition = true; // final
  6070. token += utf_char;
  6071. }
  6072. if (split_condition) {
  6073. if (token.size()) {
  6074. bpe_words.emplace_back(token);
  6075. }
  6076. token = utf_char;
  6077. collecting = false;
  6078. collecting_letter = false;
  6079. collecting_numeric = false;
  6080. collecting_special = false;
  6081. collecting_whitespace_lookahead = false;
  6082. }
  6083. else {
  6084. token += utf_char;
  6085. }
  6086. }
  6087. for (std::string & word : bpe_words) {
  6088. std::string encoded_token = "";
  6089. for (char & c : word) {
  6090. encoded_token += bytes_to_unicode_bpe(c);
  6091. }
  6092. bpe_encoded_words.emplace_back(encoded_token);
  6093. }
  6094. return bpe_encoded_words;
  6095. }
  6096. const llama_vocab & vocab;
  6097. std::vector<llm_symbol> symbols;
  6098. std::vector<llm_symbol> symbols_final;
  6099. llm_bigram_bpe::queue work_queue;
  6100. };
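// Minimal usage sketch (not compiled): unlike SPM, the BPE tokenizer does its own GPT-2 style
// pre-tokenization internally, and merges are driven by ranks instead of scores.
#if 0
static std::vector<llama_vocab::id> example_bpe_tokenize(const llama_vocab & vocab, const std::string & text) {
    std::vector<llama_vocab::id> output;
    llm_tokenizer_bpe tokenizer(vocab);
    tokenizer.tokenize(text, output);
    return output;
}
#endif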
  6101. typedef enum FRAGMENT_BUFFER_VARIANT_TYPE{
  6102. FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN,
  6103. FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT
  6104. } FRAGMENT_BUFFER_VARIANT_TYPE;
  6105. struct fragment_buffer_variant{
  6106. fragment_buffer_variant(llama_vocab::id _token)
  6107. :
  6108. type(FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN),
  6109. token(_token),
  6110. raw_text(_dummy),
  6111. offset(0),
  6112. length(0){}
  6113. fragment_buffer_variant(const std::string & _raw_text, int64_t _offset, int64_t _length)
  6114. :
  6115. type(FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT),
  6116. token((llama_vocab::id)-1),
  6117. raw_text(_raw_text),
  6118. offset(_offset),
  6119. length(_length){
  6120. GGML_ASSERT( _offset >= 0 );
  6121. GGML_ASSERT( _length >= 1 );
  6122. GGML_ASSERT( offset + length <= raw_text.length() );
  6123. }
  6124. const FRAGMENT_BUFFER_VARIANT_TYPE type;
  6125. const llama_vocab::id token;
  6126. const std::string _dummy;
  6127. const std::string & raw_text;
  6128. const uint64_t offset;
  6129. const uint64_t length;
  6130. };
  6131. // #define PRETOKENIZERDEBUG
  6132. static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<fragment_buffer_variant> & buffer)
  6133. {
  6134. // for each special token
  6135. for (const auto & st: vocab.special_tokens_cache) {
  6136. const auto & special_token = st.first;
  6137. const auto & special_id = st.second;
  6138. // for each text fragment
  6139. std::forward_list<fragment_buffer_variant>::iterator it = buffer.begin();
  6140. while (it != buffer.end()) {
  6141. auto & fragment = (*it);
  6142. // if a fragment is text ( not yet processed )
  6143. if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
  6144. auto * raw_text = &(fragment.raw_text);
  6145. auto raw_text_base_offset = fragment.offset;
  6146. auto raw_text_base_length = fragment.length;
  6147. // loop over the text
  6148. while (true) {
  6149. // find the first occurrence of a given special token in this fragment
6150. // passing the offset argument only limits the "search area", but the match coordinates
6151. // are still relative to the full source raw_text
  6152. auto match = raw_text->find(special_token, raw_text_base_offset);
  6153. // no occurrences found, stop processing this fragment for a given special token
  6154. if (match == std::string::npos) break;
  6155. // check if match is within bounds of offset <-> length
  6156. if (match + special_token.length() > raw_text_base_offset + raw_text_base_length) break;
  6157. #ifdef PRETOKENIZERDEBUG
  6158. fprintf(stderr, "FF: (%ld %ld %ld) '%s'\n", raw_text->length(), raw_text_base_offset, raw_text_base_length, raw_text->substr(raw_text_base_offset, raw_text_base_length).c_str());
  6159. #endif
  6160. auto source = std::distance(buffer.begin(), it);
  6161. // if match is further than base offset
  6162. // then we have some text to the left of it
  6163. if (match > raw_text_base_offset) {
  6164. // left
  6165. const int64_t left_reminder_offset = raw_text_base_offset + 0;
  6166. const int64_t left_reminder_length = match - raw_text_base_offset;
  6167. buffer.emplace_after(it, (*raw_text), left_reminder_offset, left_reminder_length);
  6168. #ifdef PRETOKENIZERDEBUG
  6169. fprintf(stderr, "FL: (%ld %ld) '%s'\n", left_reminder_offset, left_reminder_length, raw_text->substr(left_reminder_offset, left_reminder_length).c_str());
  6170. #endif
  6171. it++;
  6172. }
  6173. // special token
  6174. buffer.emplace_after(it, special_id);
  6175. it++;
  6176. // right
  6177. if (match + special_token.length() < raw_text_base_offset + raw_text_base_length) {
  6178. const int64_t right_reminder_offset = match + special_token.length();
  6179. const int64_t right_reminder_length = raw_text_base_length - ((match - raw_text_base_offset) + special_token.length());
  6180. buffer.emplace_after(it, (*raw_text), right_reminder_offset, right_reminder_length);
  6181. #ifdef PRETOKENIZERDEBUG
  6182. fprintf(stderr, "FR: (%ld %ld) '%s'\n", right_reminder_offset, right_reminder_length, raw_text->substr(right_reminder_offset, right_reminder_length).c_str());
  6183. #endif
  6184. it++;
  6185. if (source == 0) {
  6186. buffer.erase_after(buffer.before_begin());
  6187. } else {
  6188. buffer.erase_after(std::next(buffer.begin(), (source-1)));
  6189. }
  6190. // repeat for the right side
  6191. raw_text_base_offset = right_reminder_offset;
  6192. raw_text_base_length = right_reminder_length;
  6193. #ifdef PRETOKENIZERDEBUG
  6194. fprintf(stderr, "RR: (%ld %ld) '%s'\n", raw_text_base_offset, raw_text_base_length, raw_text->substr(raw_text_base_offset, raw_text_base_length).c_str());
  6195. #endif
  6196. } else {
  6197. if (source == 0) {
  6198. buffer.erase_after(buffer.before_begin());
  6199. } else {
  6200. buffer.erase_after(std::next(buffer.begin(), (source-1)));
  6201. }
  6202. break;
  6203. }
  6204. }
  6205. }
  6206. it++;
  6207. }
  6208. }
  6209. }
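// Illustrative sketch (not compiled) of what the partitioning above does: a single raw-text
// fragment containing a special token such as "<s>" is split into up to three fragments -
// the raw text to its left, the special token id itself, and the raw text to its right.
// This assumes "<s>" is present in vocab.special_tokens_cache.
#if 0
static void example_st_partition(const llama_vocab & vocab) {
    const std::string text = "foo<s>bar";
    std::forward_list<fragment_buffer_variant> buffer;
    buffer.emplace_front(text, 0, text.length());
    tokenizer_st_partition(vocab, buffer);
    // buffer now holds: RAW_TEXT("foo"), TOKEN(id of "<s>"), RAW_TEXT("bar")
}
#endif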
  6210. static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab, std::string raw_text, bool bos, bool special) {
  6211. std::vector<llama_vocab::id> output;
  6212. // OG tokenizer behavior:
  6213. //
  6214. // tokenizer.encode('', add_bos=True) returns [1]
  6215. // tokenizer.encode('', add_bos=False) returns []
  6216. if (bos && vocab.special_bos_id != -1) {
  6217. output.push_back(vocab.special_bos_id);
  6218. }
  6219. if (raw_text.empty()) {
  6220. return output;
  6221. }
  6222. std::forward_list<fragment_buffer_variant> fragment_buffer;
  6223. fragment_buffer.emplace_front( raw_text, 0, raw_text.length() );
  6224. if (special) tokenizer_st_partition( vocab, fragment_buffer );
  6225. switch (vocab.type) {
  6226. case LLAMA_VOCAB_TYPE_SPM:
  6227. {
  6228. for (const auto & fragment: fragment_buffer)
  6229. {
  6230. if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT)
  6231. {
  6232. // without adding this leading whitespace, we do not get the same results as the original tokenizer
  6233. // TODO: It's likely possible to get rid of this string copy entirely
  6234. // by modifying llm_tokenizer_x to operate with string offsets like pre-tokenizer
  6235. // and passing 'add space prefix' as bool argument
  6236. //
  6237. auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
  6238. if (&fragment == &fragment_buffer.front()) {
  6239. raw_text = " " + raw_text; // prefix with space if the first token is not special
  6240. }
  6241. #ifdef PRETOKENIZERDEBUG
  6242. fprintf(stderr,"TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str());
  6243. #endif
  6244. llm_tokenizer_spm tokenizer(vocab);
  6245. llama_escape_whitespace(raw_text);
  6246. tokenizer.tokenize(raw_text, output);
  6247. }
  6248. else // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
  6249. {
  6250. output.push_back(fragment.token);
  6251. }
  6252. }
  6253. } break;
  6254. case LLAMA_VOCAB_TYPE_BPE:
  6255. {
  6256. for (const auto & fragment: fragment_buffer)
  6257. {
  6258. if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT)
  6259. {
  6260. auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
  6261. #ifdef PRETOKENIZERDEBUG
  6262. fprintf(stderr,"TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str());
  6263. #endif
  6264. llm_tokenizer_bpe tokenizer(vocab);
  6265. tokenizer.tokenize(raw_text, output);
  6266. }
  6267. else // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
  6268. {
  6269. output.push_back(fragment.token);
  6270. }
  6271. }
  6272. } break;
  6273. }
  6274. return output;
  6275. }
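// Usage sketch (not compiled): the two flags mirror the OG tokenizer behavior documented above -
// `bos` prepends the BOS token if the vocab defines one, and `special` enables the special-token
// partitioning so strings like "<s>" are matched as single tokens instead of being tokenized as
// plain text (assuming "<s>" is a special token in this vocab).
#if 0
static void example_tokenize(const llama_vocab & vocab) {
    const auto ids_plain   = llama_tokenize_internal(vocab, "Hello <s> world", /*bos*/ true, /*special*/ false);
    const auto ids_special = llama_tokenize_internal(vocab, "Hello <s> world", /*bos*/ true, /*special*/ true);
    // ids_special contains the single special-token id for "<s>", ids_plain does not
}
#endif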
  6276. //
  6277. // grammar - internal
  6278. //
  6279. struct llama_partial_utf8 {
  6280. uint32_t value; // bit value so far (unshifted)
  6281. int n_remain; // num bytes remaining; -1 indicates invalid sequence
  6282. };
  6283. struct llama_grammar {
  6284. const std::vector<std::vector<llama_grammar_element>> rules;
  6285. std::vector<std::vector<const llama_grammar_element *>> stacks;
  6286. // buffer for partially generated UTF-8 sequence from accepted tokens
  6287. llama_partial_utf8 partial_utf8;
  6288. };
  6289. struct llama_grammar_candidate {
  6290. size_t index;
  6291. const uint32_t * code_points;
  6292. llama_partial_utf8 partial_utf8;
  6293. };
6294. // Decodes a UTF-8 string which may end in an incomplete sequence. Adds a terminating 0 so the
6295. // result can be used as a raw pointer. If an invalid sequence is encountered, returns `llama_partial_utf8.n_remain == -1`.
  6296. static std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
  6297. const std::string & src,
  6298. llama_partial_utf8 partial_start) {
  6299. static const int lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 2, 2, 3, 4 };
  6300. const char * pos = src.c_str();
  6301. std::vector<uint32_t> code_points;
6302. // common English strings have the same number of code points and bytes; `+ 1` is for the terminating 0.
  6303. code_points.reserve(src.size() + 1);
  6304. uint32_t value = partial_start.value;
  6305. int n_remain = partial_start.n_remain;
  6306. // continue previous decode, if applicable
  6307. while (*pos != 0 && n_remain > 0) {
  6308. uint8_t next_byte = static_cast<uint8_t>(*pos);
  6309. if ((next_byte >> 6) != 2) {
  6310. // invalid sequence, abort
  6311. code_points.push_back(0);
  6312. return std::make_pair(std::move(code_points), llama_partial_utf8{ 0, -1 });
  6313. }
  6314. value = (value << 6) + (next_byte & 0x3F);
  6315. ++pos;
  6316. --n_remain;
  6317. }
  6318. if (partial_start.n_remain > 0 && n_remain == 0) {
  6319. code_points.push_back(value);
  6320. }
  6321. // decode any subsequent utf-8 sequences, which may end in an incomplete one
  6322. while (*pos != 0) {
  6323. uint8_t first_byte = static_cast<uint8_t>(*pos);
  6324. uint8_t highbits = first_byte >> 4;
  6325. n_remain = lookup[highbits] - 1;
  6326. if (n_remain < 0) {
  6327. // invalid sequence, abort
  6328. code_points.clear();
  6329. code_points.push_back(0);
  6330. return std::make_pair(std::move(code_points), llama_partial_utf8{ 0, n_remain });
  6331. }
  6332. uint8_t mask = (1 << (7 - n_remain)) - 1;
  6333. value = first_byte & mask;
  6334. ++pos;
  6335. while (*pos != 0 && n_remain > 0) {
  6336. value = (value << 6) + (static_cast<uint8_t>(*pos) & 0x3F);
  6337. ++pos;
  6338. --n_remain;
  6339. }
  6340. if (n_remain == 0) {
  6341. code_points.push_back(value);
  6342. }
  6343. }
  6344. code_points.push_back(0);
  6345. return std::make_pair(std::move(code_points), llama_partial_utf8{ value, n_remain });
  6346. }
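// Sketch (not compiled) of how the partial-UTF-8 carry is used when a multi-byte character is
// split across two token pieces: the trailing state of the first call is fed into the second
// call, which then emits the completed code point.
#if 0
static void example_decode_utf8_split() {
    // U+00E9 ("e" with acute accent) is 0xC3 0xA9 in UTF-8; split it across two pieces
    auto first  = decode_utf8("\xC3", { 0, 0 });
    auto second = decode_utf8("\xA9", first.second);
    // first.first  == { 0 }       (no complete code point yet, first.second.n_remain == 1)
    // second.first == { 0xE9, 0 } (the completed code point plus the terminating 0)
}
#endif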
  6347. // returns true iff pos points to the end of one of the definitions of a rule
  6348. static bool llama_grammar_is_end_of_sequence(const llama_grammar_element * pos) {
  6349. switch (pos->type) {
  6350. case LLAMA_GRETYPE_END: return true; // NOLINT
  6351. case LLAMA_GRETYPE_ALT: return true; // NOLINT
  6352. default: return false;
  6353. }
  6354. }
  6355. // returns true iff chr satisfies the char range at pos (regular or inverse range)
  6356. // asserts that pos is pointing to a char range element
  6357. static std::pair<bool, const llama_grammar_element *> llama_grammar_match_char(
  6358. const llama_grammar_element * pos,
  6359. const uint32_t chr) {
  6360. bool found = false;
  6361. bool is_positive_char = pos->type == LLAMA_GRETYPE_CHAR;
  6362. GGML_ASSERT(is_positive_char || pos->type == LLAMA_GRETYPE_CHAR_NOT); // NOLINT
  6363. do {
  6364. if (pos[1].type == LLAMA_GRETYPE_CHAR_RNG_UPPER) {
  6365. // inclusive range, e.g. [a-z]
  6366. found = found || (pos->value <= chr && chr <= pos[1].value);
  6367. pos += 2;
  6368. } else {
  6369. // exact char match, e.g. [a] or "a"
  6370. found = found || pos->value == chr;
  6371. pos += 1;
  6372. }
  6373. } while (pos->type == LLAMA_GRETYPE_CHAR_ALT);
  6374. return std::make_pair(found == is_positive_char, pos);
  6375. }
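// Sketch (not compiled): a char element followed by a CHAR_RNG_UPPER element, such as
// { CHAR 'a', CHAR_RNG_UPPER 'z' }, encodes the terminal [a-z]; llama_grammar_match_char()
// walks the (range | alt) chain and returns both the match result and the position just past it.
#if 0
static void example_match_char() {
    const llama_grammar_element range[] = {
        { LLAMA_GRETYPE_CHAR,           'a' },
        { LLAMA_GRETYPE_CHAR_RNG_UPPER, 'z' },
        { LLAMA_GRETYPE_END,            0   },
    };
    GGML_ASSERT( llama_grammar_match_char(range, 'm').first);
    GGML_ASSERT(!llama_grammar_match_char(range, '0').first);
}
#endif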
  6376. // returns true iff some continuation of the given partial UTF-8 sequence could satisfy the char
  6377. // range at pos (regular or inverse range)
  6378. // asserts that pos is pointing to a char range element
  6379. static bool llama_grammar_match_partial_char(
  6380. const llama_grammar_element * pos,
  6381. const llama_partial_utf8 partial_utf8) {
  6382. bool is_positive_char = pos->type == LLAMA_GRETYPE_CHAR;
  6383. GGML_ASSERT(is_positive_char || pos->type == LLAMA_GRETYPE_CHAR_NOT);
  6384. uint32_t partial_value = partial_utf8.value;
  6385. int n_remain = partial_utf8.n_remain;
  6386. // invalid sequence or 7-bit char split across 2 bytes (overlong)
  6387. if (n_remain < 0 || (n_remain == 1 && partial_value < 2)) {
  6388. return false;
  6389. }
  6390. // range of possible code points this partial UTF-8 sequence could complete to
  6391. uint32_t low = partial_value << (n_remain * 6);
  6392. uint32_t high = low | ((1 << (n_remain * 6)) - 1);
  6393. if (low == 0) {
  6394. if (n_remain == 2) {
  6395. low = 1 << 11;
  6396. } else if (n_remain == 3) {
  6397. low = 1 << 16;
  6398. }
  6399. }
  6400. do {
  6401. if (pos[1].type == LLAMA_GRETYPE_CHAR_RNG_UPPER) {
  6402. // inclusive range, e.g. [a-z]
  6403. if (pos->value <= high && low <= pos[1].value) {
  6404. return is_positive_char;
  6405. }
  6406. pos += 2;
  6407. } else {
  6408. // exact char match, e.g. [a] or "a"
  6409. if (low <= pos->value && pos->value <= high) {
  6410. return is_positive_char;
  6411. }
  6412. pos += 1;
  6413. }
  6414. } while (pos->type == LLAMA_GRETYPE_CHAR_ALT);
  6415. return !is_positive_char;
  6416. }
  6417. // transforms a grammar pushdown stack into N possible stacks, all ending
  6418. // at a character range (terminal element)
  6419. static void llama_grammar_advance_stack(
  6420. const std::vector<std::vector<llama_grammar_element>> & rules,
  6421. const std::vector<const llama_grammar_element *> & stack,
  6422. std::vector<std::vector<const llama_grammar_element *>> & new_stacks) {
  6423. if (stack.empty()) {
  6424. new_stacks.emplace_back(stack);
  6425. return;
  6426. }
  6427. const llama_grammar_element * pos = stack.back();
  6428. switch (pos->type) {
  6429. case LLAMA_GRETYPE_RULE_REF: {
  6430. const size_t rule_id = static_cast<size_t>(pos->value);
  6431. const llama_grammar_element * subpos = rules[rule_id].data();
  6432. do {
  6433. // init new stack without the top (pos)
  6434. std::vector<const llama_grammar_element *> new_stack(stack.begin(), stack.end() - 1);
  6435. if (!llama_grammar_is_end_of_sequence(pos + 1)) {
  6436. // if this rule ref is followed by another element, add that to stack
  6437. new_stack.push_back(pos + 1);
  6438. }
  6439. if (!llama_grammar_is_end_of_sequence(subpos)) {
  6440. // if alternate is nonempty, add to stack
  6441. new_stack.push_back(subpos);
  6442. }
  6443. llama_grammar_advance_stack(rules, new_stack, new_stacks);
  6444. while (!llama_grammar_is_end_of_sequence(subpos)) {
  6445. // scan to end of alternate def
  6446. subpos++;
  6447. }
  6448. if (subpos->type == LLAMA_GRETYPE_ALT) {
  6449. // there's another alternate def of this rule to process
  6450. subpos++;
  6451. } else {
  6452. break;
  6453. }
  6454. } while (true);
  6455. break;
  6456. }
  6457. case LLAMA_GRETYPE_CHAR:
  6458. case LLAMA_GRETYPE_CHAR_NOT:
  6459. new_stacks.emplace_back(stack);
  6460. break;
  6461. default:
  6462. // end of alternate (LLAMA_GRETYPE_END, LLAMA_GRETYPE_ALT) or middle of char range
  6463. // (LLAMA_GRETYPE_CHAR_ALT, LLAMA_GRETYPE_CHAR_RNG_UPPER); stack should never be left on
  6464. // those
  6465. GGML_ASSERT(false);
  6466. }
  6467. }
  6468. // takes a set of possible pushdown stacks on a grammar, which are required to
  6469. // be positioned at a character range (see `llama_grammar_advance_stack`), and
  6470. // produces the N possible stacks if the given char is accepted at those
  6471. // positions
  6472. static std::vector<std::vector<const llama_grammar_element *>> llama_grammar_accept(
  6473. const std::vector<std::vector<llama_grammar_element>> & rules,
  6474. const std::vector<std::vector<const llama_grammar_element *>> & stacks,
  6475. const uint32_t chr) {
  6476. std::vector<std::vector<const llama_grammar_element *>> new_stacks;
  6477. for (const auto & stack : stacks) {
  6478. if (stack.empty()) {
  6479. continue;
  6480. }
  6481. auto match = llama_grammar_match_char(stack.back(), chr);
  6482. if (match.first) {
  6483. const llama_grammar_element * pos = match.second;
  6484. // update top of stack to next element, if any
  6485. std::vector<const llama_grammar_element *> new_stack(stack.begin(), stack.end() - 1);
  6486. if (!llama_grammar_is_end_of_sequence(pos)) {
  6487. new_stack.push_back(pos);
  6488. }
  6489. llama_grammar_advance_stack(rules, new_stack, new_stacks);
  6490. }
  6491. }
  6492. return new_stacks;
  6493. }
  6494. static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates(
  6495. const std::vector<std::vector<llama_grammar_element>> & rules,
  6496. const std::vector<std::vector<const llama_grammar_element *>> & stacks,
  6497. const std::vector<llama_grammar_candidate> & candidates);
  6498. static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates_for_stack(
  6499. const std::vector<std::vector<llama_grammar_element>> & rules,
  6500. const std::vector<const llama_grammar_element *> & stack,
  6501. const std::vector<llama_grammar_candidate> & candidates) {
  6502. std::vector<llama_grammar_candidate> rejects;
  6503. if (stack.empty()) {
  6504. for (const auto & tok : candidates) {
  6505. if (*tok.code_points != 0 || tok.partial_utf8.n_remain != 0) {
  6506. rejects.push_back(tok);
  6507. }
  6508. }
  6509. return rejects;
  6510. }
  6511. const llama_grammar_element * stack_pos = stack.back();
  6512. std::vector<llama_grammar_candidate> next_candidates;
  6513. for (const auto & tok : candidates) {
  6514. if (*tok.code_points == 0) {
  6515. // reached end of full codepoints in token, reject iff it ended in a partial sequence
  6516. // that cannot satisfy this position in grammar
  6517. if (tok.partial_utf8.n_remain != 0 &&
  6518. !llama_grammar_match_partial_char(stack_pos, tok.partial_utf8)) {
  6519. rejects.push_back(tok);
  6520. }
  6521. } else if (llama_grammar_match_char(stack_pos, *tok.code_points).first) {
  6522. next_candidates.push_back({ tok.index, tok.code_points + 1, tok.partial_utf8 });
  6523. } else {
  6524. rejects.push_back(tok);
  6525. }
  6526. }
  6527. const auto * stack_pos_after = llama_grammar_match_char(stack_pos, 0).second;
  6528. // update top of stack to next element, if any
  6529. std::vector<const llama_grammar_element *> stack_after(stack.begin(), stack.end() - 1);
  6530. if (!llama_grammar_is_end_of_sequence(stack_pos_after)) {
  6531. stack_after.push_back(stack_pos_after);
  6532. }
  6533. std::vector<std::vector<const llama_grammar_element *>> next_stacks;
  6534. llama_grammar_advance_stack(rules, stack_after, next_stacks);
  6535. auto next_rejects = llama_grammar_reject_candidates(rules, next_stacks, next_candidates);
  6536. for (const auto & tok : next_rejects) {
  6537. rejects.push_back({ tok.index, tok.code_points - 1, tok.partial_utf8 });
  6538. }
  6539. return rejects;
  6540. }
  6541. static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates(
  6542. const std::vector<std::vector<llama_grammar_element>> & rules,
  6543. const std::vector<std::vector<const llama_grammar_element *>> & stacks,
  6544. const std::vector<llama_grammar_candidate> & candidates) {
  6545. GGML_ASSERT(!stacks.empty()); // REVIEW
  6546. if (candidates.empty()) {
  6547. return std::vector<llama_grammar_candidate>();
  6548. }
  6549. auto rejects = llama_grammar_reject_candidates_for_stack(rules, stacks.front(), candidates);
  6550. for (size_t i = 1, size = stacks.size(); i < size; ++i) {
  6551. rejects = llama_grammar_reject_candidates_for_stack(rules, stacks[i], rejects);
  6552. }
  6553. return rejects;
  6554. }
  6555. //
  6556. // grammar - external
  6557. //
  6558. struct llama_grammar * llama_grammar_init(
  6559. const llama_grammar_element ** rules,
  6560. size_t n_rules,
  6561. size_t start_rule_index) {
  6562. const llama_grammar_element * pos;
  6563. // copy rule definitions into vectors
  6564. std::vector<std::vector<llama_grammar_element>> vec_rules(n_rules);
  6565. for (size_t i = 0; i < n_rules; i++) {
  6566. for (pos = rules[i]; pos->type != LLAMA_GRETYPE_END; pos++) {
  6567. vec_rules[i].push_back(*pos);
  6568. }
  6569. vec_rules[i].push_back({LLAMA_GRETYPE_END, 0});
  6570. }
  6571. // loop over alternates of start rule to build initial stacks
  6572. std::vector<std::vector<const llama_grammar_element *>> stacks;
  6573. pos = rules[start_rule_index];
  6574. do {
  6575. std::vector<const llama_grammar_element *> stack;
  6576. if (!llama_grammar_is_end_of_sequence(pos)) {
  6577. // if alternate is nonempty, add to stack
  6578. stack.push_back(pos);
  6579. }
  6580. llama_grammar_advance_stack(vec_rules, stack, stacks);
  6581. while (!llama_grammar_is_end_of_sequence(pos)) {
  6582. // scan to end of alternate def
  6583. pos++;
  6584. }
  6585. if (pos->type == LLAMA_GRETYPE_ALT) {
  6586. // there's another alternate def of this rule to process
  6587. pos++;
  6588. } else {
  6589. break;
  6590. }
  6591. } while (true);
  6592. return new llama_grammar{ std::move(vec_rules), std::move(stacks), {} };
  6593. }
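// Construction sketch (not compiled): a trivial grammar with a single rule root ::= "y" | "n",
// laid out as the flat element array that llama_grammar_init() expects (alternates separated by
// ALT, each rule terminated by END). In practice this array is usually produced by the GBNF
// parser in common/ rather than written by hand.
#if 0
static struct llama_grammar * example_build_grammar() {
    static const llama_grammar_element root_rule[] = {
        { LLAMA_GRETYPE_CHAR, 'y' },
        { LLAMA_GRETYPE_ALT,  0   },
        { LLAMA_GRETYPE_CHAR, 'n' },
        { LLAMA_GRETYPE_END,  0   },
    };
    const llama_grammar_element * rules[] = { root_rule };
    return llama_grammar_init(rules, 1, 0); // 1 rule, start at rule 0; free with llama_grammar_free()
}
#endif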
  6594. void llama_grammar_free(struct llama_grammar * grammar) {
  6595. delete grammar;
  6596. }
  6597. struct llama_grammar * llama_grammar_copy(const struct llama_grammar * grammar) {
  6598. llama_grammar * result = new llama_grammar{ grammar->rules, grammar->stacks, grammar->partial_utf8 };
  6599. // redirect elements in stacks to point to new rules
  6600. for (size_t is = 0; is < result->stacks.size(); is++) {
  6601. for (size_t ie = 0; ie < result->stacks[is].size(); ie++) {
  6602. for (size_t ir0 = 0; ir0 < grammar->rules.size(); ir0++) {
  6603. for (size_t ir1 = 0; ir1 < grammar->rules[ir0].size(); ir1++) {
  6604. if (grammar->stacks[is][ie] == &grammar->rules[ir0][ir1]) {
  6605. result->stacks[is][ie] = &result->rules[ir0][ir1];
  6606. }
  6607. }
  6608. }
  6609. }
  6610. }
  6611. return result;
  6612. }
  6613. //
  6614. // sampling
  6615. //
  6616. void llama_set_rng_seed(struct llama_context * ctx, uint32_t seed) {
  6617. if (seed == LLAMA_DEFAULT_SEED) {
  6618. seed = time(NULL);
  6619. }
  6620. ctx->rng.seed(seed);
  6621. }
  6622. void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates) {
  6623. GGML_ASSERT(candidates->size > 0);
  6624. const int64_t t_start_sample_us = ggml_time_us();
  6625. // Sort the logits in descending order
  6626. if (!candidates->sorted) {
  6627. std::sort(candidates->data, candidates->data + candidates->size, [](const llama_token_data & a, const llama_token_data & b) {
  6628. return a.logit > b.logit;
  6629. });
  6630. candidates->sorted = true;
  6631. }
  6632. float max_l = candidates->data[0].logit;
  6633. float cum_sum = 0.0f;
  6634. for (size_t i = 0; i < candidates->size; ++i) {
  6635. float p = expf(candidates->data[i].logit - max_l);
  6636. candidates->data[i].p = p;
  6637. cum_sum += p;
  6638. }
  6639. for (size_t i = 0; i < candidates->size; ++i) {
  6640. candidates->data[i].p /= cum_sum;
  6641. }
  6642. if (ctx) {
  6643. ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
  6644. }
  6645. }
  6646. void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * candidates, int32_t k, size_t min_keep) {
  6647. const int64_t t_start_sample_us = ggml_time_us();
  6648. k = std::max(k, (int) min_keep);
  6649. k = std::min(k, (int) candidates->size);
  6650. // Sort scores in descending order
  6651. if (!candidates->sorted) {
  6652. auto comp = [](const llama_token_data & a, const llama_token_data & b) {
  6653. return a.logit > b.logit;
  6654. };
  6655. if (k == (int) candidates->size) {
  6656. std::sort(candidates->data, candidates->data + candidates->size, comp);
  6657. } else {
  6658. std::partial_sort(candidates->data, candidates->data + k, candidates->data + candidates->size, comp);
  6659. }
  6660. candidates->sorted = true;
  6661. }
  6662. candidates->size = k;
  6663. if (ctx) {
  6664. ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
  6665. }
  6666. }
  6667. void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep) {
  6668. if (p >= 1.0f) {
  6669. return;
  6670. }
  6671. llama_sample_softmax(ctx, candidates);
  6672. const int64_t t_start_sample_us = ggml_time_us();
  6673. // Compute the cumulative probabilities
  6674. float cum_sum = 0.0f;
  6675. size_t last_idx = candidates->size;
  6676. for (size_t i = 0; i < candidates->size; ++i) {
  6677. cum_sum += candidates->data[i].p;
6678. // Check if the running sum is at least p or if we have kept at least min_keep tokens.
6679. // We set the last index to i+1 so that the current token is included in the kept set.
  6680. if (cum_sum >= p && i + 1 >= min_keep) {
  6681. last_idx = i + 1;
  6682. break;
  6683. }
  6684. }
  6685. // Resize the output vector to keep only the top-p tokens
  6686. candidates->size = last_idx;
  6687. if (ctx) {
  6688. ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
  6689. }
  6690. }
  6691. void llama_sample_min_p(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep) {
  6692. if (p <= 0.0f || !candidates->size) {
  6693. return;
  6694. }
  6695. llama_sample_softmax(ctx, candidates);
  6696. const int64_t t_start_sample_us = ggml_time_us();
  6697. float scale = candidates->data[0].p; // scale by max prob
  6698. size_t i = 1; // first token always matches
  6699. for (; i < candidates->size; ++i) {
  6700. if (candidates->data[i].p < p * scale && i >= min_keep) {
  6701. break; // prob too small
  6702. }
  6703. }
  6704. // Resize the output vector to keep only the matching tokens
  6705. candidates->size = i;
  6706. if (ctx) {
  6707. ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
  6708. }
  6709. }
  6710. void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array * candidates, float z, size_t min_keep) {
  6711. if (z >= 1.0f || candidates->size <= 2) {
  6712. return;
  6713. }
  6714. llama_sample_softmax(nullptr, candidates);
  6715. const int64_t t_start_sample_us = ggml_time_us();
  6716. // Compute the first and second derivatives
  6717. std::vector<float> first_derivatives(candidates->size - 1);
  6718. std::vector<float> second_derivatives(candidates->size - 2);
  6719. for (size_t i = 0; i < first_derivatives.size(); ++i) {
  6720. first_derivatives[i] = candidates->data[i].p - candidates->data[i + 1].p;
  6721. }
  6722. for (size_t i = 0; i < second_derivatives.size(); ++i) {
  6723. second_derivatives[i] = first_derivatives[i] - first_derivatives[i + 1];
  6724. }
  6725. // Calculate absolute value of second derivatives
  6726. for (size_t i = 0; i < second_derivatives.size(); ++i) {
  6727. second_derivatives[i] = std::abs(second_derivatives[i]);
  6728. }
  6729. // Normalize the second derivatives
  6730. {
  6731. const float second_derivatives_sum = std::accumulate(second_derivatives.begin(), second_derivatives.end(), 0.0f);
  6732. if (second_derivatives_sum > 1e-6f) {
  6733. for (float & value : second_derivatives) {
  6734. value /= second_derivatives_sum;
  6735. }
  6736. } else {
  6737. for (float & value : second_derivatives) {
  6738. value = 1.0f / second_derivatives.size();
  6739. }
  6740. }
  6741. }
  6742. float cum_sum = 0.0f;
  6743. size_t last_idx = candidates->size;
  6744. for (size_t i = 0; i < second_derivatives.size(); ++i) {
  6745. cum_sum += second_derivatives[i];
  6746. // Check if the running sum is greater than z or if we have kept at least min_keep tokens
  6747. if (cum_sum > z && i >= min_keep) {
  6748. last_idx = i;
  6749. break;
  6750. }
  6751. }
  6752. // Resize the output vector to keep only the tokens above the tail location
  6753. candidates->size = last_idx;
  6754. if (ctx) {
  6755. ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
  6756. }
  6757. }
  6758. void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep) {
  6759. // Reference implementation:
  6760. // https://github.com/huggingface/transformers/compare/main...cimeister:typical-sampling:typical-pr
  6761. if (p >= 1.0f) {
  6762. return;
  6763. }
  6764. // Compute the softmax of logits and calculate entropy
  6765. llama_sample_softmax(nullptr, candidates);
  6766. const int64_t t_start_sample_us = ggml_time_us();
  6767. float entropy = 0.0f;
  6768. for (size_t i = 0; i < candidates->size; ++i) {
  6769. entropy += -candidates->data[i].p * logf(candidates->data[i].p);
  6770. }
  6771. // Compute the absolute difference between negative log probability and entropy for each candidate
  6772. std::vector<float> shifted_scores;
  6773. for (size_t i = 0; i < candidates->size; ++i) {
  6774. float shifted_score = fabsf(-logf(candidates->data[i].p) - entropy);
  6775. shifted_scores.push_back(shifted_score);
  6776. }
  6777. // Sort tokens based on the shifted_scores and their corresponding indices
  6778. std::vector<size_t> indices(candidates->size);
  6779. std::iota(indices.begin(), indices.end(), 0);
  6780. std::sort(indices.begin(), indices.end(), [&](size_t a, size_t b) {
  6781. return shifted_scores[a] < shifted_scores[b];
  6782. });
  6783. // Compute the cumulative probabilities
  6784. float cum_sum = 0.0f;
  6785. size_t last_idx = indices.size();
  6786. for (size_t i = 0; i < indices.size(); ++i) {
  6787. size_t idx = indices[i];
  6788. cum_sum += candidates->data[idx].p;
  6789. // Check if the running sum is greater than typical or if we have kept at least min_keep tokens
  6790. if (cum_sum > p && i >= min_keep - 1) {
  6791. last_idx = i + 1;
  6792. break;
  6793. }
  6794. }
  6795. // Resize the output vector to keep only the locally typical tokens
  6796. std::vector<llama_token_data> new_candidates;
  6797. for (size_t i = 0; i < last_idx; ++i) {
  6798. size_t idx = indices[i];
  6799. new_candidates.push_back(candidates->data[idx]);
  6800. }
  6801. // Replace the data in candidates with the new_candidates data
  6802. std::copy(new_candidates.begin(), new_candidates.end(), candidates->data);
  6803. candidates->size = new_candidates.size();
  6804. candidates->sorted = false;
  6805. if (ctx) {
  6806. ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
  6807. }
  6808. }
  6809. void llama_sample_temp(struct llama_context * ctx, llama_token_data_array * candidates_p, float temp) {
  6810. const int64_t t_start_sample_us = ggml_time_us();
  6811. for (size_t i = 0; i < candidates_p->size; ++i) {
  6812. candidates_p->data[i].logit /= temp;
  6813. }
  6814. if (ctx) {
  6815. ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
  6816. }
  6817. }
  6818. void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array * candidates_p, float temp) {
  6819. llama_sample_temp(ctx, candidates_p, temp);
  6820. }
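// Usage sketch (not compiled), similar to what the example programs do: build a
// llama_token_data_array from the logits of the last evaluated token, apply the truncation
// samplers, then temperature, then draw a token from the remaining distribution.
#if 0
static llama_token example_sample_next(struct llama_context * ctx) {
    const int n_vocab = llama_n_vocab(llama_get_model(ctx));
    float * logits = llama_get_logits(ctx); // logits of the last token (single-token decode case)

    std::vector<llama_token_data> cur;
    cur.reserve(n_vocab);
    for (llama_token id = 0; id < n_vocab; id++) {
        cur.push_back({ id, logits[id], 0.0f });
    }
    llama_token_data_array candidates = { cur.data(), cur.size(), false };

    llama_sample_top_k(ctx, &candidates, 40,    1);
    llama_sample_top_p(ctx, &candidates, 0.95f, 1);
    llama_sample_temp (ctx, &candidates, 0.8f);
    return llama_sample_token(ctx, &candidates);
}
#endif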
  6821. void llama_sample_repetition_penalties(
  6822. struct llama_context * ctx,
  6823. llama_token_data_array * candidates,
  6824. const llama_token * last_tokens,
  6825. size_t penalty_last_n,
  6826. float penalty_repeat,
  6827. float penalty_freq,
  6828. float penalty_present) {
  6829. if (penalty_last_n == 0 || (penalty_repeat == 1.0f && penalty_freq == 0.0f && penalty_present == 0.0f)) {
  6830. return;
  6831. }
  6832. const int64_t t_start_sample_us = ggml_time_us();
  6833. // Create a frequency map to count occurrences of each token in last_tokens
  6834. std::unordered_map<llama_token, int> token_count;
  6835. for (size_t i = 0; i < penalty_last_n; ++i) {
  6836. token_count[last_tokens[i]]++;
  6837. }
  6838. // Apply frequency and presence penalties to the candidates
  6839. for (size_t i = 0; i < candidates->size; ++i) {
  6840. const auto token_iter = token_count.find(candidates->data[i].id);
  6841. if (token_iter == token_count.end()) {
  6842. continue;
  6843. }
  6844. const int count = token_iter->second;
6845. // The academic publication that described this technique only divided by the penalty, but that would make tokens with negative logits more likely, which is obviously wrong.
6846. // The common fix is to multiply negative logits by the penalty instead of dividing them.
  6847. if (candidates->data[i].logit <= 0) {
  6848. candidates->data[i].logit *= penalty_repeat;
  6849. } else {
  6850. candidates->data[i].logit /= penalty_repeat;
  6851. }
  6852. candidates->data[i].logit -= float(count) * penalty_freq + float(count > 0) * penalty_present;
  6853. }
  6854. candidates->sorted = false;
  6855. if (ctx) {
  6856. ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
  6857. }
  6858. }
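//
// [editor's sketch, not part of the original source] Typical call site for the combined
// penalties above: keep a buffer of the most recent tokens and apply the penalties before
// any other sampler. example_apply_penalties and its parameter names are hypothetical.
static void example_apply_penalties(
        struct llama_context * ctx,
        llama_token_data_array * cur_p,
        const std::vector<llama_token> & last_tokens,
        float repeat, float freq, float present) {
    // penalize at most the last 64 generated tokens (an arbitrary example window)
    const size_t penalty_last_n = std::min<size_t>(64, last_tokens.size());
    llama_sample_repetition_penalties(
            ctx, cur_p,
            last_tokens.data() + last_tokens.size() - penalty_last_n,
            penalty_last_n, repeat, freq, present);
}
//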
  6859. void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * candidates, const struct llama_grammar * grammar) {
  6860. GGML_ASSERT(ctx);
  6861. const int64_t t_start_sample_us = ggml_time_us();
  6862. bool allow_eos = false;
  6863. for (const auto & stack : grammar->stacks) {
  6864. if (stack.empty()) {
  6865. allow_eos = true;
  6866. break;
  6867. }
  6868. }
  6869. const llama_token eos = llama_token_eos(&ctx->model);
  6870. std::vector<std::pair<std::vector<uint32_t>, llama_partial_utf8>> candidates_decoded;
  6871. candidates_decoded.reserve(candidates->size);
  6872. std::vector<llama_grammar_candidate> candidates_grammar;
  6873. candidates_grammar.reserve(candidates->size);
  6874. for (size_t i = 0; i < candidates->size; ++i) {
  6875. const llama_token id = candidates->data[i].id;
  6876. const std::string piece = llama_token_to_piece(ctx, id);
  6877. if (id == eos) {
  6878. if (!allow_eos) {
  6879. candidates->data[i].logit = -INFINITY;
  6880. }
  6881. } else if (piece.empty() || piece[0] == 0) {
  6882. candidates->data[i].logit = -INFINITY;
  6883. } else {
  6884. candidates_decoded.push_back(decode_utf8(piece, grammar->partial_utf8));
  6885. candidates_grammar.push_back({ i, candidates_decoded.back().first.data(), candidates_decoded.back().second });
  6886. }
  6887. }
  6888. const auto rejects = llama_grammar_reject_candidates(grammar->rules, grammar->stacks, candidates_grammar);
  6889. for (const auto & reject : rejects) {
  6890. candidates->data[reject.index].logit = -INFINITY;
  6891. }
  6892. ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
  6893. }
  6894. static void llama_log_softmax(float * array, size_t size) {
  6895. float max_l = *std::max_element(array, array + size);
  6896. float sum = 0.f;
  6897. for (size_t i = 0; i < size; ++i) {
  6898. float p = expf(array[i] - max_l);
  6899. sum += p;
  6900. array[i] = p;
  6901. }
  6902. for (size_t i = 0; i < size; ++i) {
  6903. array[i] = logf(array[i] / sum);
  6904. }
  6905. }
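//
// [editor's note, not in the original] The helper above computes the numerically stable
// log-softmax: log_softmax(x_i) = (x_i - max) - log(sum_j exp(x_j - max)). The first pass
// stores exp(x_i - max) and accumulates the sum; the second takes the log of each ratio.
//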
  6906. void llama_sample_classifier_free_guidance(
  6907. struct llama_context * ctx,
  6908. llama_token_data_array * candidates,
  6909. struct llama_context * guidance_ctx,
  6910. float scale) {
  6911. int64_t t_start_sample_us = ggml_time_us();
  6912. GGML_ASSERT(ctx);
  6913. auto n_vocab = llama_n_vocab(llama_get_model(ctx));
  6914. GGML_ASSERT(n_vocab == (int)candidates->size);
  6915. GGML_ASSERT(!candidates->sorted);
  6916. std::vector<float> logits_base;
  6917. logits_base.reserve(candidates->size);
  6918. for (size_t i = 0; i < candidates->size; ++i) {
  6919. logits_base.push_back(candidates->data[i].logit);
  6920. }
  6921. llama_log_softmax(logits_base.data(), candidates->size);
  6922. float* logits_guidance = llama_get_logits(guidance_ctx);
  6923. llama_log_softmax(logits_guidance, n_vocab);
  6924. for (int i = 0; i < n_vocab; ++i) {
  6925. float logit_guidance = logits_guidance[i];
  6926. float logit_base = logits_base[i];
  6927. candidates->data[i].logit = scale * (logit_base - logit_guidance) + logit_guidance;
  6928. }
  6929. if (ctx) {
  6930. ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
  6931. }
  6932. }
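//
// [editor's note, not in the original] Worked example of the guidance mix above: with a
// base log-probability of -1.0, a guidance log-probability of -3.0 and scale = 1.5, the
// blended logit is -3.0 + 1.5 * (-1.0 - (-3.0)) = 0.0. A scale of 1.0 reproduces the base
// distribution, and larger scales push the result further away from the guidance context.
//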
  6933. llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, int32_t m, float * mu) {
  6934. GGML_ASSERT(ctx);
  6935. auto N = float(llama_n_vocab(llama_get_model(ctx)));
  6936. int64_t t_start_sample_us;
  6937. t_start_sample_us = ggml_time_us();
  6938. llama_sample_softmax(nullptr, candidates);
  6939. // Estimate s_hat using the most probable m tokens
  6940. float s_hat = 0.0;
  6941. float sum_ti_bi = 0.0;
  6942. float sum_ti_sq = 0.0;
  6943. for (size_t i = 0; i < size_t(m - 1) && i < candidates->size - 1; ++i) {
  6944. float t_i = logf(float(i + 2) / float(i + 1));
  6945. float b_i = logf(candidates->data[i].p / candidates->data[i + 1].p);
  6946. sum_ti_bi += t_i * b_i;
  6947. sum_ti_sq += t_i * t_i;
  6948. }
  6949. s_hat = sum_ti_bi / sum_ti_sq;
  6950. // Compute k from the estimated s_hat and target surprise value
  6951. float epsilon_hat = s_hat - 1;
  6952. float k = powf((epsilon_hat * powf(2, *mu)) / (1 - powf(N, -epsilon_hat)), 1 / s_hat);
  6953. // Sample the next word X using top-k sampling
  6954. llama_sample_top_k(nullptr, candidates, int(k), 1);
  6955. if (ctx) {
  6956. ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
  6957. }
  6958. llama_token X = llama_sample_token(ctx, candidates);
  6959. t_start_sample_us = ggml_time_us();
  6960. // Compute error as the difference between observed surprise and target surprise value
  6961. size_t X_idx = std::distance(candidates->data, std::find_if(candidates->data, candidates->data + candidates->size, [&](const llama_token_data & candidate) {
  6962. return candidate.id == X;
  6963. }));
  6964. float observed_surprise = -log2f(candidates->data[X_idx].p);
  6965. float e = observed_surprise - tau;
  6966. // Update mu using the learning rate and error
  6967. *mu = *mu - eta * e;
  6968. if (ctx) {
  6969. ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
  6970. }
  6971. return X;
  6972. }
  6973. llama_token llama_sample_token_mirostat_v2(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, float * mu) {
  6974. int64_t t_start_sample_us;
  6975. t_start_sample_us = ggml_time_us();
  6976. llama_sample_softmax(ctx, candidates);
  6977. // Truncate the words with surprise values greater than mu
  6978. candidates->size = std::distance(candidates->data, std::find_if(candidates->data, candidates->data + candidates->size, [&](const llama_token_data & candidate) {
  6979. return -log2f(candidate.p) > *mu;
  6980. }));
  6981. if (candidates->size == 0) {
  6982. candidates->size = 1;
  6983. }
  6984. if (ctx) {
  6985. ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
  6986. }
  6987. // Normalize the probabilities of the remaining words
  6988. llama_sample_softmax(ctx, candidates);
  6989. // Sample the next word X from the remaining words
  6990. llama_token X = llama_sample_token(ctx, candidates);
  6991. t_start_sample_us = ggml_time_us();
  6992. // Compute error as the difference between observed surprise and target surprise value
  6993. size_t X_idx = std::distance(candidates->data, std::find_if(candidates->data, candidates->data + candidates->size, [&](const llama_token_data & candidate) {
  6994. return candidate.id == X;
  6995. }));
  6996. float observed_surprise = -log2f(candidates->data[X_idx].p);
  6997. float e = observed_surprise - tau;
  6998. // Update mu using the learning rate and error
  6999. *mu = *mu - eta * e;
  7000. if (ctx) {
  7001. ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
  7002. }
  7003. return X;
  7004. }
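//
// [editor's sketch, not part of the original source] Mirostat keeps its state in the
// caller-owned mu value, conventionally initialised to 2 * tau before the first call and
// then passed back in on every step. example_sample_mirostat_v2 is a hypothetical helper
// that rebuilds the candidate array from the current logits each time.
static llama_token example_sample_mirostat_v2(struct llama_context * ctx, float tau, float eta, float * mu) {
    const float * logits  = llama_get_logits(ctx);
    const int     n_vocab = llama_n_vocab(llama_get_model(ctx));
    std::vector<llama_token_data> cur;
    cur.reserve(n_vocab);
    for (llama_token id = 0; id < n_vocab; ++id) {
        cur.push_back(llama_token_data{ id, logits[id], 0.0f });
    }
    llama_token_data_array cur_p = { cur.data(), cur.size(), false };
    return llama_sample_token_mirostat_v2(ctx, &cur_p, tau, eta, mu); // updates *mu in place
}
//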
  7005. llama_token llama_sample_token_greedy(struct llama_context * ctx, llama_token_data_array * candidates) {
  7006. const int64_t t_start_sample_us = ggml_time_us();
  7007. // Find max element
  7008. auto * max_iter = std::max_element(candidates->data, candidates->data + candidates->size, [](const llama_token_data & a, const llama_token_data & b) {
  7009. return a.logit < b.logit;
  7010. });
  7011. llama_token result = max_iter->id;
  7012. if (ctx) {
  7013. ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
  7014. ctx->n_sample++;
  7015. }
  7016. return result;
  7017. }
  7018. llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_array * candidates) {
  7019. GGML_ASSERT(ctx);
  7020. const int64_t t_start_sample_us = ggml_time_us();
  7021. llama_sample_softmax(nullptr, candidates);
  7022. std::vector<float> probs;
  7023. probs.reserve(candidates->size);
  7024. for (size_t i = 0; i < candidates->size; ++i) {
  7025. probs.push_back(candidates->data[i].p);
  7026. }
  7027. std::discrete_distribution<> dist(probs.begin(), probs.end());
  7028. auto & rng = ctx->rng;
  7029. int idx = dist(rng);
  7030. llama_token result = candidates->data[idx].id;
  7031. ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
  7032. ctx->n_sample++;
  7033. return result;
  7034. }
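//
// [editor's sketch, not part of the original source] A common way to combine the two
// token pickers above: fall back to the deterministic argmax when temperature is
// disabled, otherwise scale the logits and draw from the softmax. example_pick_token is
// hypothetical.
static llama_token example_pick_token(struct llama_context * ctx, llama_token_data_array * cur_p, float temp) {
    if (temp <= 0.0f) {
        return llama_sample_token_greedy(ctx, cur_p); // argmax over the raw logits
    }
    llama_sample_temp(ctx, cur_p, temp);              // sharpen/flatten the distribution
    return llama_sample_token(ctx, cur_p);            // stochastic draw via softmax + RNG
}
//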
  7035. void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar * grammar, llama_token token) {
  7036. const int64_t t_start_sample_us = ggml_time_us();
  7037. if (token == llama_token_eos(&ctx->model)) {
  7038. for (const auto & stack : grammar->stacks) {
  7039. if (stack.empty()) {
  7040. return;
  7041. }
  7042. }
  7043. GGML_ASSERT(false);
  7044. }
  7045. const std::string piece = llama_token_to_piece(ctx, token);
7046. // Note: decode_utf8 appends a terminating 0 to the code points, hence the end() - 1 in the loop below
  7047. const auto decoded = decode_utf8(piece, grammar->partial_utf8);
  7048. const auto & code_points = decoded.first;
  7049. for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) {
  7050. grammar->stacks = llama_grammar_accept(grammar->rules, grammar->stacks, *it);
  7051. }
  7052. grammar->partial_utf8 = decoded.second;
  7053. GGML_ASSERT(!grammar->stacks.empty());
  7054. ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
  7055. }
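//
// [editor's sketch, not part of the original source] Grammar-constrained decoding pairs
// the two functions above: mask the candidates with llama_sample_grammar() before picking
// a token, then advance the grammar state with llama_grammar_accept_token().
// example_sample_with_grammar is hypothetical.
static llama_token example_sample_with_grammar(
        struct llama_context * ctx,
        llama_token_data_array * cur_p,
        struct llama_grammar * grammar) {
    llama_sample_grammar(ctx, cur_p, grammar);    // set rejected tokens to -INFINITY
    const llama_token id = llama_sample_token(ctx, cur_p);
    llama_grammar_accept_token(ctx, grammar, id); // push the chosen token through the grammar
    return id;
}
//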
  7056. //
  7057. // Beam search
  7058. //
  7059. struct llama_beam {
  7060. std::vector<llama_token> tokens;
  7061. float p; // Cumulative beam probability (renormalized relative to all beams)
  7062. bool eob; // Initialize end-of-beam to false. Callback sets this to true.
  7063. // Sort beams by probability. In case of ties, prefer beams at eob.
  7064. bool operator<(const llama_beam & rhs) const {
  7065. return std::make_pair(p, eob) < std::make_pair(rhs.p, rhs.eob);
  7066. }
  7067. // Shift off first n tokens and discard them.
  7068. void shift_tokens(const size_t n) {
  7069. if (n) {
  7070. std::copy(tokens.begin() + n, tokens.end(), tokens.begin());
  7071. tokens.resize(tokens.size() - n);
  7072. }
  7073. }
  7074. llama_beam_view view() const { return {tokens.data(), tokens.size(), p, eob}; }
  7075. };
  7076. // A struct for calculating logit-related info.
  7077. struct llama_logit_info {
  7078. const float * const logits;
  7079. const int n_vocab;
  7080. const float max_l;
  7081. const float normalizer;
  7082. struct sum_exp {
  7083. float max_l;
  7084. float operator()(float sum, float l) const { return sum + std::exp(l - max_l); }
  7085. };
  7086. llama_logit_info(llama_context * ctx)
  7087. : logits(llama_get_logits(ctx))
  7088. , n_vocab(llama_n_vocab(llama_get_model(ctx)))
  7089. , max_l(*std::max_element(logits, logits + n_vocab))
  7090. , normalizer(1.0f / std::accumulate(logits, logits + n_vocab, 0.0f, sum_exp{max_l}))
  7091. { }
  7092. llama_token_data get_token_data(const llama_token token_id) const {
  7093. constexpr auto p = std::numeric_limits<float>::quiet_NaN(); // never used
  7094. return {token_id, logits[token_id], p};
  7095. }
  7096. // Return top k token_data by logit.
  7097. std::vector<llama_token_data> top_k(size_t k) {
  7098. std::vector<llama_token_data> min_heap; // min-heap by logit
  7099. const llama_token k_min = std::min(static_cast<llama_token>(k), n_vocab);
  7100. min_heap.reserve(k_min);
  7101. for (llama_token token_id = 0 ; token_id < k_min ; ++token_id) {
  7102. min_heap.push_back(get_token_data(token_id));
  7103. }
  7104. auto comp = [](const llama_token_data & a, const llama_token_data & b) { return a.logit > b.logit; };
  7105. std::make_heap(min_heap.begin(), min_heap.end(), comp);
  7106. for (llama_token token_id = k_min ; token_id < n_vocab ; ++token_id) {
  7107. if (min_heap.front().logit < logits[token_id]) {
  7108. std::pop_heap(min_heap.begin(), min_heap.end(), comp);
  7109. min_heap.back().id = token_id;
  7110. min_heap.back().logit = logits[token_id];
  7111. std::push_heap(min_heap.begin(), min_heap.end(), comp);
  7112. }
  7113. }
  7114. return min_heap;
  7115. }
  7116. float probability_from_logit(float logit) const {
  7117. return normalizer * std::exp(logit - max_l);
  7118. }
  7119. };
  7120. struct llama_beam_search_data {
  7121. llama_context * ctx;
  7122. size_t n_beams;
  7123. int n_past;
  7124. int n_predict;
  7125. std::vector<llama_beam> beams;
  7126. std::vector<llama_beam> next_beams;
  7127. // Re-calculated on each loop iteration
  7128. size_t common_prefix_length;
  7129. // Used to communicate to/from callback on beams state.
  7130. std::vector<llama_beam_view> beam_views;
  7131. llama_beam_search_data(llama_context * ctx, size_t n_beams, int n_past, int n_predict)
  7132. : ctx(ctx)
  7133. , n_beams(n_beams)
  7134. , n_past(n_past)
  7135. , n_predict(n_predict)
  7136. , beam_views(n_beams) {
  7137. beams.reserve(n_beams);
  7138. next_beams.reserve(n_beams);
  7139. }
  7140. // Collapse beams to a single beam given by index.
  7141. void collapse_beams(const size_t beam_idx) {
  7142. if (0u < beam_idx) {
  7143. std::swap(beams[0], beams[beam_idx]);
  7144. }
  7145. beams.resize(1);
  7146. }
  7147. // Min-heaps are used to efficiently collect the top-k elements (k=n_beams).
  7148. // The repetitive patterns below reflect the 2 stages of heaps:
  7149. // * Gather elements until the vector is full, then call std::make_heap() on it.
  7150. // * If the heap is full and a new element is found that should be included, pop the
  7151. // least element to the back(), replace it with the new, then push it into the heap.
  7152. void fill_next_beams_by_top_probabilities(llama_beam & beam) {
  7153. // Min-heaps use a greater-than comparator.
  7154. const auto comp = [](const llama_beam & a, const llama_beam & b) { return a.p > b.p; };
  7155. if (beam.eob) {
  7156. // beam is at end-of-sentence, so just copy it to next_beams if its probability is high enough.
  7157. if (next_beams.size() < n_beams) {
  7158. next_beams.push_back(std::move(beam));
  7159. if (next_beams.size() == n_beams) {
  7160. std::make_heap(next_beams.begin(), next_beams.end(), comp);
  7161. }
  7162. } else if (next_beams.front().p < beam.p) {
  7163. std::pop_heap(next_beams.begin(), next_beams.end(), comp);
  7164. next_beams.back() = std::move(beam);
  7165. std::push_heap(next_beams.begin(), next_beams.end(), comp);
  7166. }
  7167. } else {
  7168. // beam is not at end-of-sentence, so branch with next top_k tokens.
  7169. if (!beam.tokens.empty()) {
  7170. llama_decode(ctx, llama_batch_get_one(beam.tokens.data(), beam.tokens.size(), n_past, 0));
  7171. }
  7172. llama_logit_info logit_info(ctx);
  7173. std::vector<llama_token_data> next_tokens = logit_info.top_k(n_beams);
  7174. size_t i=0;
  7175. if (next_beams.size() < n_beams) {
  7176. for (; next_beams.size() < n_beams ; ++i) {
  7177. llama_beam next_beam = beam;
  7178. next_beam.tokens.push_back(next_tokens[i].id);
  7179. next_beam.p *= logit_info.probability_from_logit(next_tokens[i].logit);
  7180. next_beams.push_back(std::move(next_beam));
  7181. }
  7182. std::make_heap(next_beams.begin(), next_beams.end(), comp);
  7183. } else {
  7184. for (; next_beams.front().p == 0.0f ; ++i) {
  7185. std::pop_heap(next_beams.begin(), next_beams.end(), comp);
  7186. next_beams.back() = beam;
  7187. next_beams.back().tokens.push_back(next_tokens[i].id);
  7188. next_beams.back().p *= logit_info.probability_from_logit(next_tokens[i].logit);
  7189. std::push_heap(next_beams.begin(), next_beams.end(), comp);
  7190. }
  7191. }
  7192. for (; i < n_beams ; ++i) {
  7193. const float next_p = beam.p * logit_info.probability_from_logit(next_tokens[i].logit);
  7194. if (next_beams.front().p < next_p) {
  7195. std::pop_heap(next_beams.begin(), next_beams.end(), comp);
  7196. next_beams.back() = beam;
  7197. next_beams.back().tokens.push_back(next_tokens[i].id);
  7198. next_beams.back().p = next_p;
  7199. std::push_heap(next_beams.begin(), next_beams.end(), comp);
  7200. }
  7201. }
  7202. }
  7203. }
  7204. // Find common_prefix_length based on beams.
  7205. // Requires beams is not empty.
  7206. size_t find_common_prefix_length() {
  7207. size_t common_prefix_length = beams[0].tokens.size();
  7208. for (size_t i = 1 ; i < beams.size() ; ++i) {
  7209. common_prefix_length = std::min(common_prefix_length, beams[i].tokens.size());
  7210. for (size_t j = 0 ; j < common_prefix_length ; ++j) {
  7211. if (beams[0].tokens[j] != beams[i].tokens[j]) {
  7212. common_prefix_length = j;
  7213. break;
  7214. }
  7215. }
  7216. }
  7217. return common_prefix_length;
  7218. }
  7219. // Construct beams_state to send back to caller via the callback function.
  7220. // Side effect: set common_prefix_length = find_common_prefix_length();
  7221. llama_beams_state get_beams_state(const bool last_call) {
  7222. for (size_t i = 0 ; i < beams.size() ; ++i) {
  7223. beam_views[i] = beams[i].view();
  7224. }
  7225. common_prefix_length = find_common_prefix_length();
  7226. return {beam_views.data(), beams.size(), common_prefix_length, last_call};
  7227. }
  7228. // Loop:
  7229. // * while i < n_predict, AND
  7230. // * any of the beams have not yet reached end-of-beam (eob), AND
  7231. // * the highest probability beam(s) (plural in case of ties) are not at end-of-sentence
  7232. // (since all other beam probabilities can only decrease)
  7233. void loop(const llama_beam_search_callback_fn_t callback, void * const callback_data) {
  7234. beams.push_back({{}, 1.0f, false}); // Start with one empty beam w/ probability = 1.0 and !eob.
  7235. const auto not_eob = [](const llama_beam & beam) { return !beam.eob; };
  7236. for (int i = 0 ; i < n_predict && std::any_of(beams.begin(),beams.end(),not_eob) &&
  7237. !beams[top_beam_index()].eob ; ++i) {
  7238. callback(callback_data, get_beams_state(false)); // Sets common_prefix_length
  7239. update_beams_from_beam_views(); // Update values (p,eob) that callback may have changed.
  7240. if (common_prefix_length) {
  7241. llama_decode(ctx, llama_batch_get_one(beams[0].tokens.data(), common_prefix_length, n_past, 0));
  7242. n_past += common_prefix_length;
  7243. }
  7244. // Zero-out next_beam probabilities to place them last in following min-heap.
  7245. std::for_each(next_beams.begin(), next_beams.end(), [](llama_beam & beam) { beam.p = 0.0f; });
  7246. for (llama_beam & beam : beams) {
  7247. beam.shift_tokens(common_prefix_length);
  7248. fill_next_beams_by_top_probabilities(beam);
  7249. }
  7250. // next_beams become the beams of next/final iteration. Swap them to re-use memory.
  7251. beams.swap(next_beams);
  7252. renormalize_beam_probabilities(beams);
  7253. }
  7254. collapse_beams(top_beam_index());
  7255. callback(callback_data, get_beams_state(true));
  7256. }
  7257. // As beams grow, the cumulative probabilities decrease.
  7258. // Renormalize them to avoid floating point underflow.
  7259. static void renormalize_beam_probabilities(std::vector<llama_beam> & beams) {
  7260. const auto sum_p = [](float sum, llama_beam & beam) { return sum + beam.p; };
  7261. const float inv_sum = 1.0f / std::accumulate(beams.begin(), beams.end(), 0.0f, sum_p);
  7262. std::for_each(beams.begin(), beams.end(), [=](llama_beam & beam) { beam.p *= inv_sum; });
  7263. }
  7264. // Assumes beams is non-empty. Uses llama_beam::operator<() for ordering.
  7265. size_t top_beam_index() {
  7266. return std::max_element(beams.begin(), beams.end()) - beams.begin();
  7267. }
  7268. // Copy (p,eob) for each beam which may have been changed by the callback.
  7269. void update_beams_from_beam_views() {
  7270. for (size_t i = 0 ; i < beams.size() ; ++i) {
  7271. beams[i].p = beam_views[i].p;
  7272. beams[i].eob = beam_views[i].eob;
  7273. }
  7274. }
  7275. };
  7276. void llama_beam_search(llama_context * ctx,
  7277. llama_beam_search_callback_fn_t callback, void * callback_data,
  7278. size_t n_beams, int n_past, int n_predict) {
  7279. assert(ctx);
  7280. const int64_t t_start_sample_us = ggml_time_us();
  7281. llama_beam_search_data beam_search_data(ctx, n_beams, n_past, n_predict);
  7282. beam_search_data.loop(callback, callback_data);
  7283. ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
  7284. ctx->n_sample++;
  7285. }
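//
// [editor's sketch, not part of the original source] A minimal beam-search callback
// following the contract described above: commit tokens shared by all beams (the common
// prefix, which the search shifts off afterwards) and flag beams whose last token is EOS.
// The names example_beam_state and example_beam_callback are hypothetical.
struct example_beam_state {
    const llama_model * model;
    std::vector<llama_token> response;
};
static void example_beam_callback(void * data, llama_beams_state state) {
    auto & s = *static_cast<example_beam_state *>(data);
    if (state.common_prefix_length > 0) {
        // tokens agreed on by every beam can be appended to the response immediately
        const llama_beam_view & v = state.beam_views[0];
        s.response.insert(s.response.end(), v.tokens, v.tokens + state.common_prefix_length);
    }
    for (size_t i = 0; i < state.n_beams; ++i) {
        llama_beam_view & bv = state.beam_views[i];
        if (!bv.eob && bv.n_tokens > 0 && bv.tokens[bv.n_tokens - 1] == llama_token_eos(s.model)) {
            bv.eob = true; // copied back by update_beams_from_beam_views()
        }
    }
}
// usage sketch: example_beam_state st{ llama_get_model(ctx), {} };
//               llama_beam_search(ctx, example_beam_callback, &st, /*n_beams*/ 4, n_past, n_predict);
//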
  7286. //
  7287. // quantization
  7288. //
  7289. struct quantize_state_internal {
  7290. const llama_model & model;
  7291. const llama_model_quantize_params * params;
  7292. int n_attention_wv = 0;
  7293. int n_feed_forward_w2 = 0;
  7294. int i_attention_wv = 0;
  7295. int i_feed_forward_w2 = 0;
  7296. int n_k_quantized = 0;
  7297. int n_fallback = 0;
  7298. quantize_state_internal(const llama_model & model, const llama_model_quantize_params * params)
  7299. : model(model)
  7300. , params(params)
  7301. {}
  7302. };
  7303. static void llama_convert_tensor_internal(
  7304. struct ggml_tensor * tensor, std::vector<no_init<float>> & output, std::vector<std::thread> & workers,
  7305. const size_t nelements, const int nthread
  7306. ) {
  7307. if (output.size() < nelements) {
  7308. output.resize(nelements);
  7309. }
  7310. float * f32_output = (float *) output.data();
  7311. ggml_type_traits_t qtype;
  7312. if (ggml_is_quantized(tensor->type)) {
  7313. qtype = ggml_internal_get_type_traits(tensor->type);
  7314. if (qtype.to_float == NULL) {
  7315. throw std::runtime_error(format("type %s unsupported for integer quantization: no dequantization available", ggml_type_name(tensor->type)));
  7316. }
  7317. } else if (tensor->type != GGML_TYPE_F16) {
  7318. throw std::runtime_error(format("cannot dequantize/convert tensor type %s", ggml_type_name(tensor->type)));
  7319. }
  7320. if (nthread < 2) {
  7321. if (tensor->type == GGML_TYPE_F16) {
  7322. ggml_fp16_to_fp32_row((ggml_fp16_t *)tensor->data, f32_output, nelements);
  7323. } else if (ggml_is_quantized(tensor->type)) {
  7324. qtype.to_float(tensor->data, f32_output, nelements);
  7325. } else {
  7326. GGML_ASSERT(false); // unreachable
  7327. }
  7328. return;
  7329. }
  7330. size_t block_size = tensor->type == GGML_TYPE_F16 ? 1 : (size_t)ggml_blck_size(tensor->type);
  7331. size_t block_size_bytes = ggml_type_size(tensor->type);
  7332. GGML_ASSERT(nelements % block_size == 0);
  7333. size_t nblocks = nelements / block_size;
  7334. size_t blocks_per_thread = nblocks / nthread;
  7335. size_t spare_blocks = nblocks - (blocks_per_thread * nthread); // if blocks aren't divisible by thread count
  7336. size_t in_buff_offs = 0;
  7337. size_t out_buff_offs = 0;
  7338. for (int tnum = 0; tnum < nthread; tnum++) {
  7339. size_t thr_blocks = blocks_per_thread + (tnum == nthread - 1 ? spare_blocks : 0); // num blocks for this thread
  7340. size_t thr_elems = thr_blocks * block_size; // number of elements for this thread
  7341. size_t thr_block_bytes = thr_blocks * block_size_bytes; // number of input bytes for this thread
  7342. auto compute = [qtype] (ggml_type typ, uint8_t * inbuf, float * outbuf, int nels) {
  7343. if (typ == GGML_TYPE_F16) {
  7344. ggml_fp16_to_fp32_row((ggml_fp16_t *)inbuf, outbuf, nels);
  7345. } else {
  7346. qtype.to_float(inbuf, outbuf, nels);
  7347. }
  7348. };
  7349. workers.emplace_back(compute, tensor->type, (uint8_t *) tensor->data + in_buff_offs, f32_output + out_buff_offs, thr_elems);
  7350. in_buff_offs += thr_block_bytes;
  7351. out_buff_offs += thr_elems;
  7352. }
  7353. for (auto & w : workers) { w.join(); }
  7354. workers.clear();
  7355. }
  7356. static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype) {
  7357. const std::string name = ggml_get_name(tensor);
  7358. // TODO: avoid hardcoded tensor names - use the TN_* constants
  7359. const llm_arch arch = qs.model.arch;
  7360. const auto tn = LLM_TN(arch);
  7361. auto use_more_bits = [](int i_layer, int num_layers) -> bool {
  7362. return i_layer < num_layers/8 || i_layer >= 7*num_layers/8 || (i_layer - num_layers/8)%3 == 2;
  7363. };
  7364. if (name == tn(LLM_TENSOR_OUTPUT, "weight")) {
  7365. int nx = tensor->ne[0];
  7366. if (arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
  7367. new_type = GGML_TYPE_Q8_0;
  7368. }
  7369. else if (new_type != GGML_TYPE_Q8_0) {
  7370. new_type = GGML_TYPE_Q6_K;
  7371. }
  7372. } else if (name.find("attn_v.weight") != std::string::npos) {
  7373. if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
  7374. else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
  7375. new_type = qs.i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
  7376. }
  7377. else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
  7378. else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
  7379. use_more_bits(qs.i_attention_wv, qs.n_attention_wv)) new_type = GGML_TYPE_Q6_K;
  7380. else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && qs.i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
  7381. else if (QK_K == 64 && (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) &&
  7382. (qs.i_attention_wv < qs.n_attention_wv/8 || qs.i_attention_wv >= 7*qs.n_attention_wv/8)) new_type = GGML_TYPE_Q6_K;
  7383. if (qs.model.type == MODEL_70B) {
  7384. // In the 70B model we have 8 heads sharing the same attn_v weights. As a result, the attn_v.weight tensor is
  7385. // 8x smaller compared to attn_q.weight. Hence, we can get a nice boost in quantization accuracy with
  7386. // nearly negligible increase in model size by quantizing this tensor with more bits:
  7387. if (new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K) new_type = GGML_TYPE_Q5_K;
  7388. }
  7389. if (qs.model.hparams.n_expert == 8) {
  7390. // for the 8-expert model, bumping this to Q8_0 trades just ~128MB
  7391. // TODO: explore better strategies
  7392. new_type = GGML_TYPE_Q8_0;
  7393. }
  7394. ++qs.i_attention_wv;
  7395. } else if (name.find("attn_k.weight") != std::string::npos) {
  7396. if (qs.model.hparams.n_expert == 8) {
  7397. // for the 8-expert model, bumping this to Q8_0 trades just ~128MB
  7398. // TODO: explore better strategies
  7399. new_type = GGML_TYPE_Q8_0;
  7400. }
  7401. } else if (name.find("ffn_down") != std::string::npos) {
  7402. if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
  7403. else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S) {
  7404. if (qs.i_feed_forward_w2 < qs.n_feed_forward_w2/8) new_type = GGML_TYPE_Q4_K;
  7405. }
  7406. else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
  7407. new_type = qs.i_feed_forward_w2 < qs.n_feed_forward_w2/16 ? GGML_TYPE_Q5_K
  7408. : arch != LLM_ARCH_FALCON || use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2) ? GGML_TYPE_Q4_K
  7409. : GGML_TYPE_Q3_K;
  7410. }
  7411. else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
  7412. new_type = arch == LLM_ARCH_FALCON ? GGML_TYPE_Q4_K : GGML_TYPE_Q5_K;
  7413. }
  7414. else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
  7415. if (arch == LLM_ARCH_FALCON) {
  7416. new_type = qs.i_feed_forward_w2 < qs.n_feed_forward_w2/16 ? GGML_TYPE_Q6_K :
  7417. use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
  7418. } else {
  7419. if (use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
  7420. }
  7421. }
  7422. else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
  7423. else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && arch != LLM_ARCH_FALCON && qs.i_feed_forward_w2 < qs.n_feed_forward_w2/8) {
  7424. new_type = GGML_TYPE_Q5_K;
  7425. }
  7426. ++qs.i_feed_forward_w2;
  7427. } else if (name.find("attn_output.weight") != std::string::npos) {
  7428. if (arch != LLM_ARCH_FALCON) {
  7429. if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K ) new_type = GGML_TYPE_Q3_K;
  7430. else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) new_type = GGML_TYPE_Q4_K;
  7431. else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
  7432. } else {
  7433. if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K;
  7434. }
  7435. }
  7436. else if (name.find("attn_qkv.weight") != std::string::npos) {
  7437. if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K;
  7438. else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K;
  7439. else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
  7440. }
  7441. // IK: let's remove this, else Q2_K is almost the same as Q3_K_S
  7442. //else if (name.find("ffn_gate") != std::string::npos || name.find("ffn_up") != std::string::npos) {
  7443. // if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
  7444. //}
  7445. // This can be used to reduce the size of the Q5_K_S model.
  7446. // The associated PPL increase is fully in line with the size reduction
  7447. //else {
  7448. // if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q4_K;
  7449. //}
  7450. bool convert_incompatible_tensor = false;
  7451. if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K ||
  7452. new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K) {
  7453. int nx = tensor->ne[0];
  7454. int ny = tensor->ne[1];
  7455. if (nx % QK_K != 0) {
  7456. LLAMA_LOG_WARN("\n\n%s : tensor cols %d x %d are not divisible by %d, required for %s", __func__, nx, ny, QK_K, ggml_type_name(new_type));
  7457. convert_incompatible_tensor = true;
  7458. } else {
  7459. ++qs.n_k_quantized;
  7460. }
  7461. }
  7462. if (convert_incompatible_tensor) {
  7463. switch (new_type) {
  7464. case GGML_TYPE_Q2_K: new_type = GGML_TYPE_Q4_0; break;
  7465. case GGML_TYPE_Q3_K: new_type = GGML_TYPE_Q4_1; break;
  7466. case GGML_TYPE_Q4_K: new_type = GGML_TYPE_Q5_0; break;
  7467. case GGML_TYPE_Q5_K: new_type = GGML_TYPE_Q5_1; break;
  7468. case GGML_TYPE_Q6_K: new_type = GGML_TYPE_Q8_0; break;
  7469. default: throw std::runtime_error("\nUnsupported tensor size encountered\n");
  7470. }
  7471. LLAMA_LOG_WARN(" - using fallback quantization %s\n", ggml_type_name(new_type));
  7472. ++qs.n_fallback;
  7473. }
  7474. return new_type;
  7475. }
  7476. static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) {
  7477. ggml_type quantized_type;
  7478. llama_ftype ftype = params->ftype;
  7479. switch (params->ftype) {
  7480. case LLAMA_FTYPE_MOSTLY_Q4_0: quantized_type = GGML_TYPE_Q4_0; break;
  7481. case LLAMA_FTYPE_MOSTLY_Q4_1: quantized_type = GGML_TYPE_Q4_1; break;
  7482. case LLAMA_FTYPE_MOSTLY_Q5_0: quantized_type = GGML_TYPE_Q5_0; break;
  7483. case LLAMA_FTYPE_MOSTLY_Q5_1: quantized_type = GGML_TYPE_Q5_1; break;
  7484. case LLAMA_FTYPE_MOSTLY_Q8_0: quantized_type = GGML_TYPE_Q8_0; break;
  7485. case LLAMA_FTYPE_MOSTLY_F16: quantized_type = GGML_TYPE_F16; break;
  7486. case LLAMA_FTYPE_ALL_F32: quantized_type = GGML_TYPE_F32; break;
  7487. // K-quants
  7488. case LLAMA_FTYPE_MOSTLY_Q2_K: quantized_type = GGML_TYPE_Q2_K; break;
  7489. case LLAMA_FTYPE_MOSTLY_Q2_K_S: quantized_type = GGML_TYPE_Q2_K; break;
  7490. case LLAMA_FTYPE_MOSTLY_Q3_K_S:
  7491. case LLAMA_FTYPE_MOSTLY_Q3_K_M:
  7492. case LLAMA_FTYPE_MOSTLY_Q3_K_L: quantized_type = GGML_TYPE_Q3_K; break;
  7493. case LLAMA_FTYPE_MOSTLY_Q4_K_S:
  7494. case LLAMA_FTYPE_MOSTLY_Q4_K_M: quantized_type = GGML_TYPE_Q4_K; break;
  7495. case LLAMA_FTYPE_MOSTLY_Q5_K_S:
  7496. case LLAMA_FTYPE_MOSTLY_Q5_K_M: quantized_type = GGML_TYPE_Q5_K; break;
  7497. case LLAMA_FTYPE_MOSTLY_Q6_K: quantized_type = GGML_TYPE_Q6_K; break;
  7498. case LLAMA_FTYPE_MOSTLY_IQ2_XXS:quantized_type = GGML_TYPE_IQ2_XXS; break;
  7499. case LLAMA_FTYPE_MOSTLY_IQ2_XS :quantized_type = GGML_TYPE_IQ2_XS; break;
  7500. default: throw std::runtime_error(format("invalid output file type %d\n", ftype));
  7501. }
  7502. int nthread = params->nthread;
  7503. if (nthread <= 0) {
  7504. nthread = std::thread::hardware_concurrency();
  7505. }
7506. // mmap consistently increases speed on Linux, and also on Windows with a hot cache.
7507. // It may cause a slowdown on macOS, possibly related to free memory.
  7508. #if defined(__linux__) || defined(_WIN32)
  7509. constexpr bool use_mmap = true;
  7510. #else
  7511. constexpr bool use_mmap = false;
  7512. #endif
  7513. llama_model_loader ml(fname_inp, use_mmap, NULL);
  7514. ml.init_mapping(false); // no prefetching?
  7515. llama_model model;
  7516. llm_load_arch(ml, model);
  7517. llm_load_hparams(ml, model);
  7518. struct quantize_state_internal qs(model, params);
  7519. if (params->only_copy) {
  7520. ftype = model.ftype;
  7521. }
  7522. const size_t align = GGUF_DEFAULT_ALIGNMENT;
  7523. struct gguf_context * ctx_out = gguf_init_empty();
  7524. // copy the KV pairs from the input file
  7525. gguf_set_kv (ctx_out, ml.ctx_gguf);
  7526. gguf_set_val_u32(ctx_out, "general.quantization_version", GGML_QNT_VERSION);
  7527. gguf_set_val_u32(ctx_out, "general.file_type", ftype);
  7528. for (int i = 0; i < ml.n_tensors; ++i) {
  7529. struct ggml_tensor * meta = ml.get_tensor_meta(i);
  7530. const std::string name = ggml_get_name(meta);
  7531. // TODO: avoid hardcoded tensor names - use the TN_* constants
  7532. if (name.find("attn_v.weight") != std::string::npos || name.find("attn_qkv.weight") != std::string::npos) {
  7533. ++qs.n_attention_wv;
  7534. }
  7535. else if (name.find("ffn_down") != std::string::npos) {
  7536. ++qs.n_feed_forward_w2;
  7537. }
  7538. }
  7539. if (qs.n_attention_wv != qs.n_feed_forward_w2 || (uint32_t)qs.n_attention_wv != model.hparams.n_layer) {
  7540. LLAMA_LOG_WARN("%s ============ Strange model: n_attention_wv = %d, n_feed_forward_w2 = %d, hparams.n_layer = %d\n",
  7541. __func__, qs.n_attention_wv, qs.n_feed_forward_w2, model.hparams.n_layer);
  7542. }
  7543. size_t total_size_org = 0;
  7544. size_t total_size_new = 0;
  7545. std::vector<int64_t> hist_all(1 << 4, 0);
  7546. std::vector<std::thread> workers;
  7547. workers.reserve(nthread);
  7548. std::mutex mutex;
  7549. int idx = 0;
  7550. std::vector<no_init<uint8_t>> read_data;
  7551. std::vector<no_init<uint8_t>> work;
  7552. std::vector<no_init<float>> f32_conv_buf;
7553. // populate the original tensors so we get an initial set of metadata
  7554. for (int i = 0; i < ml.n_tensors; ++i) {
  7555. struct ggml_tensor * meta = ml.get_tensor_meta(i);
  7556. gguf_add_tensor(ctx_out, meta);
  7557. }
  7558. std::ofstream fout(fname_out, std::ios::binary);
  7559. fout.exceptions(std::ofstream::failbit); // fail fast on write errors
  7560. const size_t meta_size = gguf_get_meta_size(ctx_out);
  7561. LLAMA_LOG_INFO("%s: meta size = %zu bytes\n", __func__, meta_size);
  7562. // placeholder for the meta data
  7563. ::zeros(fout, meta_size);
  7564. for (int i = 0; i < ml.n_tensors; ++i) {
  7565. struct ggml_tensor * tensor = ml.get_tensor_meta(i);
  7566. const std::string name = ggml_get_name(tensor);
  7567. if (!ml.use_mmap) {
  7568. if (read_data.size() < ggml_nbytes(tensor)) {
  7569. read_data.resize(ggml_nbytes(tensor));
  7570. }
  7571. tensor->data = read_data.data();
  7572. }
  7573. ml.load_data_for(tensor);
  7574. LLAMA_LOG_INFO("[%4d/%4d] %36s - [%s], type = %6s, ",
  7575. ++idx, ml.n_tensors,
  7576. ggml_get_name(tensor),
  7577. llama_format_tensor_shape(tensor).c_str(),
  7578. ggml_type_name(tensor->type));
  7579. // This used to be a regex, but <regex> has an extreme cost to compile times.
  7580. bool quantize = name.rfind("weight") == name.size() - 6; // ends with 'weight'?
  7581. // quantize only 2D tensors
  7582. quantize &= (ggml_n_dims(tensor) == 2);
  7583. quantize &= params->quantize_output_tensor || name != "output.weight";
  7584. quantize &= !params->only_copy;
  7585. // do not quantize expert gating tensors
  7586. quantize &= name.find("ffn_gate_inp.weight") == std::string::npos;
  7587. enum ggml_type new_type;
  7588. void * new_data;
  7589. size_t new_size;
  7590. if (quantize) {
  7591. new_type = quantized_type;
  7592. if (!params->pure) {
  7593. new_type = get_k_quant_type(qs, new_type, tensor, ftype);
  7594. }
  7595. // If we've decided to quantize to the same type the tensor is already
  7596. // in then there's nothing to do.
  7597. quantize = tensor->type != new_type;
  7598. }
  7599. if (!quantize) {
  7600. new_type = tensor->type;
  7601. new_data = tensor->data;
  7602. new_size = ggml_nbytes(tensor);
  7603. LLAMA_LOG_INFO("size = %8.3f MB\n", ggml_nbytes(tensor)/1024.0/1024.0);
  7604. } else {
  7605. const size_t nelements = ggml_nelements(tensor);
  7606. float * f32_data;
  7607. if (tensor->type == GGML_TYPE_F32) {
  7608. f32_data = (float *) tensor->data;
  7609. } else if (ggml_is_quantized(tensor->type) && !params->allow_requantize) {
  7610. throw std::runtime_error(format("requantizing from type %s is disabled", ggml_type_name(tensor->type)));
  7611. } else {
  7612. llama_convert_tensor_internal(tensor, f32_conv_buf, workers, nelements, nthread);
  7613. f32_data = (float *) f32_conv_buf.data();
  7614. }
  7615. LLAMA_LOG_INFO("quantizing to %s .. ", ggml_type_name(new_type));
  7616. fflush(stdout);
  7617. if (work.size() < nelements * 4) {
  7618. work.resize(nelements * 4); // upper bound on size
  7619. }
  7620. new_data = work.data();
  7621. std::array<int64_t, 1 << 4> hist_cur = {};
  7622. static const int chunk_size = 32 * 512;
  7623. const int nchunk = (nelements + chunk_size - 1)/chunk_size;
  7624. const int nthread_use = nthread > 1 ? std::max(1, std::min(nthread, nchunk)) : 1;
  7625. if (nthread_use < 2) {
  7626. new_size = ggml_quantize_chunk(new_type, f32_data, new_data, 0, nelements, hist_cur.data());
  7627. } else {
  7628. size_t counter = 0;
  7629. new_size = 0;
  7630. auto compute = [&mutex, &counter, &hist_cur, &new_size, new_type, f32_data, new_data, nelements]() {
  7631. std::array<int64_t, 1 << 4> local_hist = {};
  7632. size_t local_size = 0;
  7633. while (true) {
  7634. std::unique_lock<std::mutex> lock(mutex);
  7635. size_t first = counter; counter += chunk_size;
  7636. if (first >= nelements) {
  7637. if (local_size > 0) {
  7638. for (int j=0; j<int(local_hist.size()); ++j) {
  7639. hist_cur[j] += local_hist[j];
  7640. }
  7641. new_size += local_size;
  7642. }
  7643. break;
  7644. }
  7645. lock.unlock();
  7646. size_t last = std::min(nelements, first + chunk_size);
  7647. local_size += ggml_quantize_chunk(new_type, f32_data, new_data, first, last - first, local_hist.data());
  7648. }
  7649. };
  7650. for (int it = 0; it < nthread_use - 1; ++it) {
  7651. workers.emplace_back(compute);
  7652. }
  7653. compute();
  7654. for (auto & w : workers) { w.join(); }
  7655. workers.clear();
  7656. }
  7657. LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB | hist: ", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0);
  7658. int64_t tot_count = 0;
  7659. for (size_t i = 0; i < hist_cur.size(); i++) {
  7660. hist_all[i] += hist_cur[i];
  7661. tot_count += hist_cur[i];
  7662. }
  7663. if (tot_count > 0) {
  7664. for (size_t i = 0; i < hist_cur.size(); i++) {
  7665. LLAMA_LOG_INFO("%5.3f ", hist_cur[i] / float(nelements));
  7666. }
  7667. }
  7668. LLAMA_LOG_INFO("\n");
  7669. }
  7670. total_size_org += ggml_nbytes(tensor);
  7671. total_size_new += new_size;
  7672. // update the gguf meta data as we go
  7673. gguf_set_tensor_type(ctx_out, name.c_str(), new_type);
  7674. gguf_set_tensor_data(ctx_out, name.c_str(), new_data, new_size);
  7675. // write tensor data + padding
  7676. fout.write((const char *) new_data, new_size);
  7677. zeros(fout, GGML_PAD(new_size, align) - new_size);
  7678. }
  7679. // go back to beginning of file and write the updated meta data
  7680. {
  7681. fout.seekp(0);
  7682. std::vector<uint8_t> data(gguf_get_meta_size(ctx_out));
  7683. gguf_get_meta_data(ctx_out, data.data());
  7684. fout.write((const char *) data.data(), data.size());
  7685. }
  7686. fout.close();
  7687. gguf_free(ctx_out);
  7688. LLAMA_LOG_INFO("%s: model size = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0);
  7689. LLAMA_LOG_INFO("%s: quant size = %8.2f MB\n", __func__, total_size_new/1024.0/1024.0);
  7690. // print histogram for all tensors
  7691. {
  7692. int64_t sum_all = 0;
  7693. for (size_t i = 0; i < hist_all.size(); i++) {
  7694. sum_all += hist_all[i];
  7695. }
  7696. if (sum_all > 0) {
  7697. LLAMA_LOG_INFO("%s: hist: ", __func__);
  7698. for (size_t i = 0; i < hist_all.size(); i++) {
  7699. LLAMA_LOG_INFO("%5.3f ", hist_all[i] / float(sum_all));
  7700. }
  7701. LLAMA_LOG_INFO("\n");
  7702. }
  7703. }
  7704. if (qs.n_fallback > 0) {
  7705. LLAMA_LOG_WARN("%s: WARNING: %d of %d tensor(s) incompatible with k-quants and required fallback quantization\n",
  7706. __func__, qs.n_fallback, qs.n_k_quantized + qs.n_fallback);
  7707. }
  7708. }
  7709. static int llama_apply_lora_from_file_internal(
  7710. const struct llama_model & model, const char * path_lora, float scale, const char * path_base_model, int n_threads
  7711. ) {
  7712. LLAMA_LOG_INFO("%s: applying lora adapter from '%s' - please wait ...\n", __func__, path_lora);
  7713. const int64_t t_start_lora_us = ggml_time_us();
  7714. llama_file fin(path_lora, "rb");
  7715. // verify magic and version
  7716. {
  7717. uint32_t magic = fin.read_u32();
  7718. if (magic != LLAMA_FILE_MAGIC_GGLA) {
  7719. LLAMA_LOG_ERROR("%s: bad file magic\n", __func__);
  7720. return 1;
  7721. }
  7722. uint32_t format_version = fin.read_u32();
  7723. if (format_version != 1) {
  7724. LLAMA_LOG_ERROR("%s: unsupported file version\n", __func__ );
  7725. return 1;
  7726. }
  7727. }
  7728. int32_t lora_r = fin.read_u32();
  7729. int32_t lora_alpha = fin.read_u32();
  7730. float scaling = scale * (float)lora_alpha / (float)lora_r;
  7731. LLAMA_LOG_INFO("%s: r = %d, alpha = %d, scaling = %.2f\n", __func__, lora_r, lora_alpha, scaling);
  7732. // create a name -> tensor map of the model to accelerate lookups
  7733. // find the max tensor size to estimate the required temporary buffer size
  7734. size_t max_tensor_size = 0;
  7735. std::unordered_map<std::string, struct ggml_tensor*> model_tensors;
  7736. for (const auto & kv : model.tensors_by_name) {
  7737. model_tensors.insert(kv);
  7738. size_t f32_size = ggml_nelements(kv.second) * sizeof(float);
  7739. max_tensor_size = std::max(max_tensor_size, f32_size);
  7740. }
  7741. // create a temporary ggml context to store the lora tensors
  7742. // TODO: use ggml-alloc
  7743. size_t lora_ctx_size = max_tensor_size * 3;
  7744. LLAMA_LOG_INFO("%s: allocating %.f MB for lora temporary buffer\n", __func__, lora_ctx_size / 1024.0 / 1024.0);
  7745. std::vector<uint8_t> lora_buf(lora_ctx_size);
  7746. struct ggml_init_params params;
  7747. params.mem_size = lora_buf.size();
  7748. params.mem_buffer = lora_buf.data();
  7749. params.no_alloc = false;
  7750. using unique_context = std::unique_ptr<ggml_context, decltype(&ggml_free)>;
  7751. unique_context lora_ctx(nullptr, ggml_free);
  7752. lora_ctx.reset(ggml_init(params));
  7753. std::unordered_map<std::string, struct ggml_tensor *> lora_tensors;
  7754. // load base model
  7755. std::unique_ptr<llama_model_loader> ml;
  7756. if (path_base_model) {
  7757. LLAMA_LOG_INFO("%s: loading base model from '%s'\n", __func__, path_base_model);
  7758. ml.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*kv_overrides*/ nullptr));
  7759. ml->init_mapping(false); // no prefetching
  7760. }
  7761. // read tensors and apply
  7762. bool warned = false;
  7763. int n_tensors = 0;
  7764. std::vector<uint8_t> work_buffer;
  7765. while (true) {
  7766. if (fin.tell() == fin.size) {
  7767. // eof
  7768. break;
  7769. }
  7770. int32_t n_dims;
  7771. int32_t name_len;
  7772. int32_t ftype;
  7773. fin.read_raw(&n_dims, sizeof(n_dims));
  7774. fin.read_raw(&name_len, sizeof(name_len));
  7775. fin.read_raw(&ftype, sizeof(ftype));
  7776. if (n_dims != 1 && n_dims != 2) {
  7777. LLAMA_LOG_ERROR("%s: unsupported tensor dimension %d\n", __func__, n_dims);
  7778. return 1;
  7779. }
  7780. int32_t ne[2] = { 1, 1 };
  7781. for (int i = 0; i < n_dims; ++i) {
  7782. fin.read_raw(&ne[i], sizeof(ne[i]));
  7783. }
  7784. std::string name;
  7785. {
  7786. GGML_ASSERT(name_len <= 1024);
  7787. char buf[1024];
  7788. fin.read_raw(buf, name_len);
  7789. name = std::string(buf, name_len);
  7790. }
  7791. // check for lora suffix and get the type of tensor
  7792. const std::string lora_suffix = ".lora";
  7793. size_t pos = name.rfind(lora_suffix);
  7794. if (pos == std::string::npos) {
  7795. LLAMA_LOG_ERROR("%s: error: '%s' is not a lora tensor\n", __func__, name.c_str());
  7796. return 1;
  7797. }
  7798. std::string lora_type = name.substr(pos + lora_suffix.length());
  7799. std::string base_name = name;
  7800. base_name.erase(pos);
  7801. // LLAMA_LOG_INFO("%s: %s => %s (lora type %s) \n", __func__, name.c_str(), base_name.c_str(), lora_type.c_str());
  7802. if (model_tensors.find(base_name) == model_tensors.end()) {
  7803. LLAMA_LOG_ERROR("%s: unknown tensor '%s' in lora adapter\n", __func__, name.data());
  7804. return 1;
  7805. }
  7806. // create ggml tensor
  7807. ggml_type wtype;
  7808. switch (ftype) {
  7809. case 0: wtype = GGML_TYPE_F32; break;
  7810. case 1: wtype = GGML_TYPE_F16; break;
  7811. default:
  7812. {
  7813. LLAMA_LOG_ERROR("%s: invalid tensor data type '%d'\n",
  7814. __func__, ftype);
  7815. return false;
  7816. }
  7817. }
  7818. ggml_tensor * lora_tensor = ggml_new_tensor_2d(lora_ctx.get(), wtype, ne[0], ne[1]);
  7819. ggml_set_name(lora_tensor, name.c_str());
  7820. // load tensor data
  7821. size_t offset = fin.tell();
  7822. size_t tensor_data_size = ggml_nbytes(lora_tensor);
7823. offset = (offset + 31) & -32; // align the tensor data offset to a 32-byte boundary
  7824. fin.seek(offset, SEEK_SET);
  7825. fin.read_raw(lora_tensor->data, tensor_data_size);
  7826. lora_tensors[name] = lora_tensor;
  7827. // check if we have both A and B tensors and apply
  7828. if (lora_tensors.find(base_name + ".loraA") != lora_tensors.end() &&
  7829. lora_tensors.find(base_name + ".loraB") != lora_tensors.end()) {
  7830. ggml_tensor * dest_t = model_tensors[base_name];
  7831. offload_func_t offload_func = ggml_offload_nop;
  7832. offload_func_t offload_func_force_inplace = ggml_offload_nop;
  7833. #if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)
  7834. if (dest_t->backend == GGML_BACKEND_GPU || dest_t->backend == GGML_BACKEND_GPU_SPLIT) {
  7835. if (dest_t->type != GGML_TYPE_F16) {
  7836. throw std::runtime_error(format(
  7837. "%s: error: the simultaneous use of LoRAs and GPU acceleration is only supported for f16 models. dest_t->type: %d", __func__, dest_t->type));
  7838. }
  7839. offload_func = ggml_cuda_assign_buffers;
  7840. offload_func_force_inplace = ggml_cuda_assign_buffers_force_inplace;
  7841. }
  7842. #endif // GGML_USE_CUBLAS
  7843. ggml_tensor * base_t;
  7844. if (ml) {
  7845. struct gguf_context * ctx_gguf = ml->ctx_gguf;
  7846. // load from base model
  7847. if (gguf_find_tensor(ctx_gguf, base_name.c_str()) < 0) {
  7848. LLAMA_LOG_ERROR("%s: error: tensor '%s' not found in base model\n", __func__, base_name.c_str());
  7849. return 1;
  7850. }
  7851. base_t = ml->get_tensor_meta(base_name.c_str());
  7852. ml->load_data_for(base_t);
  7853. } else {
  7854. base_t = dest_t;
  7855. }
  7856. if (ggml_is_quantized(base_t->type)) {
  7857. if (!warned) {
  7858. LLAMA_LOG_WARN("%s: warning: using a lora adapter with a quantized model may result in poor quality, "
  7859. "use a f16 or f32 base model with --lora-base\n", __func__);
  7860. warned = true;
  7861. }
  7862. }
  7863. ggml_tensor * loraA = lora_tensors[base_name + ".loraA"];
  7864. GGML_ASSERT(loraA->type == GGML_TYPE_F32);
  7865. ggml_set_name(loraA, "loraA");
  7866. ggml_tensor * loraB = lora_tensors[base_name + ".loraB"];
  7867. GGML_ASSERT(loraB->type == GGML_TYPE_F32);
  7868. ggml_set_name(loraB, "loraB");
  7869. if (base_t->ne[0] != loraA->ne[1] || base_t->ne[1] != loraB->ne[1]) {
  7870. LLAMA_LOG_ERROR("%s: incompatible tensor dimensions (%" PRId64 " and %" PRId64 ");"
  7871. " are you sure that this adapter is for this model?\n", __func__, base_t->ne[0], loraA->ne[1]);
  7872. return 1;
  7873. }
  7874. // w = w + BA*s
  7875. ggml_tensor * BA = ggml_mul_mat(lora_ctx.get(), loraA, loraB);
  7876. offload_func(BA);
  7877. ggml_set_name(BA, "BA");
  7878. if (scaling != 1.0f) {
  7879. BA = ggml_scale_inplace(lora_ctx.get(), BA, scaling);
  7880. offload_func(BA);
  7881. ggml_set_name(BA, "BA_scaled");
  7882. }
  7883. ggml_tensor * r;
  7884. if (base_t == dest_t) {
  7885. r = ggml_add_inplace(lora_ctx.get(), dest_t, BA);
  7886. offload_func_force_inplace(r);
  7887. ggml_set_name(r, "r_add_inplace");
  7888. }
  7889. else {
  7890. r = ggml_add(lora_ctx.get(), base_t, BA);
  7891. offload_func(r);
  7892. ggml_set_name(r, "r_add");
  7893. r = ggml_cpy(lora_ctx.get(), r, dest_t);
  7894. offload_func(r);
  7895. ggml_set_name(r, "r_cpy");
  7896. }
  7897. struct ggml_cgraph * gf = ggml_new_graph(lora_ctx.get());
  7898. ggml_build_forward_expand(gf, r);
  7899. ggml_graph_compute_helper(work_buffer, gf, n_threads);
  7900. // the tensors in the adapter must be sorted such that loraA and loraB of the same tensor are next to each other
  7901. GGML_ASSERT(lora_tensors.size() == 2);
  7902. // we won't need these tensors again, reset the context to save memory
  7903. lora_ctx.reset(ggml_init(params));
  7904. lora_tensors.clear();
  7905. n_tensors++;
  7906. if (n_tensors % 4 == 0) {
  7907. LLAMA_LOG_INFO(".");
  7908. }
  7909. }
  7910. }
  7911. const int64_t t_lora_us = ggml_time_us() - t_start_lora_us;
  7912. LLAMA_LOG_INFO(" done (%.2f ms)\n", t_lora_us / 1000.0);
  7913. return 0;
  7914. }
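//
// [editor's note, not in the original] The update applied above is the standard LoRA
// formula w = w + s * (B * A) with s = scale * alpha / r; for example scale = 1.0,
// alpha = 16 and r = 8 give s = 2.0, matching the `scaling` value logged at load time.
//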
  7915. //
  7916. // interface implementation
  7917. //
  7918. struct llama_model_params llama_model_default_params() {
  7919. struct llama_model_params result = {
  7920. /*.n_gpu_layers =*/ 0,
  7921. /*.main_gpu =*/ 0,
  7922. /*.tensor_split =*/ nullptr,
  7923. /*.progress_callback =*/ nullptr,
  7924. /*.progress_callback_user_data =*/ nullptr,
  7925. /*.kv_overrides =*/ nullptr,
  7926. /*.vocab_only =*/ false,
  7927. /*.use_mmap =*/ true,
  7928. /*.use_mlock =*/ false,
  7929. };
  7930. #ifdef GGML_USE_METAL
  7931. result.n_gpu_layers = 1;
  7932. #endif
  7933. return result;
  7934. }
  7935. struct llama_context_params llama_context_default_params() {
  7936. struct llama_context_params result = {
  7937. /*.seed =*/ LLAMA_DEFAULT_SEED,
  7938. /*.n_ctx =*/ 512,
  7939. /*.n_batch =*/ 512,
  7940. /*.n_threads =*/ GGML_DEFAULT_N_THREADS, // TODO: better default
  7941. /*.n_threads_batch =*/ GGML_DEFAULT_N_THREADS,
  7942. /*.rope_scaling_type =*/ LLAMA_ROPE_SCALING_UNSPECIFIED,
  7943. /*.rope_freq_base =*/ 0.0f,
  7944. /*.rope_freq_scale =*/ 0.0f,
  7945. /*.yarn_ext_factor =*/ -1.0f,
  7946. /*.yarn_attn_factor =*/ 1.0f,
  7947. /*.yarn_beta_fast =*/ 32.0f,
  7948. /*.yarn_beta_slow =*/ 1.0f,
  7949. /*.yarn_orig_ctx =*/ 0,
  7950. /*.type_k =*/ GGML_TYPE_F16,
  7951. /*.type_v =*/ GGML_TYPE_F16,
  7952. /*.mul_mat_q =*/ true,
  7953. /*.logits_all =*/ false,
  7954. /*.embedding =*/ false,
  7955. /*.offload_kqv =*/ true,
  7956. };
  7957. return result;
  7958. }
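//
// [editor's sketch, not part of the original source] Minimal start-up sequence using the
// default-parameter helpers above; example_init is hypothetical, the model path is a
// placeholder, and error handling is reduced to null checks.
static struct llama_context * example_init(const char * model_path) {
    llama_backend_init(/*numa =*/ false);
    struct llama_model_params mparams = llama_model_default_params();
    struct llama_model * model = llama_load_model_from_file(model_path, mparams);
    if (model == nullptr) {
        return nullptr;
    }
    struct llama_context_params cparams = llama_context_default_params();
    cparams.n_ctx = 2048; // request a 2k context instead of the 512-token default
    return llama_new_context_with_model(model, cparams);
}
//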
  7959. struct llama_model_quantize_params llama_model_quantize_default_params() {
  7960. struct llama_model_quantize_params result = {
  7961. /*.nthread =*/ 0,
  7962. /*.ftype =*/ LLAMA_FTYPE_MOSTLY_Q5_1,
  7963. /*.allow_requantize =*/ false,
  7964. /*.quantize_output_tensor =*/ true,
  7965. /*.only_copy =*/ false,
  7966. /*.pure =*/ false,
  7967. };
  7968. return result;
  7969. }
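//
// [editor's sketch, not part of the original source] Typical quantization call, assuming
// the public llama_model_quantize() entry point declared in llama.h, which forwards to
// llama_model_quantize_internal() above; example_quantize and the file names are
// placeholders.
static bool example_quantize(const char * fname_inp, const char * fname_out, int nthread) {
    llama_model_quantize_params qparams = llama_model_quantize_default_params();
    qparams.ftype   = LLAMA_FTYPE_MOSTLY_Q4_K_M; // target mostly-Q4_K (medium) weights
    qparams.nthread = nthread;                   // 0 lets the implementation pick hardware_concurrency()
    return llama_model_quantize(fname_inp, fname_out, &qparams) == 0;
}
//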
  7970. int32_t llama_max_devices(void) {
  7971. return LLAMA_MAX_DEVICES;
  7972. }
  7973. bool llama_mmap_supported(void) {
  7974. return llama_mmap::SUPPORTED;
  7975. }
  7976. bool llama_mlock_supported(void) {
  7977. return llama_mlock::SUPPORTED;
  7978. }
  7979. void llama_backend_init(bool numa) {
  7980. ggml_time_init();
  7981. // needed to initialize f16 tables
  7982. {
  7983. struct ggml_init_params params = { 0, NULL, false };
  7984. struct ggml_context * ctx = ggml_init(params);
  7985. ggml_free(ctx);
  7986. }
  7987. if (numa) {
  7988. ggml_numa_init();
  7989. }
  7990. #ifdef GGML_USE_MPI
  7991. ggml_mpi_backend_init();
  7992. #endif
  7993. }
  7994. void llama_backend_free(void) {
  7995. #ifdef GGML_USE_MPI
  7996. ggml_mpi_backend_free();
  7997. #endif
  7998. }
  7999. int64_t llama_time_us(void) {
  8000. return ggml_time_us();
  8001. }
  8002. struct llama_model * llama_load_model_from_file(
  8003. const char * path_model,
  8004. struct llama_model_params params) {
  8005. ggml_time_init();
  8006. llama_model * model = new llama_model;
  8007. unsigned cur_percentage = 0;
  8008. if (params.progress_callback == NULL) {
  8009. params.progress_callback_user_data = &cur_percentage;
  8010. params.progress_callback = [](float progress, void * ctx) {
  8011. unsigned * cur_percentage_p = (unsigned *) ctx;
  8012. unsigned percentage = (unsigned) (100 * progress);
  8013. while (percentage > *cur_percentage_p) {
  8014. *cur_percentage_p = percentage;
  8015. LLAMA_LOG_INFO(".");
  8016. if (percentage >= 100) {
  8017. LLAMA_LOG_INFO("\n");
  8018. }
  8019. }
  8020. return true;
  8021. };
  8022. }
  8023. int status = llama_model_load(path_model, *model, params);
  8024. GGML_ASSERT(status <= 0);
  8025. if (status < 0) {
  8026. if (status == -1) {
  8027. LLAMA_LOG_ERROR("%s: failed to load model\n", __func__);
  8028. } else if (status == -2) {
  8029. LLAMA_LOG_INFO("%s: cancelled model load\n", __func__);
  8030. }
  8031. delete model;
  8032. return nullptr;
  8033. }
  8034. return model;
  8035. }
  8036. void llama_free_model(struct llama_model * model) {
  8037. delete model;
  8038. }

struct llama_context * llama_new_context_with_model(
        struct llama_model * model,
        struct llama_context_params params) {
    if (!model) {
        return nullptr;
    }

    llama_context * ctx = new llama_context(*model);

    const auto & hparams = model->hparams;
    auto & cparams = ctx->cparams;

    cparams.n_batch = params.n_batch;
    cparams.n_threads = params.n_threads;
    cparams.n_threads_batch = params.n_threads_batch;
    cparams.yarn_ext_factor = params.yarn_ext_factor;
    cparams.yarn_attn_factor = params.yarn_attn_factor;
    cparams.yarn_beta_fast = params.yarn_beta_fast;
    cparams.yarn_beta_slow = params.yarn_beta_slow;
    cparams.mul_mat_q = params.mul_mat_q;
    cparams.offload_kqv = params.offload_kqv;

    cparams.n_ctx = params.n_ctx == 0 ? hparams.n_ctx_train : params.n_ctx;
    cparams.rope_freq_base = params.rope_freq_base == 0.0f ? hparams.rope_freq_base_train : params.rope_freq_base;
    cparams.rope_freq_scale = params.rope_freq_scale == 0.0f ? hparams.rope_freq_scale_train : params.rope_freq_scale;

    cparams.n_yarn_orig_ctx = params.yarn_orig_ctx != 0 ? params.yarn_orig_ctx :
                              hparams.n_yarn_orig_ctx != 0 ? hparams.n_yarn_orig_ctx :
                              hparams.n_ctx_train;

    auto rope_scaling_type = params.rope_scaling_type;
    if (rope_scaling_type == LLAMA_ROPE_SCALING_UNSPECIFIED) {
        rope_scaling_type = hparams.rope_scaling_type_train;
    }

    if (rope_scaling_type == LLAMA_ROPE_SCALING_NONE) {
        cparams.rope_freq_scale = 1.0f; // never scale if scaling type is none
    }

    if (cparams.yarn_ext_factor < 0.0f) { // negative indicates 'not set'
        cparams.yarn_ext_factor = rope_scaling_type == LLAMA_ROPE_SCALING_YARN ? 1.0f : 0.0f;
    }

    if (params.seed == LLAMA_DEFAULT_SEED) {
        params.seed = time(NULL);
    }

    LLAMA_LOG_INFO("%s: n_ctx = %u\n", __func__, cparams.n_ctx);
    LLAMA_LOG_INFO("%s: freq_base = %.1f\n", __func__, cparams.rope_freq_base);
    LLAMA_LOG_INFO("%s: freq_scale = %g\n", __func__, cparams.rope_freq_scale);

    ctx->rng = std::mt19937(params.seed);
    ctx->logits_all = params.logits_all;

    const ggml_type type_k = params.type_k;
    const ggml_type type_v = params.type_v;

    GGML_ASSERT(hparams.n_embd_head_k % ggml_blck_size(type_k) == 0);
    GGML_ASSERT(hparams.n_embd_head_v % ggml_blck_size(type_v) == 0);

    // reserve memory for context buffers
    if (!hparams.vocab_only) {
        // initialize backend
#ifdef GGML_USE_METAL
        if (model->n_gpu_layers > 0) {
            ctx->backend = ggml_backend_metal_init();
            if (ctx->backend == nullptr) {
                LLAMA_LOG_ERROR("%s: failed to initialize Metal backend\n", __func__);
            }
        }
#elif defined(GGML_USE_CUBLAS) && defined(LLAMA_GGML_BACKEND_CUDA_TEST)
        // for testing only
        if (model->n_gpu_layers > 0) {
            ctx->backend = ggml_backend_cuda_init(0);
            if (ctx->backend == nullptr) {
                LLAMA_LOG_ERROR("%s: failed to initialize CUDA backend\n", __func__);
            }
        }
#endif
        if (ctx->backend == nullptr && ggml_backend_buffer_is_host(model->buf)) {
            ctx->backend = ggml_backend_cpu_init();
            if (ctx->backend == nullptr) {
                LLAMA_LOG_ERROR("%s: failed to initialize CPU backend\n", __func__);
            }
        }

        if (ctx->backend == nullptr) {
            LLAMA_LOG_ERROR("%s: failed to initialize a backend\n", __func__);
            delete ctx;
            return nullptr;
        }

        if (!llama_kv_cache_init(ctx->model.hparams, ctx->kv_self, type_k, type_v,
                cparams.n_ctx, model->n_gpu_layers, cparams.offload_kqv)) {
            LLAMA_LOG_ERROR("%s: llama_kv_cache_init() failed for self-attention cache\n", __func__);
            llama_free(ctx);
            return nullptr;
        }

        {
            size_t memory_size_k = 0;
            size_t memory_size_v = 0;

            for (auto & k : ctx->kv_self.k_l) {
                memory_size_k += ggml_nbytes(k);
            }

            for (auto & v : ctx->kv_self.v_l) {
                memory_size_v += ggml_nbytes(v);
            }

            LLAMA_LOG_INFO("%s: KV self size = %7.2f MiB, K (%s): %7.2f MiB, V (%s): %7.2f MiB\n", __func__,
                (float)(memory_size_k + memory_size_v) / (1024.0f * 1024.0f),
                ggml_type_name(type_k), (float)memory_size_k / (1024.0f * 1024.0f),
                ggml_type_name(type_v), (float)memory_size_v / (1024.0f * 1024.0f));
        }

        // resized during inference
        if (params.logits_all) {
            ctx->logits.reserve(cparams.n_ctx*hparams.n_vocab);
        } else {
            ctx->logits.reserve(hparams.n_vocab);
        }

        if (params.embedding) {
            ctx->embedding.resize(hparams.n_embd);
        }

        {
            // the compute buffer is used to store the tensor and graph structs, while the allocator buffer is used for the tensor data
            ctx->buf_compute_meta.resize(ggml_tensor_overhead()*LLAMA_MAX_NODES + ggml_graph_overhead());

            // create measure allocator
            ctx->alloc = ggml_allocr_new_measure_from_backend(ctx->backend);

            // build worst-case graph
            int n_tokens = (int)std::min(cparams.n_ctx, cparams.n_batch);
            int n_past = cparams.n_ctx - n_tokens;
            llama_token token = llama_token_bos(&ctx->model); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph
            ggml_cgraph * gf = llama_build_graph(*ctx, llama_batch_get_one(&token, n_tokens, n_past, 0));

            // measure memory requirements for the graph
            size_t alloc_size = ggml_allocr_alloc_graph(ctx->alloc, gf);

            LLAMA_LOG_INFO("%s: compute buffer total size = %.2f MiB\n", __func__, (ctx->buf_compute_meta.size() + alloc_size) / 1024.0 / 1024.0);

            // create allocator again with exact memory requirements
            ggml_allocr_free(ctx->alloc);

            ctx->buf_alloc = ggml_backend_alloc_buffer(ctx->backend, alloc_size);
            ctx->alloc = ggml_allocr_new_from_buffer(ctx->buf_alloc);
#if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)
            if (model->n_gpu_layers > 0) {
                // the CPU buffer adds this padding in case the malloc buffer is not aligned, so we need to do the same for the GPU buffer, since we use the same offsets
                ggml_cuda_set_scratch_size(alloc_size + 64);
                LLAMA_LOG_INFO("%s: VRAM scratch buffer: %.2f MiB\n", __func__, alloc_size / 1024.0 / 1024.0);

                // calculate total VRAM usage
                auto add_tensor = [](const ggml_tensor * t, size_t & size) {
                    if (t->backend == GGML_BACKEND_GPU || t->backend == GGML_BACKEND_GPU_SPLIT) {
                        size += ggml_nbytes(t);
                    }
                };
                size_t model_vram_size = 0;
                for (const auto & kv : model->tensors_by_name) {
                    add_tensor(kv.second, model_vram_size);
                }

                size_t kv_vram_size = 0;
                for (auto & k : ctx->kv_self.k_l) {
                    add_tensor(k, kv_vram_size);
                }
                for (auto & v : ctx->kv_self.v_l) {
                    add_tensor(v, kv_vram_size);
                }

                size_t ctx_vram_size = alloc_size + kv_vram_size;
                size_t total_vram_size = model_vram_size + ctx_vram_size;

                LLAMA_LOG_INFO("%s: total VRAM used: %.2f MiB (model: %.2f MiB, context: %.2f MiB)\n", __func__,
                    total_vram_size / 1024.0 / 1024.0,
                    model_vram_size / 1024.0 / 1024.0,
                    ctx_vram_size / 1024.0 / 1024.0);
            }
#endif
        }
    }

#ifdef GGML_USE_MPI
    ctx->ctx_mpi = ggml_mpi_init();

    if (ggml_mpi_rank(ctx->ctx_mpi) > 0) {
        // Enter a blocking eval loop with dummy input, letting rank=0 drive the process
        // TODO: needs fix after #3228
        GGML_ASSERT(false && "not implemented");
        //const std::vector<llama_token> tmp(ctx->model.hparams.n_ctx, llama_token_bos(ctx));
        //while (!llama_eval(ctx, tmp.data(), tmp.size(), 0, 0)) {};
        llama_backend_free();
        exit(1);
    }
#endif

    return ctx;
}

void llama_free(struct llama_context * ctx) {
    delete ctx;
}

const llama_model * llama_get_model(const struct llama_context * ctx) {
    return &ctx->model;
}

uint32_t llama_n_ctx(const struct llama_context * ctx) {
    return ctx->cparams.n_ctx;
}

uint32_t llama_n_batch(const struct llama_context * ctx) {
    return ctx->cparams.n_batch;
}

enum llama_vocab_type llama_vocab_type(const struct llama_model * model) {
    return model->vocab.type;
}

int32_t llama_n_vocab(const struct llama_model * model) {
    return model->vocab.id_to_token.size();
}

int32_t llama_n_ctx_train(const struct llama_model * model) {
    return model->hparams.n_ctx_train;
}

int32_t llama_n_embd(const struct llama_model * model) {
    return model->hparams.n_embd;
}

float llama_rope_freq_scale_train(const struct llama_model * model) {
    return model->hparams.rope_freq_scale_train;
}
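
/** read GGUF metadata key/value pairs from a loaded model
 *
 * illustrative sketch, grounded in the accessors below (buffer size is an
 * arbitrary example):
 *
 *   char buf[256];
 *   const int32_t n_kv = llama_model_meta_count(model);
 *   for (int32_t i = 0; i < n_kv; i++) {
 *       llama_model_meta_key_by_index(model, i, buf, sizeof(buf));
 *       // buf now holds the i-th key; fetch its value the same way:
 *       llama_model_meta_val_str_by_index(model, i, buf, sizeof(buf));
 *   }
 *
 * the accessors return the untruncated string length (snprintf-style), or -1
 * when the key/index is not present.
 */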

int32_t llama_model_meta_val_str(const struct llama_model * model, const char * key, char * buf, size_t buf_size) {
    const auto & it = model->gguf_kv.find(key);
    if (it == model->gguf_kv.end()) {
        if (buf_size > 0) {
            buf[0] = '\0';
        }
        return -1;
    }
    return snprintf(buf, buf_size, "%s", it->second.c_str());
}

int32_t llama_model_meta_count(const struct llama_model * model) {
    return (int)model->gguf_kv.size();
}

int32_t llama_model_meta_key_by_index(const struct llama_model * model, int i, char * buf, size_t buf_size) {
    if (i < 0 || i >= (int)model->gguf_kv.size()) {
        if (buf_size > 0) {
            buf[0] = '\0';
        }
        return -1;
    }
    auto it = model->gguf_kv.begin();
    std::advance(it, i);
    return snprintf(buf, buf_size, "%s", it->first.c_str());
}

int32_t llama_model_meta_val_str_by_index(const struct llama_model * model, int32_t i, char * buf, size_t buf_size) {
    if (i < 0 || i >= (int)model->gguf_kv.size()) {
        if (buf_size > 0) {
            buf[0] = '\0';
        }
        return -1;
    }
    auto it = model->gguf_kv.begin();
    std::advance(it, i);
    return snprintf(buf, buf_size, "%s", it->second.c_str());
}

int32_t llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size) {
    return snprintf(buf, buf_size, "%s %s%s %s",
        llama_model_arch_name(model->arch).c_str(),
        model->hparams.n_expert > 0 ? (std::to_string(model->hparams.n_expert) + "x").c_str() : "",
        llama_model_type_name(model->type),
        llama_model_ftype_name(model->ftype).c_str());
}

uint64_t llama_model_size(const struct llama_model * model) {
    uint64_t size = 0;
    for (const auto & it : model->tensors_by_name) {
        size += ggml_nbytes(it.second);
    }
    return size;
}

uint64_t llama_model_n_params(const struct llama_model * model) {
    uint64_t nparams = 0;
    for (const auto & it : model->tensors_by_name) {
        nparams += ggml_nelements(it.second);
    }
    return nparams;
}

struct ggml_tensor * llama_get_model_tensor(struct llama_model * model, const char * name) {
    return ggml_get_tensor(model->ctx, name);
}
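
/** offline quantization of a GGUF model file
 *
 * illustrative sketch; the file names are placeholders and the default-params
 * helper and ftype constant are assumed to come from llama.h:
 *
 *   llama_model_quantize_params qparams = llama_model_quantize_default_params();
 *   qparams.ftype = LLAMA_FTYPE_MOSTLY_Q4_K_M;
 *   if (llama_model_quantize("model-f16.gguf", "model-q4_k_m.gguf", &qparams) != 0) {
 *       // quantization failed
 *   }
 */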

uint32_t llama_model_quantize(
        const char * fname_inp,
        const char * fname_out,
        const llama_model_quantize_params * params) {
    try {
        llama_model_quantize_internal(fname_inp, fname_out, params);
        return 0;
    } catch (const std::exception & err) {
        LLAMA_LOG_ERROR("%s: failed to quantize: %s\n", __func__, err.what());
        return 1;
    }
}

int32_t llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lora, float scale, const char * path_base_model, int32_t n_threads) {
    try {
        return llama_apply_lora_from_file_internal(ctx->model, path_lora, scale, path_base_model, n_threads);
    } catch (const std::exception & err) {
        LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what());
        return 1;
    }
}

int32_t llama_model_apply_lora_from_file(const struct llama_model * model, const char * path_lora, float scale, const char * path_base_model, int32_t n_threads) {
    try {
        return llama_apply_lora_from_file_internal(*model, path_lora, scale, path_base_model, n_threads);
    } catch (const std::exception & err) {
        LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what());
        return 1;
    }
}
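
/** inspect KV cache occupancy through a llama_kv_cache_view
 *
 * illustrative sketch, grounded in the functions below:
 *
 *   struct llama_kv_cache_view view = llama_kv_cache_view_init(ctx, 4); // track up to 4 seq ids per cell
 *   llama_kv_cache_view_update(ctx, &view); // (re)allocate and fill cell data
 *   // inspect view.used_cells, view.token_count, view.max_contiguous, ...
 *   llama_kv_cache_view_free(&view);
 */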

struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_context * ctx, int32_t n_max_seq) {
    struct llama_kv_cache_view result = {
        /*.n_cells = */ 0,
        /*.n_max_seq = */ n_max_seq,
        /*.token_count = */ 0,
        /*.used_cells = */ llama_get_kv_cache_used_cells(ctx),
        /*.max_contiguous = */ 0,
        /*.max_contiguous_idx = */ -1,
        /*.cells = */ nullptr,
        /*.cells_sequences = */ nullptr,
    };
    return result;
}

void llama_kv_cache_view_free(struct llama_kv_cache_view * view) {
    if (view->cells != nullptr) {
        free(view->cells);
        view->cells = nullptr;
    }
    if (view->cells_sequences != nullptr) {
        free(view->cells_sequences);
        view->cells_sequences = nullptr;
    }
}

void llama_kv_cache_view_update(const struct llama_context * ctx, struct llama_kv_cache_view * view) {
    if (uint32_t(view->n_cells) < ctx->kv_self.size || view->cells == nullptr) {
        view->n_cells = int32_t(ctx->kv_self.size);
        void * p = realloc(view->cells, sizeof(struct llama_kv_cache_view_cell) * view->n_cells);
        GGML_ASSERT(p != nullptr && "Failed to alloc kv_cache_view cells");
        view->cells = (struct llama_kv_cache_view_cell *)p;
        p = realloc(view->cells_sequences, sizeof(llama_seq_id) * view->n_max_seq * view->n_cells);
        GGML_ASSERT(p != nullptr && "Failed to alloc kv_cache_view cells sequences");
        view->cells_sequences = (llama_seq_id *)p;
    }

    const std::vector<llama_kv_cell> & kv_cells = ctx->kv_self.cells;
    llama_kv_cache_view_cell * c_curr = view->cells;
    llama_seq_id * cs_curr = view->cells_sequences;
    int32_t used_cells = 0;
    int32_t token_count = 0;
    int32_t curr_contig_idx = -1;
    uint32_t max_contig = 0;
    int32_t max_contig_idx = -1;

    for (int32_t i = 0; i < int32_t(ctx->kv_self.size); i++, c_curr++, cs_curr += view->n_max_seq) {
        const size_t curr_size = kv_cells[i].seq_id.size();
        token_count += curr_size;
        c_curr->pos = kv_cells[i].pos + kv_cells[i].delta;

        if (curr_size > 0) {
            if (curr_contig_idx >= 0 && uint32_t(i - curr_contig_idx) > max_contig) {
                max_contig = i - curr_contig_idx;
                max_contig_idx = curr_contig_idx;
            }
            curr_contig_idx = -1;
        } else if (curr_contig_idx < 0) {
            curr_contig_idx = i;
        }

        int seq_idx = 0;
        for (const llama_seq_id it : kv_cells[i].seq_id) {
            if (seq_idx >= view->n_max_seq) {
                break;
            }
            cs_curr[seq_idx] = it;
            seq_idx++;
        }
        if (seq_idx != 0) {
            used_cells++;
        }
        for (; seq_idx < view->n_max_seq; seq_idx++) {
            cs_curr[seq_idx] = -1;
        }
    }
    if (curr_contig_idx >= 0 && kv_cells.size() - curr_contig_idx > max_contig) {
        max_contig_idx = curr_contig_idx;
        max_contig = kv_cells.size() - curr_contig_idx;
    }
    view->max_contiguous = max_contig;
    view->max_contiguous_idx = max_contig_idx;
    view->token_count = token_count;
    view->used_cells = used_cells;
    if (uint32_t(used_cells) != ctx->kv_self.used) {
        LLAMA_LOG_ERROR("%s: used cells mismatch. kv_cache says %d but we calculated %d\n",
            __func__, ctx->kv_self.used, used_cells);
    }
}
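
/** per-sequence KV cache manipulation
 *
 * illustrative sketch of a simple "context shift": drop the oldest n_discard
 * tokens of sequence 0 (after a kept prefix of n_keep) and slide the rest back
 * so generation can continue; the values are placeholders and a negative p1
 * means "to the end of the sequence", as in the llama_eval wrapper below:
 *
 *   const int n_keep = 16;
 *   const int n_discard = 256;
 *   llama_kv_cache_seq_rm(ctx, 0, n_keep, n_keep + n_discard);
 *   llama_kv_cache_seq_shift(ctx, 0, n_keep + n_discard, -1, -n_discard);
 */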

int32_t llama_get_kv_cache_token_count(const struct llama_context * ctx) {
    int result = 0;
    for (uint32_t i = 0; i < ctx->kv_self.size; i++) {
        result += ctx->kv_self.cells[i].seq_id.size();
    }
    return result;
}

int32_t llama_get_kv_cache_used_cells(const struct llama_context * ctx) {
    return ctx->kv_self.used;
}

void llama_kv_cache_clear(struct llama_context * ctx) {
    llama_kv_cache_clear(ctx->kv_self);
}

void llama_kv_cache_seq_rm(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1) {
    llama_kv_cache_seq_rm(ctx->kv_self, seq_id, p0, p1);
}

void llama_kv_cache_seq_cp(struct llama_context * ctx, llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) {
    if (seq_id_src == seq_id_dst) {
        return;
    }
    llama_kv_cache_seq_cp(ctx->kv_self, seq_id_src, seq_id_dst, p0, p1);
}

void llama_kv_cache_seq_keep(struct llama_context * ctx, llama_seq_id seq_id) {
    llama_kv_cache_seq_keep(ctx->kv_self, seq_id);
}

void llama_kv_cache_seq_shift(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta) {
    if (delta == 0) {
        return;
    }
    llama_kv_cache_seq_shift(ctx->kv_self, seq_id, p0, p1, delta);
}

void llama_kv_cache_seq_div(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) {
    if (d == 1) {
        return;
    }
    llama_kv_cache_seq_div(ctx->kv_self, seq_id, p0, p1, d);
}

// Returns the *maximum* size of the state
size_t llama_get_state_size(const struct llama_context * ctx) {
    // we don't know size of rng until we actually serialize it. so reserve more than enough memory for its serialized state.
    // for reference, std::mt19937(1337) serializes to 6701 bytes.
    const size_t s_rng_size = sizeof(size_t);
    const size_t s_rng = LLAMA_MAX_RNG_STATE;
    const size_t s_logits_capacity = sizeof(size_t);
    const size_t s_logits_size = sizeof(size_t);
    const size_t s_logits = ctx->logits.capacity() * sizeof(float);
    const size_t s_embedding_size = sizeof(size_t);
    const size_t s_embedding = ctx->embedding.size() * sizeof(float);
    const size_t s_kv_size = sizeof(size_t);
    const size_t s_kv_ntok = sizeof(int);
    const size_t s_kv = ggml_backend_buffer_get_size(ctx->kv_self.buf);

    const size_t s_total = (
        + s_rng_size
        + s_rng
        + s_logits_capacity
        + s_logits_size
        + s_logits
        + s_embedding_size
        + s_embedding
        + s_kv_size
        + s_kv_ntok
        + s_kv
    );

    return s_total;
}

// llama_context_data
struct llama_data_context {
    virtual void write(const void * src, size_t size) = 0;
    virtual size_t get_size_written() = 0;
    virtual ~llama_data_context() = default;
};

struct llama_data_buffer_context : llama_data_context {
    uint8_t * ptr;
    size_t size_written = 0;

    llama_data_buffer_context(uint8_t * p) : ptr(p) {}

    void write(const void * src, size_t size) override {
        memcpy(ptr, src, size);
        ptr += size;
        size_written += size;
    }

    size_t get_size_written() override {
        return size_written;
    }
};

struct llama_data_file_context : llama_data_context {
    llama_file * file;
    size_t size_written = 0;

    llama_data_file_context(llama_file * f) : file(f) {}

    void write(const void * src, size_t size) override {
        file->write_raw(src, size);
        size_written += size;
    }

    size_t get_size_written() override {
        return size_written;
    }
};

/** copy state data into either a buffer or file depending on the passed in context
 *
 * file context:
 * llama_file file("/path", "wb");
 * llama_data_file_context data_ctx(&file);
 * llama_copy_state_data_internal(ctx, &data_ctx);
 *
 * buffer context:
 * std::vector<uint8_t> buf(max_size, 0);
 * llama_data_buffer_context data_ctx(buf.data());
 * llama_copy_state_data_internal(ctx, &data_ctx);
 *
 */
static void llama_copy_state_data_internal(struct llama_context * ctx, llama_data_context * data_ctx) {
    // copy rng
    {
        std::stringstream rng_ss;
        rng_ss << ctx->rng;

        const size_t rng_size = rng_ss.str().size();
        char rng_buf[LLAMA_MAX_RNG_STATE];

        memset(&rng_buf[0], 0, LLAMA_MAX_RNG_STATE);
        memcpy(&rng_buf[0], rng_ss.str().data(), rng_ss.str().size());

        data_ctx->write(&rng_size, sizeof(rng_size));
        data_ctx->write(&rng_buf[0], LLAMA_MAX_RNG_STATE);
    }

    // copy logits
    {
        const size_t logits_cap = ctx->logits.capacity();
        const size_t logits_size = ctx->logits.size();

        data_ctx->write(&logits_cap, sizeof(logits_cap));
        data_ctx->write(&logits_size, sizeof(logits_size));

        if (logits_size) {
            data_ctx->write(ctx->logits.data(), logits_size * sizeof(float));
        }

        // If there is a gap between the size and the capacity, write padding
        size_t padding_size = (logits_cap - logits_size) * sizeof(float);
        if (padding_size > 0) {
            std::vector<uint8_t> padding(padding_size, 0); // Create a buffer filled with zeros
            data_ctx->write(padding.data(), padding_size);
        }
    }

    // copy embeddings
    {
        const size_t embedding_size = ctx->embedding.size();

        data_ctx->write(&embedding_size, sizeof(embedding_size));

        if (embedding_size) {
            data_ctx->write(ctx->embedding.data(), embedding_size * sizeof(float));
        }
    }

    // copy kv cache
    {
        const auto & kv_self = ctx->kv_self;
        const auto & hparams = ctx->model.hparams;
        const auto & cparams = ctx->cparams;

        const auto n_layer = hparams.n_layer;
        const auto n_embd_k_gqa = hparams.n_embd_k_gqa();
        const auto n_embd_v_gqa = hparams.n_embd_v_gqa();
        const auto n_ctx = cparams.n_ctx;

        const size_t kv_buf_size = ggml_backend_buffer_get_size(kv_self.buf);
        const uint32_t kv_head = kv_self.head;
        const uint32_t kv_size = kv_self.size;
        const uint32_t kv_used = kv_self.used;

        data_ctx->write(&kv_buf_size, sizeof(kv_buf_size));
        data_ctx->write(&kv_head, sizeof(kv_head));
        data_ctx->write(&kv_size, sizeof(kv_size));
        data_ctx->write(&kv_used, sizeof(kv_used));

        if (kv_buf_size) {
            const size_t elt_size = ggml_element_size(kv_self.k_l[0]);

            ggml_context * cpy_ctx = ggml_init({ 6*n_layer*ggml_tensor_overhead() + ggml_graph_overhead(), NULL, /* no_alloc */ true });
            ggml_cgraph * gf = ggml_new_graph(cpy_ctx);

            std::vector<struct ggml_tensor *> kout2d(n_layer);
            std::vector<struct ggml_tensor *> vout2d(n_layer);

            for (int il = 0; il < (int) n_layer; ++il) {
                kout2d[il] = ggml_new_tensor_2d(cpy_ctx, kv_self.k_l[il]->type, n_embd_k_gqa, kv_head);
                vout2d[il] = ggml_new_tensor_2d(cpy_ctx, kv_self.v_l[il]->type, kv_head, n_embd_v_gqa);

                ggml_tensor * k2d = ggml_view_2d(cpy_ctx, kv_self.k_l[il],
                    n_embd_k_gqa, kv_head,
                    elt_size*n_embd_k_gqa, 0);

                ggml_tensor * v2d = ggml_view_2d(cpy_ctx, kv_self.v_l[il],
                    kv_head, n_embd_v_gqa,
                    elt_size*n_ctx, 0);

                ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, k2d, kout2d[il]));
                ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, v2d, vout2d[il]));
            }

            ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(cpy_ctx, ctx->backend);

            ggml_backend_graph_compute(ctx->backend, gf);

            std::vector<uint8_t> tmp_buf;
            for (int il = 0; il < (int) n_layer; ++il) {
                tmp_buf.resize(ggml_nbytes(kout2d[il]));
                ggml_backend_tensor_get(kout2d[il], tmp_buf.data(), 0, tmp_buf.size());
                data_ctx->write(tmp_buf.data(), tmp_buf.size());

                tmp_buf.resize(ggml_nbytes(vout2d[il]));
                ggml_backend_tensor_get(vout2d[il], tmp_buf.data(), 0, tmp_buf.size());
                data_ctx->write(tmp_buf.data(), tmp_buf.size());
            }

            ggml_free(cpy_ctx);

            ggml_backend_buffer_free(buf);
        }

        for (uint32_t i = 0; i < kv_size; ++i) {
            const auto & cell = kv_self.cells[i];

            const llama_pos pos = cell.pos;
            const size_t seq_id_size = cell.seq_id.size();

            data_ctx->write(&pos, sizeof(pos));
            data_ctx->write(&seq_id_size, sizeof(seq_id_size));

            for (auto seq_id : cell.seq_id) {
                data_ctx->write(&seq_id, sizeof(seq_id));
            }
        }
    }
}

size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
    llama_data_buffer_context data_ctx(dst);
    llama_copy_state_data_internal(ctx, &data_ctx);

    return data_ctx.get_size_written();
}
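
/** round-trip the full context state through a caller-owned buffer
 *
 * illustrative sketch, grounded in llama_get_state_size / llama_copy_state_data /
 * llama_set_state_data:
 *
 *   std::vector<uint8_t> state(llama_get_state_size(ctx)); // upper bound
 *   const size_t n_written = llama_copy_state_data(ctx, state.data());
 *   // ... later, on a context created with the same model and parameters ...
 *   const size_t n_read = llama_set_state_data(ctx, state.data());
 *   // n_read matches n_written for a state produced by the same build
 */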

// Sets the state reading from the specified source address
size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
    uint8_t * inp = src;

    // set rng
    {
        size_t rng_size;
        char rng_buf[LLAMA_MAX_RNG_STATE];

        memcpy(&rng_size, inp, sizeof(rng_size)); inp += sizeof(rng_size);
        memcpy(&rng_buf[0], inp, LLAMA_MAX_RNG_STATE); inp += LLAMA_MAX_RNG_STATE;

        std::stringstream rng_ss;
        rng_ss.str(std::string(&rng_buf[0], rng_size));
        rng_ss >> ctx->rng;

        GGML_ASSERT(!rng_ss.fail());
    }

    // set logits
    {
        size_t logits_cap;
        size_t logits_size;

        memcpy(&logits_cap, inp, sizeof(logits_cap)); inp += sizeof(logits_cap);
        memcpy(&logits_size, inp, sizeof(logits_size)); inp += sizeof(logits_size);

        GGML_ASSERT(ctx->logits.capacity() == logits_cap);

        if (logits_size) {
            ctx->logits.resize(logits_size);
            memcpy(ctx->logits.data(), inp, logits_size * sizeof(float));
        }

        inp += logits_cap * sizeof(float);
    }

    // set embeddings
    {
        size_t embedding_size;

        memcpy(&embedding_size, inp, sizeof(embedding_size)); inp += sizeof(embedding_size);

        GGML_ASSERT(ctx->embedding.capacity() == embedding_size);

        if (embedding_size) {
            memcpy(ctx->embedding.data(), inp, embedding_size * sizeof(float));
            inp += embedding_size * sizeof(float);
        }
    }

    // set kv cache
    {
        const auto & kv_self = ctx->kv_self;
        const auto & hparams = ctx->model.hparams;
        const auto & cparams = ctx->cparams;

        const int n_layer = hparams.n_layer;
        const int n_embd_k_gqa = hparams.n_embd_k_gqa();
        const int n_embd_v_gqa = hparams.n_embd_v_gqa();
        const int n_ctx = cparams.n_ctx;

        size_t kv_buf_size;
        uint32_t kv_head;
        uint32_t kv_size;
        uint32_t kv_used;

        memcpy(&kv_buf_size, inp, sizeof(kv_buf_size)); inp += sizeof(kv_buf_size);
        memcpy(&kv_head, inp, sizeof(kv_head)); inp += sizeof(kv_head);
        memcpy(&kv_size, inp, sizeof(kv_size)); inp += sizeof(kv_size);
        memcpy(&kv_used, inp, sizeof(kv_used)); inp += sizeof(kv_used);

        if (kv_buf_size) {
            GGML_ASSERT(ggml_backend_buffer_get_size(kv_self.buf) == kv_buf_size);

            const size_t elt_size = ggml_element_size(kv_self.k_l[0]);

            ggml_context * cpy_ctx = ggml_init({ 6*n_layer*ggml_tensor_overhead() + ggml_graph_overhead(), NULL, /* no_alloc */ true });
            ggml_cgraph * gf = ggml_new_graph(cpy_ctx);

            std::vector<struct ggml_tensor *> kin2d(n_layer);
            std::vector<struct ggml_tensor *> vin2d(n_layer);

            for (int il = 0; il < n_layer; ++il) {
                kin2d[il] = ggml_new_tensor_2d(cpy_ctx, kv_self.k_l[il]->type, n_embd_k_gqa, kv_head);
                vin2d[il] = ggml_new_tensor_2d(cpy_ctx, kv_self.v_l[il]->type, kv_head, n_embd_v_gqa);

                ggml_tensor * k2d = ggml_view_2d(cpy_ctx, kv_self.k_l[il],
                    n_embd_k_gqa, kv_head,
                    elt_size*n_embd_k_gqa, 0);

                ggml_tensor * v2d = ggml_view_2d(cpy_ctx, kv_self.v_l[il],
                    kv_head, n_embd_v_gqa,
                    elt_size*n_ctx, 0);

                ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, kin2d[il], k2d));
                ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, vin2d[il], v2d));
            }

            ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(cpy_ctx, ctx->backend);

            // load data into the tensors
            for (int il = 0; il < n_layer; ++il) {
                ggml_backend_tensor_set(kin2d[il], inp, 0, ggml_nbytes(kin2d[il]));
                inp += ggml_nbytes(kin2d[il]);

                ggml_backend_tensor_set(vin2d[il], inp, 0, ggml_nbytes(vin2d[il]));
                inp += ggml_nbytes(vin2d[il]);
            }

            ggml_backend_graph_compute(ctx->backend, gf);

            ggml_free(cpy_ctx);

            ggml_backend_buffer_free(buf);
        }

        ctx->kv_self.head = kv_head;
        ctx->kv_self.size = kv_size;
        ctx->kv_self.used = kv_used;

        ctx->kv_self.cells.resize(kv_size);

        for (uint32_t i = 0; i < kv_size; ++i) {
            llama_pos pos;
            size_t seq_id_size;

            memcpy(&pos, inp, sizeof(pos)); inp += sizeof(pos);
            memcpy(&seq_id_size, inp, sizeof(seq_id_size)); inp += sizeof(seq_id_size);

            ctx->kv_self.cells[i].pos = pos;

            llama_seq_id seq_id;

            for (size_t j = 0; j < seq_id_size; ++j) {
                memcpy(&seq_id, inp, sizeof(seq_id)); inp += sizeof(seq_id);
                ctx->kv_self.cells[i].seq_id.insert(seq_id);
            }
        }
    }

    const size_t nread = inp - src;
    const size_t max_size = llama_get_state_size(ctx);

    GGML_ASSERT(nread <= max_size);

    return nread;
}
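
/** session files persist the prompt tokens together with the full context state
 *
 * illustrative sketch (the path is a placeholder):
 *
 *   std::vector<llama_token> tokens(llama_n_ctx(ctx));
 *   size_t n_tokens = 0;
 *   if (llama_load_session_file(ctx, "prompt.session", tokens.data(), tokens.size(), &n_tokens)) {
 *       // resume from the cached prompt
 *   }
 *   // ... after evaluating new tokens ...
 *   llama_save_session_file(ctx, "prompt.session", tokens.data(), n_tokens);
 */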

static bool llama_load_session_file_internal(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
    llama_file file(path_session, "rb");

    // sanity checks
    {
        const uint32_t magic = file.read_u32();
        const uint32_t version = file.read_u32();

        if (magic != LLAMA_SESSION_MAGIC || version != LLAMA_SESSION_VERSION) {
            LLAMA_LOG_ERROR("%s : unknown (magic, version) for session file: %08x, %08x\n", __func__, magic, version);
            return false;
        }

        llama_hparams session_hparams;
        file.read_raw(&session_hparams, sizeof(llama_hparams));

        if (session_hparams != ctx->model.hparams) {
            LLAMA_LOG_INFO("%s : model hparams didn't match from session file!\n", __func__);
            return false;
        }
    }

    // load the prompt
    {
        const uint32_t n_token_count = file.read_u32();

        if (n_token_count > n_token_capacity) {
            LLAMA_LOG_ERROR("%s : token count in session file exceeded capacity! %u > %zu\n", __func__, n_token_count, n_token_capacity);
            return false;
        }

        file.read_raw(tokens_out, sizeof(llama_token) * n_token_count);
        *n_token_count_out = n_token_count;
    }

    // restore the context state
    {
        const size_t n_state_size_cur = file.size - file.tell();
        const size_t n_state_size_max = llama_get_state_size(ctx);

        if (n_state_size_cur > n_state_size_max) {
            LLAMA_LOG_ERROR("%s : the state size in session file is too big! max %zu, got %zu\n", __func__, n_state_size_max, n_state_size_cur);
            return false;
        }

        std::vector<uint8_t> state_data(n_state_size_max);
        file.read_raw(state_data.data(), n_state_size_cur);

        llama_set_state_data(ctx, state_data.data());
    }

    return true;
}

bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
    try {
        return llama_load_session_file_internal(ctx, path_session, tokens_out, n_token_capacity, n_token_count_out);
    } catch (const std::exception & err) {
        LLAMA_LOG_ERROR("error loading session file: %s\n", err.what());
        return false;
    }
}

bool llama_save_session_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count) {
    llama_file file(path_session, "wb");

    file.write_u32(LLAMA_SESSION_MAGIC);
    file.write_u32(LLAMA_SESSION_VERSION);

    file.write_raw(&ctx->model.hparams, sizeof(llama_hparams));

    // save the prompt
    file.write_u32((uint32_t) n_token_count);
    file.write_raw(tokens, sizeof(llama_token) * n_token_count);

    // save the context state using stream saving
    llama_data_file_context data_ctx(&file);
    llama_copy_state_data_internal(ctx, &data_ctx);

    return true;
}

int llama_eval(
        struct llama_context * ctx,
        llama_token * tokens,
        int32_t n_tokens,
        int32_t n_past) {
    llama_kv_cache_seq_rm(ctx->kv_self, -1, n_past, -1);

    const int ret = llama_decode_internal(*ctx, llama_batch_get_one(tokens, n_tokens, n_past, 0));
    if (ret < 0) {
        LLAMA_LOG_ERROR("%s: failed to decode, ret = %d\n", __func__, ret);
    }

    return ret;
}

int llama_eval_embd(
        struct llama_context * ctx,
        float * embd,
        int32_t n_tokens,
        int32_t n_past) {
    llama_kv_cache_seq_rm(ctx->kv_self, -1, n_past, -1);

    llama_batch batch = { n_tokens, nullptr, embd, nullptr, nullptr, nullptr, nullptr, n_past, 1, 0, };

    const int ret = llama_decode_internal(*ctx, batch);
    if (ret < 0) {
        LLAMA_LOG_ERROR("%s: failed to decode, ret = %d\n", __func__, ret);
    }

    return ret;
}

void llama_set_n_threads(struct llama_context * ctx, uint32_t n_threads, uint32_t n_threads_batch) {
    ctx->cparams.n_threads = n_threads;
    ctx->cparams.n_threads_batch = n_threads_batch;
}
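
/** batch construction helpers
 *
 * llama_batch_get_one wraps an existing token array without allocating;
 * llama_batch_init allocates a batch that the caller fills in and must release
 * with llama_batch_free. Illustrative sketch (sizes and the prompt_tokens /
 * n_prompt_tokens names are placeholders):
 *
 *   llama_batch batch = llama_batch_init(512, 0, 1);
 *   for (int i = 0; i < n_prompt_tokens; i++) {
 *       batch.token[i] = prompt_tokens[i];
 *       batch.pos[i] = i;
 *       batch.n_seq_id[i] = 1;
 *       batch.seq_id[i][0] = 0;
 *       batch.logits[i] = false;
 *   }
 *   batch.n_tokens = n_prompt_tokens;
 *   batch.logits[batch.n_tokens - 1] = true; // request logits for the last token only
 *   // ... llama_decode(ctx, batch) ...
 *   llama_batch_free(batch);
 */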

struct llama_batch llama_batch_get_one(
        llama_token * tokens,
        int32_t n_tokens,
        llama_pos pos_0,
        llama_seq_id seq_id) {
    return {
        /*n_tokens =*/ n_tokens,
        /*tokens =*/ tokens,
        /*embd =*/ nullptr,
        /*pos =*/ nullptr,
        /*n_seq_id =*/ nullptr,
        /*seq_id =*/ nullptr,
        /*logits =*/ nullptr,
        /*all_pos_0 =*/ pos_0,
        /*all_pos_1 =*/ 1,
        /*all_seq_id =*/ seq_id,
    };
}

struct llama_batch llama_batch_init(int32_t n_tokens, int32_t embd, int32_t n_seq_max) {
    llama_batch batch = { 0, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, 0, 0, 0, };

    if (embd) {
        batch.embd = (float *) malloc(sizeof(float) * n_tokens * embd);
    } else {
        batch.token = (llama_token *) malloc(sizeof(llama_token) * n_tokens);
    }

    batch.pos = (llama_pos *) malloc(sizeof(llama_pos) * n_tokens);
    batch.n_seq_id = (int32_t *) malloc(sizeof(int32_t) * n_tokens);
    batch.seq_id = (llama_seq_id **) malloc(sizeof(llama_seq_id *) * n_tokens);
    for (int i = 0; i < n_tokens; ++i) {
        batch.seq_id[i] = (llama_seq_id *) malloc(sizeof(llama_seq_id) * n_seq_max);
    }
    batch.logits = (int8_t *) malloc(sizeof(int8_t) * n_tokens);

    return batch;
}

void llama_batch_free(struct llama_batch batch) {
    if (batch.token) free(batch.token);
    if (batch.embd) free(batch.embd);
    if (batch.pos) free(batch.pos);
    if (batch.n_seq_id) free(batch.n_seq_id);
    if (batch.seq_id) {
        for (int i = 0; i < batch.n_tokens; ++i) {
            free(batch.seq_id[i]);
        }
        free(batch.seq_id);
    }
    if (batch.logits) free(batch.logits);
}
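
/** llama_decode is the main evaluation entry point (llama_eval above is the
 *  legacy wrapper around it)
 *
 * illustrative generation step, assuming `batch` was filled as sketched above
 * and some sampler picks the next token from the returned logits:
 *
 *   if (llama_decode(ctx, batch) != 0) {
 *       // non-zero means the batch was not (fully) processed
 *   }
 *   float * logits = llama_get_logits_ith(ctx, batch.n_tokens - 1);
 *   // ... sample the next token from logits, append it to a new batch, repeat ...
 */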

int32_t llama_decode(
        struct llama_context * ctx,
        struct llama_batch batch) {
    const int ret = llama_decode_internal(*ctx, batch);
    if (ret < 0) {
        LLAMA_LOG_ERROR("%s: failed to decode, ret = %d\n", __func__, ret);
    }

    return ret;
}

float * llama_get_logits(struct llama_context * ctx) {
    return ctx->logits.data();
}

float * llama_get_logits_ith(struct llama_context * ctx, int32_t i) {
    assert(ctx->logits_valid.at(i));
    return ctx->logits.data() + i*ctx->model.hparams.n_vocab;
}

float * llama_get_embeddings(struct llama_context * ctx) {
    return ctx->embedding.data();
}

const char * llama_token_get_text(const struct llama_model * model, llama_token token) {
    return model->vocab.id_to_token[token].text.c_str();
}

float llama_token_get_score(const struct llama_model * model, llama_token token) {
    return model->vocab.id_to_token[token].score;
}

llama_token_type llama_token_get_type(const struct llama_model * model, llama_token token) {
    return model->vocab.id_to_token[token].type;
}

llama_token llama_token_bos(const struct llama_model * model) {
    return model->vocab.special_bos_id;
}

llama_token llama_token_eos(const struct llama_model * model) {
    return model->vocab.special_eos_id;
}

llama_token llama_token_nl(const struct llama_model * model) {
    return model->vocab.linefeed_id;
}

int32_t llama_add_bos_token(const struct llama_model * model) {
    return model->vocab.special_add_bos;
}

int32_t llama_add_eos_token(const struct llama_model * model) {
    return model->vocab.special_add_eos;
}

llama_token llama_token_prefix(const struct llama_model * model) {
    return model->vocab.special_prefix_id;
}

llama_token llama_token_middle(const struct llama_model * model) {
    return model->vocab.special_middle_id;
}

llama_token llama_token_suffix(const struct llama_model * model) {
    return model->vocab.special_suffix_id;
}

llama_token llama_token_eot(const struct llama_model * model) {
    return model->vocab.special_eot_id;
}
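
/** tokenization
 *
 * llama_tokenize returns the number of tokens written, or the negated required
 * count when the output array is too small. Illustrative two-pass sketch
 * (passing NULL/0 as the output array is only a size query here):
 *
 *   const char * text = "Hello world";
 *   int32_t n = -llama_tokenize(model, text, strlen(text), NULL, 0, true, false);
 *   std::vector<llama_token> toks(n);
 *   llama_tokenize(model, text, strlen(text), toks.data(), n, true, false);
 */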

int32_t llama_tokenize(
        const struct llama_model * model,
        const char * text,
        int32_t text_len,
        llama_token * tokens,
        int32_t n_max_tokens,
        bool add_bos,
        bool special) {
    auto res = llama_tokenize_internal(model->vocab, std::string(text, text_len), add_bos, special);

    if (n_max_tokens < (int) res.size()) {
        // LLAMA_LOG_ERROR("%s: too many tokens\n", __func__);
        return -((int) res.size());
    }

    for (size_t i = 0; i < res.size(); i++) {
        tokens[i] = res[i];
    }

    return res.size();
}

static std::string llama_decode_text(const std::string & text) {
    std::string decoded_text;
    auto unicode_sequences = codepoints_from_utf8(text);
    for (auto& unicode_sequence : unicode_sequences) {
        decoded_text += unicode_to_bytes_bpe(codepoint_to_utf8(unicode_sequence));
    }

    return decoded_text;
}

// does not write null-terminator to buf
int32_t llama_token_to_piece(const struct llama_model * model, llama_token token, char * buf, int32_t length) {
    if (0 <= token && token < llama_n_vocab(model)) {
        switch (llama_vocab_get_type(model->vocab)) {
        case LLAMA_VOCAB_TYPE_SPM: {
            if (llama_is_normal_token(model->vocab, token)) {
                std::string result = model->vocab.id_to_token[token].text;
                llama_unescape_whitespace(result);
                if (length < (int) result.length()) {
                    return -(int) result.length();
                }
                memcpy(buf, result.c_str(), result.length());
                return result.length();
            } else if (llama_is_unknown_token(model->vocab, token)) { // NOLINT
                if (length < 3) {
                    return -3;
                }
                memcpy(buf, "\xe2\x96\x85", 3);
                return 3;
            } else if (llama_is_control_token(model->vocab, token)) {
                ;
            } else if (llama_is_byte_token(model->vocab, token)) {
                if (length < 1) {
                    return -1;
                }
                buf[0] = llama_token_to_byte(model->vocab, token);
                return 1;
            } else {
                // TODO: for now we accept all unsupported token types,
                // suppressing them like CONTROL tokens.
                // GGML_ASSERT(false);
            }
            break;
        }
        case LLAMA_VOCAB_TYPE_BPE: {
            if (llama_is_normal_token(model->vocab, token)) {
                std::string result = model->vocab.id_to_token[token].text;
                result = llama_decode_text(result);
                if (length < (int) result.length()) {
                    return -(int) result.length();
                }
                memcpy(buf, result.c_str(), result.length());
                return result.length();
            } else if (llama_is_control_token(model->vocab, token)) {
                ;
            } else {
                // TODO: for now we accept all unsupported token types,
                // suppressing them like CONTROL tokens.
                // GGML_ASSERT(false);
            }
            break;
        }
        default:
            GGML_ASSERT(false);
        }
    }
    return 0;
}

struct llama_timings llama_get_timings(struct llama_context * ctx) {
    struct llama_timings result = {
        /*.t_start_ms =*/ 1e-3 * ctx->t_start_us,
        /*.t_end_ms =*/ 1.00 * ggml_time_ms(),
        /*.t_load_ms =*/ 1e-3 * ctx->t_load_us,
        /*.t_sample_ms =*/ 1e-3 * ctx->t_sample_us,
        /*.t_p_eval_ms =*/ 1e-3 * ctx->t_p_eval_us,
        /*.t_eval_ms =*/ 1e-3 * ctx->t_eval_us,

        /*.n_sample =*/ std::max(1, ctx->n_sample),
        /*.n_p_eval =*/ std::max(1, ctx->n_p_eval),
        /*.n_eval =*/ std::max(1, ctx->n_eval),
    };

    return result;
}

void llama_print_timings(struct llama_context * ctx) {
    const llama_timings timings = llama_get_timings(ctx);

    LLAMA_LOG_INFO("\n");
    LLAMA_LOG_INFO("%s: load time = %10.2f ms\n", __func__, timings.t_load_ms);
    LLAMA_LOG_INFO("%s: sample time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
        __func__, timings.t_sample_ms, timings.n_sample, timings.t_sample_ms / timings.n_sample, 1e3 / timings.t_sample_ms * timings.n_sample);
    LLAMA_LOG_INFO("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
        __func__, timings.t_p_eval_ms, timings.n_p_eval, timings.t_p_eval_ms / timings.n_p_eval, 1e3 / timings.t_p_eval_ms * timings.n_p_eval);
    LLAMA_LOG_INFO("%s: eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
        __func__, timings.t_eval_ms, timings.n_eval, timings.t_eval_ms / timings.n_eval, 1e3 / timings.t_eval_ms * timings.n_eval);
    LLAMA_LOG_INFO("%s: total time = %10.2f ms / %5d tokens\n", __func__, (timings.t_end_ms - timings.t_start_ms), (timings.n_p_eval + timings.n_eval));
}

void llama_reset_timings(struct llama_context * ctx) {
    ctx->t_start_us = ggml_time_us();
    ctx->t_sample_us = ctx->n_sample = 0;
    ctx->t_eval_us = ctx->n_eval = 0;
    ctx->t_p_eval_us = ctx->n_p_eval = 0;
}

const char * llama_print_system_info(void) {
    static std::string s;

    s = "";
    s += "AVX = " + std::to_string(ggml_cpu_has_avx()) + " | ";
    s += "AVX_VNNI = " + std::to_string(ggml_cpu_has_avx_vnni()) + " | ";
    s += "AVX2 = " + std::to_string(ggml_cpu_has_avx2()) + " | ";
    s += "AVX512 = " + std::to_string(ggml_cpu_has_avx512()) + " | ";
    s += "AVX512_VBMI = " + std::to_string(ggml_cpu_has_avx512_vbmi()) + " | ";
    s += "AVX512_VNNI = " + std::to_string(ggml_cpu_has_avx512_vnni()) + " | ";
    s += "FMA = " + std::to_string(ggml_cpu_has_fma()) + " | ";
    s += "NEON = " + std::to_string(ggml_cpu_has_neon()) + " | ";
    s += "ARM_FMA = " + std::to_string(ggml_cpu_has_arm_fma()) + " | ";
    s += "F16C = " + std::to_string(ggml_cpu_has_f16c()) + " | ";
    s += "FP16_VA = " + std::to_string(ggml_cpu_has_fp16_va()) + " | ";
    s += "WASM_SIMD = " + std::to_string(ggml_cpu_has_wasm_simd()) + " | ";
    s += "BLAS = " + std::to_string(ggml_cpu_has_blas()) + " | ";
    s += "SSE3 = " + std::to_string(ggml_cpu_has_sse3()) + " | ";
    s += "SSSE3 = " + std::to_string(ggml_cpu_has_ssse3()) + " | ";
    s += "VSX = " + std::to_string(ggml_cpu_has_vsx()) + " | ";

    return s.c_str();
}

void llama_dump_timing_info_yaml(FILE * stream, const llama_context * ctx) {
    fprintf(stream, "\n");
    fprintf(stream, "###########\n");
    fprintf(stream, "# Timings #\n");
    fprintf(stream, "###########\n");
    fprintf(stream, "\n");

    fprintf(stream, "mst_eval: %.2f  # ms / token during generation\n",
            1.0e-3 * ctx->t_eval_us / ctx->n_eval);
    fprintf(stream, "mst_p_eval: %.2f  # ms / token during prompt processing\n",
            1.0e-3 * ctx->t_p_eval_us / ctx->n_p_eval);
    fprintf(stream, "mst_sample: %.2f  # ms / token during sampling\n",
            1.0e-3 * ctx->t_sample_us / ctx->n_sample);
    fprintf(stream, "n_eval: %d  # number of tokens generated (excluding the first one)\n", ctx->n_eval);
    fprintf(stream, "n_p_eval: %d  # number of tokens processed in batches at the beginning\n", ctx->n_p_eval);
    fprintf(stream, "n_sample: %d  # number of sampled tokens\n", ctx->n_sample);
    fprintf(stream, "t_eval_us: %" PRId64 "  # total microseconds spent generating tokens\n", ctx->t_eval_us);
    fprintf(stream, "t_load_us: %" PRId64 "  # total microseconds spent loading the model\n", ctx->t_load_us);
    fprintf(stream, "t_p_eval_us: %" PRId64 "  # total microseconds spent prompt processing\n", ctx->t_p_eval_us);
    fprintf(stream, "t_sample_us: %" PRId64 "  # total microseconds spent sampling\n", ctx->t_sample_us);
    fprintf(stream, "ts_eval: %.2f  # tokens / second during generation\n",
            1.0e6 * ctx->n_eval / ctx->t_eval_us);
    fprintf(stream, "ts_p_eval: %.2f  # tokens / second during prompt processing\n",
            1.0e6 * ctx->n_p_eval / ctx->t_p_eval_us);
    fprintf(stream, "ts_sample: %.2f  # tokens / second during sampling\n",
            1.0e6 * ctx->n_sample / ctx->t_sample_us);
}

// For internal test use
const std::vector<std::pair<std::string, struct ggml_tensor *>> & llama_internal_get_tensor_map(
    struct llama_context * ctx
) {
    return ctx->model.tensors_by_name;
}
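
/** redirect library logging
 *
 * illustrative sketch: route llama.cpp log output to a user callback instead of
 * the default stderr handler below (the FILE* passed as user_data is just an
 * example destination):
 *
 *   static void my_log(ggml_log_level level, const char * text, void * user_data) {
 *       fputs(text, (FILE *) user_data);
 *   }
 *   llama_log_set(my_log, stderr);
 */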

void llama_log_set(ggml_log_callback log_callback, void * user_data) {
    g_state.log_callback = log_callback ? log_callback : llama_log_callback_default;
    g_state.log_callback_user_data = user_data;
#ifdef GGML_USE_METAL
    ggml_metal_log_set_callback(g_state.log_callback, g_state.log_callback_user_data);
#endif
}

static void llama_log_internal_v(ggml_log_level level, const char * format, va_list args) {
    va_list args_copy;
    va_copy(args_copy, args);
    char buffer[128];
    int len = vsnprintf(buffer, 128, format, args);
    if (len < 128) {
        g_state.log_callback(level, buffer, g_state.log_callback_user_data);
    } else {
        char* buffer2 = new char[len+1];
        vsnprintf(buffer2, len+1, format, args_copy);
        buffer2[len] = 0;
        g_state.log_callback(level, buffer2, g_state.log_callback_user_data);
        delete[] buffer2;
    }
    va_end(args_copy);
}

static void llama_log_internal(ggml_log_level level, const char * format, ...) {
    va_list args;
    va_start(args, format);
    llama_log_internal_v(level, format, args);
    va_end(args);
}

static void llama_log_callback_default(ggml_log_level level, const char * text, void * user_data) {
    (void) level;
    (void) user_data;
    fputs(text, stderr);
    fflush(stderr);
}