baby-llama.cpp

#include "ggml.h"

#include <vector>
#include <cassert>
#include <random>
#include <cstring>
// for printf/fprintf, rand/RAND_MAX, sqrtf/sinf/expf and std::min/std::max used below
#include <cstdio>
#include <cstdlib>
#include <cmath>
#include <algorithm>

#if defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data
#endif
  9. float frand() {
  10. return (float)rand()/(float)RAND_MAX;
  11. }
  12. struct random_normal_distribution {
  13. std::mt19937 gen;
  14. std::normal_distribution<float> nd;
  15. float min;
  16. float max;
  17. };
  18. void init_random_normal_distribution(struct random_normal_distribution * rnd, int seed, float mean, float std, float min, float max) {
  19. rnd->gen = std::mt19937(seed);
  20. rnd->nd = std::normal_distribution<float>{mean, std};
  21. rnd->min = min;
  22. rnd->max = max;
  23. }
  24. float frand_normal(struct random_normal_distribution * rnd) {
  25. const float r = rnd->nd(rnd->gen);
  26. return ((r < rnd->min) ? (rnd->min) : (r > rnd->max) ? (rnd->max) : r);
  27. }
  28. struct ggml_tensor * randomize_tensor(
  29. struct ggml_tensor * tensor,
  30. int ndims,
  31. const int64_t ne[],
  32. float fmin,
  33. float fmax) {
  34. switch (ndims) {
  35. case 1:
  36. for (int i0 = 0; i0 < ne[0]; i0++) {
  37. ((float *)tensor->data)[i0] = frand()*(fmax - fmin) + fmin;
  38. }
  39. break;
  40. case 2:
  41. for (int i1 = 0; i1 < ne[1]; i1++) {
  42. for (int i0 = 0; i0 < ne[0]; i0++) {
  43. ((float *)tensor->data)[i1*ne[0] + i0] = frand()*(fmax - fmin) + fmin;
  44. }
  45. }
  46. break;
  47. case 3:
  48. for (int i2 = 0; i2 < ne[2]; i2++) {
  49. for (int i1 = 0; i1 < ne[1]; i1++) {
  50. for (int i0 = 0; i0 < ne[0]; i0++) {
  51. ((float *)tensor->data)[i2*ne[1]*ne[0] + i1*ne[0] + i0] = frand()*(fmax - fmin) + fmin;
  52. }
  53. }
  54. }
  55. break;
  56. case 4:
  57. for (int i3 = 0; i3 < ne[3]; i3++) {
  58. for (int i2 = 0; i2 < ne[2]; i2++) {
  59. for (int i1 = 0; i1 < ne[1]; i1++) {
  60. for (int i0 = 0; i0 < ne[0]; i0++) {
  61. ((float *)tensor->data)[i3*ne[2]*ne[1]*ne[0] + i2*ne[1]*ne[0] + i1*ne[0] + i0] = frand()*(fmax - fmin) + fmin;
  62. }
  63. }
  64. }
  65. }
  66. break;
  67. default:
  68. assert(false);
  69. };
  70. return tensor;
  71. }
  72. struct ggml_tensor * randomize_tensor_normal(
  73. struct ggml_tensor * tensor,
  74. int ndims,
  75. const int64_t ne[],
  76. struct random_normal_distribution * rnd) {
  77. float scale = 1.0; // xavier
  78. switch (ndims) {
  79. case 1:
  80. scale /= sqrtf(ne[0]);
  81. for (int i0 = 0; i0 < ne[0]; i0++) {
  82. ((float *)tensor->data)[i0] = scale * frand_normal(rnd);
  83. }
  84. break;
  85. case 2:
  86. scale /= sqrtf(ne[0]+ne[1]);
  87. for (int i1 = 0; i1 < ne[1]; i1++) {
  88. for (int i0 = 0; i0 < ne[0]; i0++) {
  89. ((float *)tensor->data)[i1*ne[0] + i0] = scale * frand_normal(rnd);
  90. }
  91. }
  92. break;
  93. case 3:
  94. scale /= sqrtf(ne[0]+ne[1]);
  95. for (int i2 = 0; i2 < ne[2]; i2++) {
  96. for (int i1 = 0; i1 < ne[1]; i1++) {
  97. for (int i0 = 0; i0 < ne[0]; i0++) {
  98. ((float *)tensor->data)[i2*ne[1]*ne[0] + i1*ne[0] + i0] = scale * frand_normal(rnd);
  99. }
  100. }
  101. }
  102. break;
  103. case 4:
  104. scale /= sqrtf(ne[0]+ne[1]);
  105. for (int i3 = 0; i3 < ne[3]; i3++) {
  106. for (int i2 = 0; i2 < ne[2]; i2++) {
  107. for (int i1 = 0; i1 < ne[1]; i1++) {
  108. for (int i0 = 0; i0 < ne[0]; i0++) {
  109. ((float *)tensor->data)[i3*ne[2]*ne[1]*ne[0] + i2*ne[1]*ne[0] + i1*ne[0] + i0] = scale * frand_normal(rnd);
  110. }
  111. }
  112. }
  113. }
  114. break;
  115. default:
  116. assert(false);
  117. };
  118. return tensor;
  119. }
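
// The scale above is a Xavier/Glorot-style initialization: entries are drawn from a
// clamped normal distribution and multiplied by 1/sqrt(fan), where fan is ne[0] for
// 1d tensors and ne[0]+ne[1] (fan_in + fan_out) for the higher-dimensional cases.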
struct llama_hparams {
    uint32_t n_vocab = 32000;
    uint32_t n_ctx   = 512;   // this is provided as user input?
    uint32_t n_embd  = 4096;
    uint32_t n_mult  = 4;
    uint32_t n_head  = 32;
    uint32_t n_layer = 32;
    uint32_t n_rot   = 64;

    bool operator!=(const llama_hparams & other) const {
        return memcmp(this, &other, sizeof(llama_hparams)) != 0;
    }
};

uint32_t get_n_ff(const struct llama_hparams* hparams) {
    const uint32_t n_ff = ((2*(4*hparams->n_embd)/3 + hparams->n_mult - 1)/hparams->n_mult)*hparams->n_mult;
    return n_ff;
}
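
// Worked example of the feed-forward width: with the defaults above (n_embd = 4096, n_mult = 4),
//   4*n_embd          = 16384
//   2*(4*n_embd)/3    = 10922   (integer division)
//   + n_mult - 1      = 10925
//   /n_mult * n_mult  = 10924   (rounded up to a multiple of n_mult)
// so n_ff = 10924. With the toy hparams used in main() (n_embd = 32, n_mult = 2), n_ff = 86.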
struct llama_hparams_lora {
    uint32_t n_vocab = 32000;
    uint32_t n_ctx   = 512;   // this is provided as user input?
    uint32_t n_embd  = 4096;
    uint32_t n_mult  = 4;
    uint32_t n_head  = 32;
    uint32_t n_layer = 32;
    uint32_t n_rot   = 64;
    uint32_t n_lora  = 64;

    bool operator!=(const llama_hparams_lora & other) const {
        return memcmp(this, &other, sizeof(llama_hparams_lora)) != 0;
    }
};

struct llama_layer {
    // normalization
    struct ggml_tensor * attention_norm;

    // attention
    struct ggml_tensor * wq;
    struct ggml_tensor * wk;
    struct ggml_tensor * wv;
    struct ggml_tensor * wo;

    // normalization
    struct ggml_tensor * ffn_norm;

    // ff
    struct ggml_tensor * w1;
    struct ggml_tensor * w2;
    struct ggml_tensor * w3;
};

struct llama_layer_lora {
    // normalization
    struct ggml_tensor * attention_norm;

    // attention
    struct ggml_tensor * wqa;
    struct ggml_tensor * wqb;
    struct ggml_tensor * wka;
    struct ggml_tensor * wkb;
    struct ggml_tensor * wva;
    struct ggml_tensor * wvb;
    struct ggml_tensor * woa;
    struct ggml_tensor * wob;

    // normalization
    struct ggml_tensor * ffn_norm;

    // ff
    struct ggml_tensor * w1;
    struct ggml_tensor * w2;
    struct ggml_tensor * w3;
};
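
// In the LoRA variant each full attention weight is stored as a low-rank pair: the *b
// tensor maps the n_embd activation down to n_lora and the *a tensor maps it back up to
// n_embd, so the effective weight (e.g. wqa*wqb) has rank at most n_lora. The same
// factorization is used for the output head (outputa/outputb); see forward_lora() below.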
struct llama_kv_cache {
    struct ggml_context * ctx = NULL;

    struct ggml_tensor * k;
    struct ggml_tensor * v;

    // llama_ctx_buffer buf;

    int n; // number of tokens currently in the cache
};

struct llama_model {
    struct ggml_context * ctx = NULL;

    llama_hparams hparams;

    struct ggml_tensor * tok_embeddings;

    struct ggml_tensor * norm;
    struct ggml_tensor * output;

    std::vector<llama_layer> layers;
};

struct llama_model_lora {
    struct ggml_context * ctx = NULL;

    llama_hparams_lora hparams;

    struct ggml_tensor * tok_embeddings;

    struct ggml_tensor * norm;
    struct ggml_tensor * outputa;
    struct ggml_tensor * outputb;

    std::vector<llama_layer_lora> layers;
};

void init_model(struct llama_model * model) {
    const auto & hparams = model->hparams;

    const uint32_t n_embd  = hparams.n_embd;
    const uint32_t n_layer = hparams.n_layer;
    const uint32_t n_vocab = hparams.n_vocab;

    const uint32_t n_ff = get_n_ff(&hparams);

    struct ggml_context * ctx = model->ctx;

    model->tok_embeddings = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab); // ("tok_embeddings.weight", {n_embd, n_vocab});

    model->norm   = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);          // ("norm.weight",   {n_embd});
    model->output = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab); // ("output.weight", {n_embd, n_vocab});

    model->layers.resize(n_layer);
    for (uint32_t i = 0; i < n_layer; ++i) {
        auto & layer = model->layers[i];

        // std::string layers_i = "layers." + std::to_string(i);

        layer.attention_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); // (layers_i + ".attention_norm.weight", {n_embd});

        layer.wq = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd); // (layers_i + ".attention.wq.weight", {n_embd, n_embd});
        layer.wk = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd); // (layers_i + ".attention.wk.weight", {n_embd, n_embd});
        layer.wv = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd); // (layers_i + ".attention.wv.weight", {n_embd, n_embd});
        layer.wo = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd); // (layers_i + ".attention.wo.weight", {n_embd, n_embd});

        layer.ffn_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); // (layers_i + ".ffn_norm.weight", {n_embd});

        layer.w1 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd,   n_ff); // (layers_i + ".feed_forward.w1.weight", {n_embd,   n_ff});
        layer.w2 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32,   n_ff, n_embd); // (layers_i + ".feed_forward.w2.weight", {  n_ff, n_embd});
        layer.w3 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd,   n_ff); // (layers_i + ".feed_forward.w3.weight", {n_embd,   n_ff});
    }
}

void init_model_lora(struct llama_model_lora * model) {
    const auto & hparams = model->hparams;

    const uint32_t n_embd  = hparams.n_embd;
    const uint32_t n_mult  = hparams.n_mult;
    const uint32_t n_layer = hparams.n_layer;
    const uint32_t n_vocab = hparams.n_vocab;
    const uint32_t n_lora  = hparams.n_lora;

    const uint32_t n_ff = ((2*(4*n_embd)/3 + n_mult - 1)/n_mult)*n_mult;

    struct ggml_context * ctx = model->ctx;

    model->tok_embeddings = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab); // ("tok_embeddings.weight", {n_embd, n_vocab});

    model->norm    = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);          // ("norm.weight",   {n_embd});
    model->outputa = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_lora, n_vocab); // ("output.weight", {n_embd, n_vocab});
    model->outputb = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_lora);  // ("output.weight", {n_embd, n_vocab});

    model->layers.resize(n_layer);
    for (uint32_t i = 0; i < n_layer; ++i) {
        auto & layer = model->layers[i];

        // std::string layers_i = "layers." + std::to_string(i);

        layer.attention_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); // (layers_i + ".attention_norm.weight", {n_embd});

        layer.wqa = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_lora, n_embd); // (layers_i + ".attention.wq.weight", {n_embd, n_embd});
        layer.wqb = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_lora); // (layers_i + ".attention.wq.weight", {n_embd, n_embd});
        layer.wka = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_lora, n_embd); // (layers_i + ".attention.wk.weight", {n_embd, n_embd});
        layer.wkb = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_lora); // (layers_i + ".attention.wk.weight", {n_embd, n_embd});
        layer.wva = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_lora, n_embd); // (layers_i + ".attention.wv.weight", {n_embd, n_embd});
        layer.wvb = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_lora); // (layers_i + ".attention.wv.weight", {n_embd, n_embd});
        layer.woa = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_lora, n_embd); // (layers_i + ".attention.wo.weight", {n_embd, n_embd});
        layer.wob = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_lora); // (layers_i + ".attention.wo.weight", {n_embd, n_embd});

        layer.ffn_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); // (layers_i + ".ffn_norm.weight", {n_embd});

        layer.w1 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd,   n_ff); // (layers_i + ".feed_forward.w1.weight", {n_embd,   n_ff});
        layer.w2 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32,   n_ff, n_embd); // (layers_i + ".feed_forward.w2.weight", {  n_ff, n_embd});
        layer.w3 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd,   n_ff); // (layers_i + ".feed_forward.w3.weight", {n_embd,   n_ff});
    }
}

void set_param_model(struct llama_model * model) {
    const auto& hparams = model->hparams;

    const uint32_t n_layer = hparams.n_layer;

    struct ggml_context* ctx = model->ctx;

    ggml_set_param(ctx, model->tok_embeddings);
    ggml_set_param(ctx, model->norm);
    ggml_set_param(ctx, model->output);

    for (uint32_t i = 0; i < n_layer; ++i) {
        auto & layer = model->layers[i];

        ggml_set_param(ctx, layer.attention_norm);
        ggml_set_param(ctx, layer.wq);
        ggml_set_param(ctx, layer.wk);
        ggml_set_param(ctx, layer.wv);
        ggml_set_param(ctx, layer.wo);
        ggml_set_param(ctx, layer.ffn_norm);
        ggml_set_param(ctx, layer.w1);
        ggml_set_param(ctx, layer.w2);
        ggml_set_param(ctx, layer.w3);
    }
}

void set_param_model_lora(struct llama_model_lora * model) {
    const auto& hparams = model->hparams;

    const uint32_t n_layer = hparams.n_layer;

    struct ggml_context* ctx = model->ctx;

    ggml_set_param(ctx, model->tok_embeddings);
    ggml_set_param(ctx, model->norm);
    ggml_set_param(ctx, model->outputa);
    ggml_set_param(ctx, model->outputb);

    for (uint32_t i = 0; i < n_layer; ++i) {
        auto & layer = model->layers[i];

        ggml_set_param(ctx, layer.attention_norm);
        ggml_set_param(ctx, layer.wqa);
        ggml_set_param(ctx, layer.wqb);
        ggml_set_param(ctx, layer.wka);
        ggml_set_param(ctx, layer.wkb);
        ggml_set_param(ctx, layer.wva);
        ggml_set_param(ctx, layer.wvb);
        ggml_set_param(ctx, layer.woa);
        ggml_set_param(ctx, layer.wob);
        ggml_set_param(ctx, layer.ffn_norm);
        ggml_set_param(ctx, layer.w1);
        ggml_set_param(ctx, layer.w2);
        ggml_set_param(ctx, layer.w3);
    }
}

void randomize_model(struct llama_model * model, int seed, float mean, float std, float min, float max) {
    const auto & hparams = model->hparams;

    const uint32_t n_layer = hparams.n_layer;

    struct random_normal_distribution rnd;
    init_random_normal_distribution(&rnd, seed, mean, std, min, max);
    randomize_tensor_normal(model->tok_embeddings, model->tok_embeddings->n_dims, model->tok_embeddings->ne, &rnd);
    randomize_tensor_normal(model->norm,   model->norm->n_dims,   model->norm->ne,   &rnd);
    randomize_tensor_normal(model->output, model->output->n_dims, model->output->ne, &rnd);

    for (uint32_t i = 0; i < n_layer; ++i) {
        auto & layer = model->layers[i];
        randomize_tensor_normal(layer.attention_norm, layer.attention_norm->n_dims, layer.attention_norm->ne, &rnd);

        randomize_tensor_normal(layer.wq, layer.wq->n_dims, layer.wq->ne, &rnd);
        randomize_tensor_normal(layer.wk, layer.wk->n_dims, layer.wk->ne, &rnd);
        randomize_tensor_normal(layer.wv, layer.wv->n_dims, layer.wv->ne, &rnd);
        randomize_tensor_normal(layer.wo, layer.wo->n_dims, layer.wo->ne, &rnd);

        randomize_tensor_normal(layer.ffn_norm, layer.ffn_norm->n_dims, layer.ffn_norm->ne, &rnd);

        randomize_tensor_normal(layer.w1, layer.w1->n_dims, layer.w1->ne, &rnd);
        randomize_tensor_normal(layer.w2, layer.w2->n_dims, layer.w2->ne, &rnd);
        randomize_tensor_normal(layer.w3, layer.w3->n_dims, layer.w3->ne, &rnd);
    }
}

void randomize_model_lora(struct llama_model_lora * model, int seed, float mean, float std, float min, float max) {
    const auto & hparams = model->hparams;

    const uint32_t n_layer = hparams.n_layer;

    struct random_normal_distribution rnd;
    init_random_normal_distribution(&rnd, seed, mean, std, min, max);
    randomize_tensor_normal(model->tok_embeddings, model->tok_embeddings->n_dims, model->tok_embeddings->ne, &rnd);
    randomize_tensor_normal(model->norm,    model->norm->n_dims,    model->norm->ne,    &rnd);
    randomize_tensor_normal(model->outputa, model->outputa->n_dims, model->outputa->ne, &rnd);
    randomize_tensor_normal(model->outputb, model->outputb->n_dims, model->outputb->ne, &rnd);

    for (uint32_t i = 0; i < n_layer; ++i) {
        auto & layer = model->layers[i];
        randomize_tensor_normal(layer.attention_norm, layer.attention_norm->n_dims, layer.attention_norm->ne, &rnd);

        randomize_tensor_normal(layer.wqa, layer.wqa->n_dims, layer.wqa->ne, &rnd);
        randomize_tensor_normal(layer.wqb, layer.wqb->n_dims, layer.wqb->ne, &rnd);
        randomize_tensor_normal(layer.wka, layer.wka->n_dims, layer.wka->ne, &rnd);
        randomize_tensor_normal(layer.wkb, layer.wkb->n_dims, layer.wkb->ne, &rnd);
        randomize_tensor_normal(layer.wva, layer.wva->n_dims, layer.wva->ne, &rnd);
        randomize_tensor_normal(layer.wvb, layer.wvb->n_dims, layer.wvb->ne, &rnd);
        randomize_tensor_normal(layer.woa, layer.woa->n_dims, layer.woa->ne, &rnd);
        randomize_tensor_normal(layer.wob, layer.wob->n_dims, layer.wob->ne, &rnd);

        randomize_tensor_normal(layer.ffn_norm, layer.ffn_norm->n_dims, layer.ffn_norm->ne, &rnd);

        randomize_tensor_normal(layer.w1, layer.w1->n_dims, layer.w1->ne, &rnd);
        randomize_tensor_normal(layer.w2, layer.w2->n_dims, layer.w2->ne, &rnd);
        randomize_tensor_normal(layer.w3, layer.w3->n_dims, layer.w3->ne, &rnd);
    }
}

bool init_kv_cache(struct llama_kv_cache* cache, struct llama_model * model, int n_batch) {
    const auto & hparams = model->hparams;

    const uint32_t n_ctx   = hparams.n_ctx;
    const uint32_t n_embd  = hparams.n_embd;
    const uint32_t n_layer = hparams.n_layer;

    const int64_t n_mem      = n_layer*n_ctx*n_batch;
    const int64_t n_elements = n_embd*n_mem;

    // cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB);

    // struct ggml_init_params params;
    // params.mem_size   = cache.buf.size;
    // params.mem_buffer = cache.buf.addr;
    // params.no_alloc   = false;
    if (!cache->ctx) {
        struct ggml_init_params params;
        params.mem_size   = 2u*n_elements*ggml_type_size(GGML_TYPE_F32) + 2u*1024*1024;
        params.mem_buffer = NULL;
        params.no_alloc   = false;

        cache->ctx = ggml_init(params);

        if (!cache->ctx) {
            fprintf(stderr, "%s: failed to allocate memory for kv cache\n", __func__);
            return false;
        }
    }

    cache->k = ggml_new_tensor_1d(cache->ctx, GGML_TYPE_F32, n_elements);
    cache->v = ggml_new_tensor_1d(cache->ctx, GGML_TYPE_F32, n_elements);

    return true;
}
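
// Rough sizing of the cache context above: K and V each hold n_embd*n_layer*n_ctx*n_batch
// floats, so mem_size is 2*n_elements*sizeof(float) plus ~2 MiB of slack for tensor
// headers. With the toy model configured in main() (n_embd = 32, n_layer = 1, n_ctx = 8)
// and, say, n_batch = 8, that is 2*2048*4 bytes, i.e. about 16 KiB of tensor data.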
bool init_kv_cache_lora(struct llama_kv_cache* cache, struct llama_model_lora * model, int n_batch) {
    const auto & hparams = model->hparams;

    const uint32_t n_ctx   = hparams.n_ctx;
    const uint32_t n_embd  = hparams.n_embd;
    const uint32_t n_layer = hparams.n_layer;

    const int64_t n_mem      = n_layer*n_ctx*n_batch;
    const int64_t n_elements = n_embd*n_mem;

    // cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB);

    // struct ggml_init_params params;
    // params.mem_size   = cache.buf.size;
    // params.mem_buffer = cache.buf.addr;
    // params.no_alloc   = false;
    if (!cache->ctx) {
        struct ggml_init_params params;
        params.mem_size   = 2u*n_elements*ggml_type_size(GGML_TYPE_F32) + 2u*1024*1024;
        params.mem_buffer = NULL;
        params.no_alloc   = false;

        cache->ctx = ggml_init(params);

        if (!cache->ctx) {
            fprintf(stderr, "%s: failed to allocate memory for kv cache\n", __func__);
            return false;
        }
    }

    cache->k = ggml_new_tensor_1d(cache->ctx, GGML_TYPE_F32, n_elements);
    cache->v = ggml_new_tensor_1d(cache->ctx, GGML_TYPE_F32, n_elements);

    return true;
}

struct ggml_tensor * forward(
        struct llama_model    * model,
        struct llama_kv_cache * cache,
        struct ggml_context   * ctx0,
        struct ggml_cgraph    * gf,
        struct ggml_tensor    * tokens_input,
        const  int              n_tokens,
        const  int              n_past) {
    const int N = n_tokens;

    struct llama_kv_cache& kv_self = *cache;
    const auto & hparams = model->hparams;
    const int n_ctx   = hparams.n_ctx;
    const int n_embd  = hparams.n_embd;
    const int n_layer = hparams.n_layer;
    const int n_head  = hparams.n_head;
    const int n_rot   = hparams.n_rot;

    struct ggml_tensor * tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
    memcpy(tokens->data, tokens_input->data, N*ggml_element_size(tokens));

    struct ggml_tensor * kc = kv_self.k;
    struct ggml_tensor * vc = kv_self.v;

    // inpL shape [n_embd,N,1,1]
    struct ggml_tensor * inpL = ggml_get_rows(ctx0, model->tok_embeddings, tokens);
    for (int il = 0; il < n_layer; ++il) {
        struct ggml_tensor * inpSA = inpL;

        struct ggml_tensor * cur;

        // lctx.use_buf(ctx0, 0);

        // norm
        {
            // cur shape [n_embd,N,1,1]
            cur = ggml_rms_norm(ctx0, inpL);

            // cur = attention_norm*cur
            cur = ggml_mul(ctx0,
                        ggml_repeat(ctx0, model->layers[il].attention_norm, cur),
                        cur);
        }

        // self-attention
        {
            // compute Q and K and RoPE them
            // wq   shape [n_embd, n_embd, 1, 1]
            // wk   shape [n_embd, n_embd, 1, 1]
            // Qcur shape [n_embd/n_head, n_head, N, 1]
            // Kcur shape [n_embd/n_head, n_head, N, 1]
            struct ggml_tensor * Qcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0, 0);
            struct ggml_tensor * Kcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0, 0);

            // store key and value to memory
            {
                // compute the transposed [N, n_embd] V matrix
                // wv   shape [n_embd, n_embd, 1, 1]
                // Vcur shape [n_embd, N, 1, 1]
                struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wv, cur), n_embd, N)));

                // kv_self.k shape [n_embd * n_ctx * n_layer, 1]
                // kv_self.v shape [n_embd * n_ctx * n_layer, 1]
                // k         shape [n_embd * N, 1]   == kv_self.k[:,n_past:n_past+N,il,0]
                // v         shape [N, n_embd, 1, 1] == kv_self.v[:,n_past:n_past+N,il,0]

                /* {
                    struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past));
                    struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd,
                            (   n_ctx)*ggml_element_size(kv_self.v),
                            (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v));

                    // important: storing RoPE-ed version of K in the KV cache!
                    ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
                    ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
                } //*/

                kc = ggml_set_1d(ctx0, kc, ggml_reshape_1d(ctx0, Kcur, n_embd*N), (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past));
                vc = ggml_set_2d(ctx0, vc, Vcur, (   n_ctx)*ggml_element_size(kv_self.v),
                        (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v));
            }

            // Qcur shape [n_embd/n_head, n_head, N, 1]
            // Q    shape [n_embd/n_head, N, n_head, 1]
            struct ggml_tensor * Q =
                ggml_permute(ctx0,
                        Qcur,
                        0, 2, 1, 3);

            // kv_self.k shape [n_embd * n_ctx * n_layer, 1]
            // K shape [n_embd/n_head, n_past + N, n_head, 1]
            struct ggml_tensor * K =
                ggml_permute(ctx0,
                        ggml_reshape_3d(ctx0,
                            ggml_view_1d(ctx0, kc, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(kc)*n_embd),
                            n_embd/n_head, n_head, n_past + N),
                        0, 2, 1, 3);

            // K * Q
            // KQ shape [n_past + N, N, n_head, 1]
            struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);

            // KQ_scaled = KQ / sqrt(n_embd/n_head)
            // KQ_scaled shape [n_past + N, N, n_head, 1]
            struct ggml_tensor * KQ_scaled =
                ggml_scale(ctx0,
                        KQ,
                        ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head)));

            // KQ_masked = mask_past(KQ_scaled)
            // KQ_masked shape [n_past + N, N, n_head, 1]
            struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past);

            // KQ = soft_max(KQ_masked)
            // KQ_soft_max shape [n_past + N, N, n_head, 1]
            struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);

            // split cached V into n_head heads
            //// V shape [n_past + N, n_embd/n_head, n_head, 1]
            // V shape [n_past + N, n_embd/n_head, n_head, 1] == kv_self.v[:,:(n_past+N),il,1]
            struct ggml_tensor * V =
                ggml_view_3d(ctx0, vc,
                        n_past + N, n_embd/n_head, n_head,
                        n_ctx*ggml_element_size(vc),
                        n_ctx*ggml_element_size(vc)*n_embd/n_head,
                        il*n_ctx*ggml_element_size(vc)*n_embd);

            // KQV shape [n_embd/n_head, N, n_head, 1]
            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);

            // KQV_merged = KQV.permute(0, 2, 1, 3)
            // KQV_merged shape [n_embd/n_head, n_head, N, 1]
            struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
            // KQV_merged shape

            // cur = KQV_merged.contiguous().view(n_embd, N)
            // cur shape [n_embd,N,1,1]
            cur = ggml_reshape_2d(ctx0, ggml_cont(ctx0, KQV_merged), n_embd, N);
            // cur = ggml_cpy(ctx0,
            //         KQV_merged,
            //         ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));

            // projection (no bias)
            // cur shape [n_embd,N,1,1]
            cur = ggml_mul_mat(ctx0,
                    model->layers[il].wo,
                    cur);
        }

        // lctx.use_buf(ctx0, 1);

        // inpFF shape [n_embd,N,1,1]
        struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA);

        // feed-forward network
        {
            // norm
            {
                // cur shape [n_embd,N,1,1]
                cur = ggml_rms_norm(ctx0, inpFF);

                // cur = ffn_norm*cur
                // cur shape [n_embd,N,1,1]
                cur = ggml_mul(ctx0,
                        ggml_repeat(ctx0, model->layers[il].ffn_norm, cur),
                        cur);
            }

            // tmp shape [n_ff,N,1,1]
            struct ggml_tensor * tmp = ggml_mul_mat(ctx0,
                    model->layers[il].w3,
                    cur);

            // cur shape [n_ff,N,1,1]
            cur = ggml_mul_mat(ctx0,
                    model->layers[il].w1,
                    cur);

            // SILU activation
            // cur shape [n_ff,N,1,1]
            cur = ggml_silu(ctx0, cur);

            // cur shape [n_ff,N,1,1]
            cur = ggml_mul(ctx0, cur, tmp);

            // cur shape [n_embd,N,1,1]
            cur = ggml_mul_mat(ctx0,
                    model->layers[il].w2,
                    cur);
        }

        // cur shape [n_embd,N,1,1]
        cur = ggml_add(ctx0, cur, inpFF);

        // input for next layer
        // inpL shape [n_embd,N,1,1]
        inpL = cur;
    }

    // norm
    {
        // inpL shape [n_embd,N,1,1]
        inpL = ggml_rms_norm(ctx0, inpL);

        // inpL = norm*inpL
        // inpL shape [n_embd,N,1,1]
        inpL = ggml_mul(ctx0,
                    ggml_repeat(ctx0, model->norm, inpL),
                    inpL);

        //embeddings = inpL;
    }

    // lm_head
    // inpL shape [n_vocab,N,1,1]
    inpL = ggml_mul_mat(ctx0, model->output, inpL);

    // run the computation
    ggml_build_forward_expand(gf, inpL);

    return inpL;
}
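
// Usage sketch for forward(): a minimal example, assuming a preallocated scratch buffer
// (compute_buf/compute_size are hypothetical names) and the ggml graph API this file was
// written against; the caller builds a fresh context and graph per step and then reads
// the returned logits, e.g. with sample_softmax() defined further below.
//
//   struct ggml_init_params params = { /*.mem_size =*/ compute_size, /*.mem_buffer =*/ compute_buf, /*.no_alloc =*/ false };
//   struct ggml_context * ctx0 = ggml_init(params);
//   struct ggml_cgraph gf = {};
//   struct ggml_tensor * logits = forward(&model, &kv_cache, ctx0, &gf, tokens_input, n_tokens, n_past);
//   ggml_graph_compute(ctx0, &gf);
//   ggml_free(ctx0);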
void assert_shape_1d(struct ggml_tensor * tensor, int64_t ne0) {
    GGML_ASSERT(tensor->n_dims == 1);
    GGML_ASSERT(tensor->ne[0] == ne0);
}

void assert_shape_2d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1) {
    GGML_ASSERT(tensor->n_dims == 2);
    GGML_ASSERT(tensor->ne[0] == ne0);
    GGML_ASSERT(tensor->ne[1] == ne1);
}

void assert_shape_3d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2) {
    GGML_ASSERT(tensor->n_dims == 3);
    GGML_ASSERT(tensor->ne[0] == ne0);
    GGML_ASSERT(tensor->ne[1] == ne1);
    GGML_ASSERT(tensor->ne[2] == ne2);
}

void assert_shape_4d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3) {
    GGML_ASSERT(tensor->n_dims == 4);
    GGML_ASSERT(tensor->ne[0] == ne0);
    GGML_ASSERT(tensor->ne[1] == ne1);
    GGML_ASSERT(tensor->ne[2] == ne2);
    GGML_ASSERT(tensor->ne[3] == ne3);
}

struct ggml_tensor * forward_batch(
        struct llama_model    * model,
        struct llama_kv_cache * cache,
        struct ggml_context   * ctx0,
        struct ggml_cgraph    * gf,
        struct ggml_tensor    * tokens_input,
        const  int              n_tokens,
        const  int              n_past,
        const  int              n_batch) {
    const int N = n_tokens;

    struct llama_kv_cache& kv_self = *cache;
    const auto & hparams = model->hparams;
    const int n_ctx   = hparams.n_ctx;
    const int n_vocab = hparams.n_vocab;
    const int n_embd  = hparams.n_embd;
    const int n_layer = hparams.n_layer;
    const int n_head  = hparams.n_head;
    const int n_rot   = hparams.n_rot;
    const int n_ff    = get_n_ff(&hparams);

    struct ggml_tensor * tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N*n_batch);
    memcpy(tokens->data, tokens_input->data, ggml_element_size(tokens)*N*n_batch);

    struct ggml_tensor * kc = kv_self.k;
    struct ggml_tensor * vc = kv_self.v;

    // inpL shape [n_embd,N*n_batch,1]
    struct ggml_tensor * inpL = ggml_get_rows(ctx0, model->tok_embeddings, tokens);
    assert_shape_2d(inpL, n_embd, N*n_batch);
    for (int il = 0; il < n_layer; ++il) {
        struct ggml_tensor * inpSA = inpL;

        struct ggml_tensor * cur;

        // lctx.use_buf(ctx0, 0);

        // norm
        {
            // cur shape [n_embd,N*n_batch,1,1]
            cur = ggml_rms_norm(ctx0, inpL);
            assert_shape_2d(cur, n_embd, N*n_batch);

            // cur = attention_norm*cur
            cur = ggml_mul(ctx0,
                        ggml_repeat(ctx0, model->layers[il].attention_norm, cur),
                        cur);
            assert_shape_2d(cur, n_embd, N*n_batch);
        }

        // self-attention
        {
            // compute Q and K and RoPE them
            // wq   shape [n_embd, n_embd, 1, 1]
            // wk   shape [n_embd, n_embd, 1, 1]
            // Qcur shape [n_embd/n_head, n_head, N, n_batch]
            // Kcur shape [n_embd/n_head, n_head, N, n_batch]
            struct ggml_tensor * Qcur = ggml_rope(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0, 0);
            struct ggml_tensor * Kcur = ggml_rope(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0, 0);
            assert_shape_4d(Qcur, n_embd/n_head, n_head, N, n_batch);
            assert_shape_4d(Kcur, n_embd/n_head, n_head, N, n_batch);

            // store key and value to memory
            {
                // compute the transposed [N, n_embd] V matrix
                // wv   shape [n_embd, n_embd, 1, 1]
                // Vcur shape [N, n_embd, n_batch, 1]
                struct ggml_tensor * Vcur = ggml_cont(ctx0,
                    ggml_permute(ctx0,
                        ggml_reshape_3d(ctx0,
                            ggml_mul_mat(ctx0,
                                model->layers[il].wv,
                                cur),
                            n_embd, N, n_batch),
                        1, 0, 2, 3));
                assert_shape_3d(Vcur, N, n_embd, n_batch);

                // kv_self.k shape [n_embd * n_ctx * n_batch * n_layer]
                // kv_self.v shape [n_ctx * n_embd * n_batch * n_layer]
                // k         shape [n_embd * N, n_batch]   == kv_self.k[:,n_past:n_past+N,:,il]
                // v         shape [N, n_embd, n_batch, 1] == kv_self.v[:,n_past:n_past+N,:,il]

                /* {
                    struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past));
                    struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd,
                            (   n_ctx)*ggml_element_size(kv_self.v),
                            (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v));

                    // important: storing RoPE-ed version of K in the KV cache!
                    ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
                    ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
                } //*/

                kc = ggml_set_2d(ctx0, kc,
                        ggml_reshape_2d(ctx0, Kcur, n_embd*N, n_batch),
                        ggml_element_size(kc)*n_embd*n_ctx,
                        (ggml_element_size(kc)*n_embd)*(il*n_batch*n_ctx + n_past));
                vc = ggml_set_2d(ctx0, vc,
                        ggml_reshape_2d(ctx0, Vcur, N*n_embd, n_batch),
                        ggml_element_size(vc)*n_ctx*n_embd,
                        ggml_element_size(vc)*(n_past + il*n_embd*n_batch*n_ctx));

                assert_shape_1d(kc, n_embd * n_ctx * n_batch * n_layer);
                assert_shape_1d(vc, n_embd * n_ctx * n_batch * n_layer);
            }

            // Qcur shape [n_embd/n_head, n_head, N, n_batch]
            // Q    shape [n_embd/n_head, N, n_head, n_batch]
            struct ggml_tensor * Q =
                ggml_permute(ctx0,
                        Qcur,
                        0, 2, 1, 3);
            assert_shape_4d(Q, n_embd/n_head, N, n_head, n_batch);

            // kv_self.k shape [n_embd * n_ctx * n_batch * n_layer]
            // K shape [n_embd/n_head, n_past + N, n_head, n_batch]
            struct ggml_tensor * K =
                ggml_permute(ctx0,
                        ggml_reshape_4d(ctx0,
                            ggml_view_3d(ctx0,
                                kc,
                                n_embd,
                                (n_past + N),
                                n_batch,
                                n_embd*ggml_element_size(kc),
                                n_ctx*n_embd*ggml_element_size(kc),
                                il*n_batch*n_ctx*n_embd*ggml_element_size(kc)),
                            n_embd/n_head, n_head, n_past + N, n_batch),
                        0, 2, 1, 3);
            assert_shape_4d(K, n_embd/n_head, n_past + N, n_head, n_batch);

            // K * Q
            // KQ shape [n_past + N, N, n_head, n_batch]
            struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
            assert_shape_4d(KQ, n_past + N, N, n_head, n_batch);

            // KQ_scaled = KQ / sqrt(n_embd/n_head)
            // KQ_scaled shape [n_past + N, N, n_head, n_batch]
            struct ggml_tensor * KQ_scaled =
                ggml_scale(ctx0,
                        KQ,
                        ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head)));
            assert_shape_4d(KQ_scaled, n_past + N, N, n_head, n_batch);

            // KQ_masked = mask_past(KQ_scaled)
            // KQ_masked shape [n_past + N, N, n_head, n_batch]
            struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past);
            assert_shape_4d(KQ_masked, n_past + N, N, n_head, n_batch);

            // KQ = soft_max(KQ_masked)
            // KQ_soft_max shape [n_past + N, N, n_head, n_batch]
            struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
            assert_shape_4d(KQ_soft_max, n_past + N, N, n_head, n_batch);

            // split cached V into n_head heads
            // kv_self.v shape [n_ctx * n_embd * n_batch * n_layer]
            // V shape [n_past + N, n_embd/n_head, n_head, n_batch] == kv_self.v[:(n_past+N),:,:,il]
            struct ggml_tensor * V =
                ggml_view_4d(ctx0, vc,
                        n_past + N, n_embd/n_head, n_head, n_batch,
                        ggml_element_size(vc)*n_ctx,
                        ggml_element_size(vc)*n_ctx*n_embd/n_head,
                        ggml_element_size(vc)*n_ctx*n_embd,
                        il*n_batch*n_ctx*n_embd*ggml_element_size(vc));
            assert_shape_4d(V, n_past + N, n_embd/n_head, n_head, n_batch);

            // KQV shape [n_embd/n_head, N, n_head, n_batch]
            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
            assert_shape_4d(KQV, n_embd/n_head, N, n_head, n_batch);

            // KQV_merged = KQV.permute(0, 2, 1, 3)
            // KQV_merged shape [n_embd/n_head, n_head, N, n_batch]
            struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
            assert_shape_4d(KQV_merged, n_embd/n_head, n_head, N, n_batch);
            // KQV_merged shape

            // cur = KQV_merged.contiguous().view(n_embd, N)
            // cur shape [n_embd,N*n_batch,1,1]
            cur = ggml_reshape_2d(ctx0, ggml_cont(ctx0, KQV_merged), n_embd, N*n_batch);
            assert_shape_2d(cur, n_embd, N*n_batch);
            // cur = ggml_cpy(ctx0,
            //         KQV_merged,
            //         ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));

            // projection (no bias)
            // cur shape [n_embd,N*n_batch,1,1]
            cur = ggml_mul_mat(ctx0,
                    model->layers[il].wo,
                    cur);
            assert_shape_2d(cur, n_embd, N*n_batch);
        }

        // lctx.use_buf(ctx0, 1);

        // inpFF shape [n_embd,N*n_batch,1,1]
        struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA);
        assert_shape_2d(inpFF, n_embd, N*n_batch);

        // feed-forward network
        {
            // norm
            {
                // cur shape [n_embd,N*n_batch,1,1]
                cur = ggml_rms_norm(ctx0, inpFF);
                assert_shape_2d(cur, n_embd, N*n_batch);

                // cur = ffn_norm*cur
                // cur shape [n_embd,N*n_batch,1,1]
                cur = ggml_mul(ctx0,
                        ggml_repeat(ctx0, model->layers[il].ffn_norm, cur),
                        cur);
                assert_shape_2d(cur, n_embd, N*n_batch);
            }

            // tmp shape [n_ff,N*n_batch,1,1]
            struct ggml_tensor * tmp = ggml_mul_mat(ctx0,
                    model->layers[il].w3,
                    cur);
            assert_shape_2d(tmp, n_ff, N*n_batch);

            // cur shape [n_ff,N*n_batch,1,1]
            cur = ggml_mul_mat(ctx0,
                    model->layers[il].w1,
                    cur);
            assert_shape_2d(cur, n_ff, N*n_batch);

            // SILU activation
            // cur shape [n_ff,N*n_batch,1,1]
            cur = ggml_silu(ctx0, cur);
            assert_shape_2d(cur, n_ff, N*n_batch);

            // cur shape [n_ff,N*n_batch,1,1]
            cur = ggml_mul(ctx0, cur, tmp);
            assert_shape_2d(cur, n_ff, N*n_batch);

            // cur shape [n_embd,N*n_batch,1,1]
            cur = ggml_mul_mat(ctx0,
                    model->layers[il].w2,
                    cur);
            assert_shape_2d(cur, n_embd, N*n_batch);
        }

        // cur shape [n_embd,N*n_batch,1,1]
        cur = ggml_add(ctx0, cur, inpFF);
        assert_shape_2d(cur, n_embd, N*n_batch);

        // input for next layer
        // inpL shape [n_embd,N*n_batch,1,1]
        inpL = cur;
        assert_shape_2d(inpL, n_embd, N*n_batch);
    }

    // norm
    {
        // inpL shape [n_embd,N*n_batch,1,1]
        inpL = ggml_rms_norm(ctx0, inpL);
        assert_shape_2d(inpL, n_embd, N*n_batch);

        // inpL = norm*inpL
        // inpL shape [n_embd,N*n_batch,1,1]
        inpL = ggml_mul(ctx0,
                    ggml_repeat(ctx0, model->norm, inpL),
                    inpL);
        assert_shape_2d(inpL, n_embd, N*n_batch);

        //embeddings = inpL;
    }

    // lm_head
    // inpL shape [n_vocab,N*n_batch,1,1]
    inpL = ggml_mul_mat(ctx0, model->output, inpL);
    assert_shape_2d(inpL, n_vocab, N*n_batch);

    {
        // inpL shape [n_vocab,N,n_batch,1]
        inpL = ggml_reshape_3d(ctx0,
                        inpL,
                        n_vocab, N, n_batch);
        assert_shape_3d(inpL, n_vocab, N, n_batch);
    }

    // run the computation
    ggml_build_forward_expand(gf, inpL);

    return inpL;
}
struct ggml_tensor * forward_lora(
        struct llama_model_lora * model,
        struct llama_kv_cache   * cache,
        struct ggml_context     * ctx0,
        struct ggml_cgraph      * gf,
        struct ggml_tensor      * tokens_input,
        const  int                n_tokens,
        const  int                n_past) {
    const int N = n_tokens;

    struct llama_kv_cache& kv_self = *cache;
    const auto & hparams = model->hparams;

    const int n_ctx   = hparams.n_ctx;
    const int n_embd  = hparams.n_embd;
    const int n_layer = hparams.n_layer;
    const int n_head  = hparams.n_head;
    const int n_rot   = hparams.n_rot;

    struct ggml_tensor * tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
    memcpy(tokens->data, tokens_input->data, N*ggml_element_size(tokens));

    struct ggml_tensor * kc = kv_self.k;
    struct ggml_tensor * vc = kv_self.v;

    // inpL shape [n_embd,N,1,1]
    struct ggml_tensor * inpL = ggml_get_rows(ctx0, model->tok_embeddings, tokens);
    for (int il = 0; il < n_layer; ++il) {
        struct ggml_tensor * inpSA = inpL;

        struct ggml_tensor * cur;

        // norm
        {
            // cur shape [n_embd,N,1,1]
            cur = ggml_rms_norm(ctx0, inpL);

            // cur = attention_norm*cur
            cur = ggml_mul(ctx0,
                        ggml_repeat(ctx0, model->layers[il].attention_norm, cur),
                        cur);
        }

        // self-attention
        {
            // compute Q and K and RoPE them
            // wq   shape [n_embd, n_embd, 1, 1]
            // wk   shape [n_embd, n_embd, 1, 1]
            // Qcur shape [n_embd/n_head, n_head, N, 1]
            // Kcur shape [n_embd/n_head, n_head, N, 1]
            struct ggml_tensor * Qcur = ggml_rope(ctx0,
                                            ggml_reshape_3d(ctx0,
                                                ggml_mul_mat(ctx0,
                                                    model->layers[il].wqa,
                                                    ggml_mul_mat(ctx0,
                                                        model->layers[il].wqb,
                                                        cur)),
                                                n_embd/n_head, n_head, N),
                                            n_past, n_rot, 0, 0);
            struct ggml_tensor * Kcur = ggml_rope(ctx0,
                                            ggml_reshape_3d(ctx0,
                                                ggml_mul_mat(ctx0,
                                                    model->layers[il].wka,
                                                    ggml_mul_mat(ctx0,
                                                        model->layers[il].wkb,
                                                        cur)),
                                                n_embd/n_head, n_head, N),
                                            n_past, n_rot, 0, 0);

            // store key and value to memory
            {
                // compute the transposed [N, n_embd] V matrix
                // wv   shape [n_embd, n_embd, 1, 1]
                // Vcur shape [n_embd, N, 1, 1]
                struct ggml_tensor * Vcur = ggml_cont(ctx0,
                                                ggml_transpose(ctx0,
                                                    ggml_reshape_2d(ctx0,
                                                        ggml_mul_mat(ctx0,
                                                            model->layers[il].wva,
                                                            ggml_mul_mat(ctx0,
                                                                model->layers[il].wvb,
                                                                cur)),
                                                        n_embd, N)));

                // kv_self.k shape [n_embd * n_ctx * n_layer, 1]
                // kv_self.v shape [n_embd * n_ctx * n_layer, 1]
                // k         shape [n_embd * N, 1]   == kv_self.k[:,n_past:n_past+N,il,0]
                // v         shape [N, n_embd, 1, 1] == kv_self.v[:,n_past:n_past+N,il,0]

                /* {
                    struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past));
                    struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd,
                            (   n_ctx)*ggml_element_size(kv_self.v),
                            (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v));

                    // important: storing RoPE-ed version of K in the KV cache!
                    ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
                    ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
                } //*/

                kc = ggml_set_1d(ctx0, kc, ggml_reshape_1d(ctx0, Kcur, n_embd*N), (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past));
                vc = ggml_set_2d(ctx0, vc, Vcur, (   n_ctx)*ggml_element_size(kv_self.v),
                        (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v));
            }

            // Qcur shape [n_embd/n_head, n_head, N, 1]
            // Q    shape [n_embd/n_head, N, n_head, 1]
            struct ggml_tensor * Q =
                ggml_permute(ctx0,
                        Qcur,
                        0, 2, 1, 3);

            // kv_self.k shape [n_embd * n_ctx * n_layer, 1]
            // K shape [n_embd/n_head, n_past + N, n_head, 1]
            struct ggml_tensor * K =
                ggml_permute(ctx0,
                        ggml_reshape_3d(ctx0,
                            ggml_view_1d(ctx0, kc, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(kc)*n_embd),
                            n_embd/n_head, n_head, n_past + N),
                        0, 2, 1, 3);

            // K * Q
            // KQ shape [n_past + N, N, n_head, 1]
            struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);

            // KQ_scaled = KQ / sqrt(n_embd/n_head)
            // KQ_scaled shape [n_past + N, N, n_head, 1]
            struct ggml_tensor * KQ_scaled =
                ggml_scale(ctx0,
                        KQ,
                        ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head)));

            // KQ_masked = mask_past(KQ_scaled)
            // KQ_masked shape [n_past + N, N, n_head, 1]
            struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past);

            // KQ = soft_max(KQ_masked)
            // KQ_soft_max shape [n_past + N, N, n_head, 1]
            struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);

            // split cached V into n_head heads
            //// V shape [n_past + N, n_embd/n_head, n_head, 1]
            // V shape [n_past + N, n_embd/n_head, n_head, 1] == kv_self.v[:,:(n_past+N),il,1]
            struct ggml_tensor * V =
                ggml_view_3d(ctx0, vc,
                        n_past + N, n_embd/n_head, n_head,
                        n_ctx*ggml_element_size(vc),
                        n_ctx*ggml_element_size(vc)*n_embd/n_head,
                        il*n_ctx*ggml_element_size(vc)*n_embd);

            // KQV shape [n_embd/n_head, N, n_head, 1]
            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);

            // KQV_merged = KQV.permute(0, 2, 1, 3)
            // KQV_merged shape [n_embd/n_head, n_head, N, 1]
            struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
            // KQV_merged shape

            // cur = KQV_merged.contiguous().view(n_embd, N)
            // cur shape [n_embd,N,1,1]
            cur = ggml_reshape_2d(ctx0, ggml_cont(ctx0, KQV_merged), n_embd, N);
            // cur = ggml_cpy(ctx0,
            //         KQV_merged,
            //         ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));

            // projection (no bias)
            // cur shape [n_embd,N,1,1]
            cur = ggml_mul_mat(ctx0,
                    model->layers[il].woa,
                    ggml_mul_mat(ctx0,
                        model->layers[il].wob,
                        cur));
        }

        // inpFF shape [n_embd,N,1,1]
        struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA);

        // feed-forward network
        {
            // norm
            {
                // cur shape [n_embd,N,1,1]
                cur = ggml_rms_norm(ctx0, inpFF);

                // cur = ffn_norm*cur
                // cur shape [n_embd,N,1,1]
                cur = ggml_mul(ctx0,
                        ggml_repeat(ctx0, model->layers[il].ffn_norm, cur),
                        cur);
            }

            // tmp shape [n_ff,N,1,1]
            struct ggml_tensor * tmp = ggml_mul_mat(ctx0,
                    model->layers[il].w3,
                    cur);

            // cur shape [n_ff,N,1,1]
            cur = ggml_mul_mat(ctx0,
                    model->layers[il].w1,
                    cur);

            // SILU activation
            // cur shape [n_ff,N,1,1]
            cur = ggml_silu(ctx0, cur);

            // cur shape [n_ff,N,1,1]
            cur = ggml_mul(ctx0, cur, tmp);

            // cur shape [n_embd,N,1,1]
            cur = ggml_mul_mat(ctx0,
                    model->layers[il].w2,
                    cur);
        }

        // cur shape [n_embd,N,1,1]
        cur = ggml_add(ctx0, cur, inpFF);

        // input for next layer
        // inpL shape [n_embd,N,1,1]
        inpL = cur;
    }

    // norm
    {
        // inpL shape [n_embd,N,1,1]
        inpL = ggml_rms_norm(ctx0, inpL);

        // inpL = norm*inpL
        // inpL shape [n_embd,N,1,1]
        inpL = ggml_mul(ctx0,
                    ggml_repeat(ctx0, model->norm, inpL),
                    inpL);

        //embeddings = inpL;
    }

    // lm_head
    // inpL shape [n_vocab,N,1,1]
    inpL = ggml_mul_mat(ctx0,
                model->outputa,
                    ggml_mul_mat(ctx0,
                        model->outputb,
                        inpL));

    // ggml_set_scratch(ctx0, { 0, 0, nullptr, });

    // run the computation
    ggml_build_forward_expand(gf, inpL);

    return inpL;
}
void sample_softmax(struct ggml_tensor * logits, struct ggml_tensor * probs, struct ggml_tensor * best_samples) {
    assert(logits->n_dims == 2);
    assert(probs->n_dims == 2);
    assert(best_samples->n_dims == 1);
    assert(logits->ne[1] == best_samples->ne[0]);
    assert(logits->ne[0] == probs->ne[0]);
    assert(logits->ne[1] == probs->ne[1]);
    for (int i = 0; i < logits->ne[1]; ++i) {
        float max_logit = ggml_get_f32_1d(logits, i * logits->ne[0]);
        ggml_set_i32_1d(best_samples, i, 0);
        for (int k = 0; k < logits->ne[0]; ++k) {
            float logit = ggml_get_f32_1d(logits, i * logits->ne[0] + k);
            if (logit > max_logit) {
                max_logit = logit;
                ggml_set_i32_1d(best_samples, i, k);
            }
        }
        float psum = 0;
        for (int k = 0; k < logits->ne[0]; ++k) {
            float logit = ggml_get_f32_1d(logits, i * logits->ne[0] + k);
            float p = (logit == -INFINITY) ? 0 : expf(logit - max_logit);
            psum += p;
            ggml_set_f32_1d(probs, i * probs->ne[0] + k, p);
        }
        for (int k = 0; k < logits->ne[0]; ++k) {
            float p = ggml_get_f32_1d(probs, i*probs->ne[0] + k);
            ggml_set_f32_1d(probs, i * probs->ne[0] + k, p / psum);
        }
    }
}
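
// sample_softmax() above does greedy decoding: for every column i of `logits` it records
// the argmax token in `best_samples` and fills `probs` with a numerically stable softmax
// (the logits are shifted by max_logit before exponentiation so expf() cannot overflow).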
void sample_softmax_batch(struct ggml_context * ctx, struct ggml_tensor * logits, struct ggml_tensor * probs, struct ggml_tensor * best_samples) {
    GGML_ASSERT(best_samples->n_dims == 2);
    GGML_ASSERT(logits->n_dims == 3);
    GGML_ASSERT(probs->n_dims == 3);
    int n_tokens = best_samples->ne[0];
    int n_batch  = best_samples->ne[1];
    int n_vocab  = logits->ne[0];
    GGML_ASSERT(n_tokens == logits->ne[1]);
    GGML_ASSERT(n_batch  == logits->ne[2]);
    GGML_ASSERT(n_vocab  == probs->ne[0]);
    GGML_ASSERT(n_tokens == probs->ne[1]);
    GGML_ASSERT(n_batch  == probs->ne[2]);

    for (int k = 0; k < n_batch; ++k) {
        struct ggml_tensor * best_samples_k = ggml_view_1d(ctx,
                                                best_samples,
                                                best_samples->ne[0],
                                                k*best_samples->nb[1]);
        struct ggml_tensor * logits_k       = ggml_view_2d(ctx,
                                                logits,
                                                logits->ne[0],
                                                logits->ne[1],
                                                logits->nb[1],
                                                k*logits->nb[2]);
        struct ggml_tensor * probs_k        = ggml_view_2d(ctx,
                                                probs,
                                                probs->ne[0],
                                                probs->ne[1],
                                                probs->nb[1],
                                                k*probs->nb[2]);
        sample_softmax(logits_k, probs_k, best_samples_k);
    }
}

void print_row(struct ggml_tensor * probs, int i) {
    for (int k = 0; k < probs->ne[0]; ++k) {
        float p = ggml_get_f32_1d(probs, i*probs->ne[0] + k);
        printf(" %.2f", p);
    }
    printf("\n");
}

void print_matrix(struct ggml_tensor * probs) {
    assert(probs->n_dims == 2);
    for (int i = 0; i < probs->ne[1]; ++i) {
        for (int k = 0; k < probs->ne[0]; ++k) {
            float p = ggml_get_f32_1d(probs, i*probs->ne[0] + k);
            printf(" %.2f", p);
        }
        printf("\n");
    }
}

void print_token(int token, int n_vocab) {
    for (int k = 0; k < token; ++k) {
        printf(" ");
    }
    printf("X");
    for (int k = token+1; k < n_vocab; ++k) {
        printf(" ");
    }
    printf("\n");
}

void print_tokens(struct ggml_tensor * tokens, int n_vocab) {
    for (int i=0; i<tokens->ne[0]; ++i) {
        int token = ggml_get_i32_1d(tokens, i);
        print_token(token, n_vocab);
    }
}

void get_example_targets(int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * targets) {
    int n_tokens = tokens_input->ne[0];
    int n_vocab  = targets->ne[0];

    float randomness = 0.0f;
    // ggml_set_zero(targets);
    ggml_set_f32(targets, -1.0f);
    ggml_set_i32_1d(tokens_input, 0, 0);
    for (int i=1; i<n_tokens+1; ++i) {
        float x = example_id + i * 3.14159f * 2.0f * 1.0f * 0.5f / n_tokens;
        float y = sinf(x);//*cosf(x*1.1f+1.0f);
        float z = (y+1.0f)*0.5f; // scale to [0..1]
        z += (frand()-0.5f)*(randomness/n_vocab);
        z = (z < 0.0f) ? 0.0f : (z > 1.0f) ? 1.0f : z; // clamp to [0..1]
        int token = std::max(1,std::min(1+(int)(z*(float)(n_vocab-1)), n_vocab-1));
        ggml_set_f32_1d(targets, (i-1)*n_vocab + token, +1.0f);
        if (i<n_tokens) {
            ggml_set_i32_1d(tokens_input, i, token);
        }
    }
}
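
// The synthetic training data: each example is a slice of a sine wave, shifted by
// example_id and sampled at n_tokens points, then quantized into tokens in [1, n_vocab-1].
// tokens_input holds the quantized curve (with token 0 as a fixed start token) and
// targets holds the same curve one position ahead, encoded as a [n_vocab, n_tokens]
// matrix of -1s with a +1 at the target token of each position.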
void get_example_targets_batch(struct ggml_context * ctx, int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * targets) {
    GGML_ASSERT(tokens_input->n_dims == 2);
    GGML_ASSERT(     targets->n_dims == 3);
    int n_tokens = tokens_input->ne[0];
    int n_batch  = tokens_input->ne[1];
    GGML_ASSERT(n_tokens == targets->ne[1]);
    GGML_ASSERT(n_batch  == targets->ne[2]);

    for (int k=0; k<n_batch; ++k) {
        struct ggml_tensor * tokens_input_k = ggml_view_1d(ctx,
                                                tokens_input,
                                                tokens_input->ne[0],
                                                k*tokens_input->nb[1]);
        struct ggml_tensor * targets_k      = ggml_view_2d(ctx,
                                                targets,
                                                targets->ne[0],
                                                targets->ne[1],
                                                targets->nb[1],
                                                k*targets->nb[2]);
        get_example_targets(example_id*n_batch + k, tokens_input_k, targets_k);
    }
}
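
// shift the token sequence and the corresponding target rows left by n_shift positions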
void lshift_examples(struct ggml_tensor * tokens_input, struct ggml_tensor * targets, int n_shift) {
    int n_tokens = tokens_input->ne[0];
    int n_vocab  = targets->ne[0];
    for (int i=0; i<n_tokens-n_shift; ++i) {
        ggml_set_i32_1d(tokens_input, i, ggml_get_i32_1d(tokens_input, i + n_shift));
        for (int k=0; k<n_vocab; ++k) {
            ggml_set_f32_1d(targets, i*n_vocab + k, ggml_get_f32_1d(targets, (i + n_shift)*n_vocab + k));
        }
    }
}
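
// sum of squared element-wise differences between a and b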
struct ggml_tensor * square_error_loss(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b) {
    // todo: instead of a-b: a[1:]-b[:-1]
    return ggml_sum(ctx, ggml_sqr(ctx, ggml_sub(ctx, a, b)));
}
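
// cross entropy between softmax(a) and softmax(b): -sum(softmax(a) * log(softmax(b) + eps)),
// with eps keeping the argument of the log away from zero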
struct ggml_tensor * cross_entropy_loss(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b) {
    const float eps = 1e-3f;
    return
        ggml_sum(ctx,
            ggml_neg(ctx,
                ggml_sum_rows(ctx,
                    ggml_mul(ctx,
                        ggml_soft_max(ctx, a),
                        ggml_log(ctx,
                            ggml_add1(ctx,
                                ggml_soft_max(ctx, b),
                                ggml_new_f32(ctx, eps)))))));
}
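
// train a tiny, randomly initialized llama model on the synthetic sine-wave data,
// then sample from it autoregressively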
int main(int argc, char ** argv) {
    if (argc < 1) {
        fprintf(stderr, "usage: %s\n", argv[0]);
        return 1;
    }

    struct ggml_init_params lcparams;
    lcparams.mem_size   = 1024ll*1024ll*1024ll;
    lcparams.mem_buffer = NULL;
    lcparams.no_alloc   = false;

    struct llama_model model;
    model.hparams.n_vocab = 8;
    model.hparams.n_ctx   = 8;
    model.hparams.n_embd  = 32;
    model.hparams.n_mult  = 2;
    model.hparams.n_head  = 8;
    model.hparams.n_layer = 1;
    model.hparams.n_rot   = std::min(16u, model.hparams.n_embd / model.hparams.n_head);

    // model.hparams.n_embd  = 32;
    // model.hparams.n_mult  = 2;
    // model.hparams.n_head  = 4;
    // model.hparams.n_layer = 8;
    // model.hparams.n_rot   = 8;

    model.ctx = ggml_init(lcparams);
    printf("init model\n");
    init_model(&model);
    set_param_model(&model);

    randomize_model(&model, 1337, 0.0f, 1.0f, -1.0f, +1.0f);
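
    // commented-out setup for the LoRA variant of the model (llama_model_lora); currently disabled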
    /*
    struct llama_model_lora model_lora;
    // model.hparams.n_vocab = 6;
    // model.hparams.n_ctx   = 64;
    // model.hparams.n_embd  = 128;
    // model.hparams.n_mult  = 2;
    // model.hparams.n_head  = 8;
    // model.hparams.n_layer = 6;
    // model.hparams.n_rot   = model.hparams.n_embd / model.hparams.n_head;

    model_lora.hparams.n_vocab = 16;
    model_lora.hparams.n_ctx   = 32;
    model_lora.hparams.n_embd  = 256;
    model_lora.hparams.n_mult  = 2;
    model_lora.hparams.n_head  = 16;
    model_lora.hparams.n_layer = 1;
    model_lora.hparams.n_lora  = 64;
    model_lora.hparams.n_rot   = MIN(16, model_lora.hparams.n_embd / model_lora.hparams.n_head);
    // model.hparams.n_rot = (model.hparams.n_embd / model.hparams.n_head) / 2;

    // model.hparams.n_embd  = 32;
    // model.hparams.n_mult  = 2;
    // model.hparams.n_head  = 4;
    // model.hparams.n_layer = 8;
    // model.hparams.n_rot   = 8;

    model_lora.ctx = ggml_init(lcparams);
    printf("init model_lora\n");
    init_model_lora(&model_lora);
    set_param_model_lora(&model_lora);

    randomize_model_lora(&model_lora, 1337, 0.0f, 1.0f, -1.0f, +1.0f);
    */

    int n_batch = 8;

    // key + value cache for the self attention
    struct llama_kv_cache kv_self;
    printf("init_kv_cache\n");
    kv_self.ctx = model.ctx;
    init_kv_cache(&kv_self, &model, n_batch);
    //init_kv_cache_lora(&kv_self, &model_lora);
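
    // scratch buffer shared by the short-lived compute contexts created below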
    size_t    compute_size = 1024ll*1024ll*1024ll;
    uint8_t * compute_addr = new uint8_t[compute_size];

    int n_examples = 256;
    int n_tokens   = model.hparams.n_ctx;
    int n_vocab    = model.hparams.n_vocab;
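
    // training loop: one synthetic batch per example, optimized with L-BFGS on the squared-error loss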
    for (int ex=0; ex<n_examples; ++ex) {
        struct ggml_init_params params = {
            /*.mem_size   =*/ compute_size,
            /*.mem_buffer =*/ compute_addr,
            /*.no_alloc   =*/ false,
        };

        struct ggml_context * ctx0 = ggml_init(params);

        struct ggml_tensor * after_opt_best_samples = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, n_tokens, n_batch);
        struct ggml_tensor * after_opt_probs        = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_vocab, n_tokens, n_batch);
        struct ggml_tensor * tokens_input           = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, n_tokens, n_batch);
        struct ggml_tensor * targets                = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_vocab, n_tokens, n_batch);

        int n_past = 0;

        ggml_cgraph gf = {};
        gf.n_threads = 1;

        get_example_targets_batch(ctx0, 64*ex+0, tokens_input, targets);

        struct ggml_tensor * logits = forward_batch(&model, &kv_self, ctx0, &gf, tokens_input, n_tokens, n_past, n_batch);
        // struct ggml_tensor * e = cross_entropy_loss(ctx0, targets, logits);
        struct ggml_tensor * e = square_error_loss(ctx0, targets, logits);

        ggml_build_forward_expand(&gf, e);
        ggml_graph_compute(ctx0, &gf);

        float error_before_opt = ggml_get_f32_1d(e, 0);

        struct ggml_opt_params opt_params_adam  = ggml_opt_default_params(GGML_OPT_ADAM);
        struct ggml_opt_params opt_params_lbfgs = ggml_opt_default_params(GGML_OPT_LBFGS);
        opt_params_adam.print_forward_graph   = false;
        opt_params_adam.print_backward_graph  = false;
        opt_params_lbfgs.print_forward_graph  = false;
        opt_params_lbfgs.print_backward_graph = false;
        opt_params_adam.adam.n_iter   = 16;
        opt_params_lbfgs.lbfgs.n_iter = 16;
        // ggml_opt(ctx0, opt_params_adam, e);
        ggml_opt(ctx0, opt_params_lbfgs, e);
        //
        ggml_build_forward_expand(&gf, e);
        ggml_graph_compute(ctx0, &gf);

        float error_after_opt = ggml_get_f32_1d(e, 0);

        if (ex % 8 == 0) {
            printf("Example %d\n", (ex+1));
            printf("error_before_opt: %.2f\n", error_before_opt);
            printf("error_after_opt:  %.2f\n", error_after_opt);
        }

        if (ex % 64 == 0) {
            sample_softmax_batch(ctx0, logits, after_opt_probs, after_opt_best_samples);
            // printf("probabilities after optimization:\n");
            // print_matrix(after_opt_probs);
            printf("best samples after optimization:\n");
            print_tokens(after_opt_best_samples, n_vocab);
        }

        ggml_free(ctx0);
    }
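
    // autoregressive generation: run the model on the current window, take the argmax token at the
    // last position, shift the window left by one and append the sampled token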
    {
        int n_gen = 128;
        int sample_ctx = n_tokens-n_tokens/8;

        printf("Generating %d tokens.\n", n_gen);

        struct ggml_tensor * tokens_input = ggml_new_tensor_1d(model.ctx, GGML_TYPE_I32, n_tokens);
        struct ggml_tensor * targets      = ggml_new_tensor_2d(model.ctx, GGML_TYPE_F32, n_vocab, n_tokens);

        get_example_targets(137, tokens_input, targets);
        for (int i=sample_ctx; i<n_tokens; ++i) {
            ggml_set_i32_1d(tokens_input, i, n_vocab/2);
        }

        for (int i=0; i<sample_ctx-1; ++i) {
            print_token(ggml_get_i32_1d(tokens_input, i), n_vocab);
        }
        printf("---\n");

        for (int i=0; i<n_gen; ++i) {
            struct ggml_init_params params = {
                /*.mem_size   =*/ compute_size,
                /*.mem_buffer =*/ compute_addr,
                /*.no_alloc   =*/ false,
            };
            struct ggml_context * ctx0 = ggml_init(params);

            ggml_cgraph gf = {};
            gf.n_threads = 1;

            int n_past = 0;
            struct ggml_tensor * logits = forward(&model, &kv_self, ctx0, &gf, tokens_input, sample_ctx, n_past);

            ggml_build_forward_expand(&gf, logits);
            ggml_graph_compute(ctx0, &gf);

            struct ggml_tensor * best_samples = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, sample_ctx);
            struct ggml_tensor * probs        = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_vocab, sample_ctx);

            sample_softmax(logits, probs, best_samples);

            // int sample_at = n_tokens-1;
            int token = ggml_get_i32_1d(best_samples, sample_ctx-1);

            // print_row(probs, sample_at);
            print_token(token, n_vocab);

            lshift_examples(tokens_input, targets, 1);
            ggml_set_i32_1d(tokens_input, 0, 0);
            ggml_set_i32_1d(tokens_input, sample_ctx-1, token);

            ggml_free(ctx0);
        }
    }
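
    // dump the learned token embedding matrix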
    print_matrix(model.tok_embeddings);

    printf("done\n");

    delete[] compute_addr; // release the scratch buffer; the per-example contexts were already freed
    // ggml_free(kv_self.ctx);
    // ggml_free(model_lora.ctx);
    ggml_free(model.ctx);

    return 0;
}