// baby-llama.cpp

#include "ggml.h"
#include <vector>
#include <cassert>
#include <random>
#include <cstring>
#include <cstdio>    // printf, fprintf
#include <cstdlib>   // rand, RAND_MAX
#include <cmath>     // sqrtf, sinf, expf
#include <algorithm> // std::min, std::max

#if defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data
#endif

float frand() {
    return (float)rand()/(float)RAND_MAX;
}

struct random_normal_distribution {
    std::mt19937 gen;
    std::normal_distribution<float> nd;
    float min;
    float max;
};

void init_random_normal_distribution(struct random_normal_distribution * rnd, int seed, float mean, float std, float min, float max) {
    rnd->gen = std::mt19937(seed);
    rnd->nd = std::normal_distribution<float>{mean, std};
    rnd->min = min;
    rnd->max = max;
}

float frand_normal(struct random_normal_distribution * rnd) {
    const float r = rnd->nd(rnd->gen);
    return ((r < rnd->min) ? (rnd->min) : (r > rnd->max) ? (rnd->max) : r);
}

void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
    struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);

    if (plan.work_size > 0) {
        buf.resize(plan.work_size);
        plan.work_data = buf.data();
    }

    ggml_graph_compute(graph, &plan);
}
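// Editorial note: the helper above asks ggml for a compute plan, grows the caller-owned
// scratch buffer to the plan's reported work_size, and then runs the graph with that
// buffer attached. Reusing one std::vector across calls avoids reallocating scratch
// memory for every graph evaluation.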
struct ggml_tensor * randomize_tensor(
        struct ggml_tensor * tensor,
        int ndims,
        const int64_t ne[],
        float fmin,
        float fmax) {
    switch (ndims) {
        case 1:
            for (int i0 = 0; i0 < ne[0]; i0++) {
                ((float *)tensor->data)[i0] = frand()*(fmax - fmin) + fmin;
            }
            break;
        case 2:
            for (int i1 = 0; i1 < ne[1]; i1++) {
                for (int i0 = 0; i0 < ne[0]; i0++) {
                    ((float *)tensor->data)[i1*ne[0] + i0] = frand()*(fmax - fmin) + fmin;
                }
            }
            break;
        case 3:
            for (int i2 = 0; i2 < ne[2]; i2++) {
                for (int i1 = 0; i1 < ne[1]; i1++) {
                    for (int i0 = 0; i0 < ne[0]; i0++) {
                        ((float *)tensor->data)[i2*ne[1]*ne[0] + i1*ne[0] + i0] = frand()*(fmax - fmin) + fmin;
                    }
                }
            }
            break;
        case 4:
            for (int i3 = 0; i3 < ne[3]; i3++) {
                for (int i2 = 0; i2 < ne[2]; i2++) {
                    for (int i1 = 0; i1 < ne[1]; i1++) {
                        for (int i0 = 0; i0 < ne[0]; i0++) {
                            ((float *)tensor->data)[i3*ne[2]*ne[1]*ne[0] + i2*ne[1]*ne[0] + i1*ne[0] + i0] = frand()*(fmax - fmin) + fmin;
                        }
                    }
                }
            }
            break;
        default:
            assert(false);
    };

    return tensor;
}
struct ggml_tensor * randomize_tensor_normal(
        struct ggml_tensor * tensor,
        int ndims,
        const int64_t ne[],
        struct random_normal_distribution * rnd) {
    float scale = 1.0; // xavier
    switch (ndims) {
        case 1:
            scale /= sqrtf(ne[0]);
            for (int i0 = 0; i0 < ne[0]; i0++) {
                ((float *)tensor->data)[i0] = scale * frand_normal(rnd);
            }
            break;
        case 2:
            scale /= sqrtf(ne[0]+ne[1]);
            for (int i1 = 0; i1 < ne[1]; i1++) {
                for (int i0 = 0; i0 < ne[0]; i0++) {
                    ((float *)tensor->data)[i1*ne[0] + i0] = scale * frand_normal(rnd);
                }
            }
            break;
        case 3:
            scale /= sqrtf(ne[0]+ne[1]);
            for (int i2 = 0; i2 < ne[2]; i2++) {
                for (int i1 = 0; i1 < ne[1]; i1++) {
                    for (int i0 = 0; i0 < ne[0]; i0++) {
                        ((float *)tensor->data)[i2*ne[1]*ne[0] + i1*ne[0] + i0] = scale * frand_normal(rnd);
                    }
                }
            }
            break;
        case 4:
            scale /= sqrtf(ne[0]+ne[1]);
            for (int i3 = 0; i3 < ne[3]; i3++) {
                for (int i2 = 0; i2 < ne[2]; i2++) {
                    for (int i1 = 0; i1 < ne[1]; i1++) {
                        for (int i0 = 0; i0 < ne[0]; i0++) {
                            ((float *)tensor->data)[i3*ne[2]*ne[1]*ne[0] + i2*ne[1]*ne[0] + i1*ne[0] + i0] = scale * frand_normal(rnd);
                        }
                    }
                }
            }
            break;
        default:
            assert(false);
    };

    return tensor;
}
struct llama_hparams {
    uint32_t n_vocab = 32000;
    uint32_t n_ctx   = 512;   // this is provided as user input?
    uint32_t n_embd  = 4096;
    uint32_t n_mult  = 4;
    uint32_t n_head  = 32;
    uint32_t n_layer = 32;
    uint32_t n_rot   = 64;

    bool operator!=(const llama_hparams & other) const {
        return memcmp(this, &other, sizeof(llama_hparams)) != 0;
    }
};

uint32_t get_n_ff(const struct llama_hparams* hparams) {
    const uint32_t n_ff = ((2*(4*hparams->n_embd)/3 + hparams->n_mult - 1)/hparams->n_mult)*hparams->n_mult;
    return n_ff;
}
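// Editorial note: get_n_ff() takes 2/3 of 4*n_embd and rounds it up to a multiple of n_mult
// (the LLaMA feed-forward sizing rule). Worked examples with integer division:
//   defaults  n_embd = 4096, n_mult = 4: 2*(4*4096)/3 = 10922 -> ((10922 + 3)/4)*4 = 10924
//   toy model n_embd = 32,   n_mult = 2: 2*(4*32)/3   = 85    -> ((85 + 1)/2)*2    = 86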
struct llama_hparams_lora {
    uint32_t n_vocab = 32000;
    uint32_t n_ctx   = 512;   // this is provided as user input?
    uint32_t n_embd  = 4096;
    uint32_t n_mult  = 4;
    uint32_t n_head  = 32;
    uint32_t n_layer = 32;
    uint32_t n_rot   = 64;
    uint32_t n_lora  = 64;

    bool operator!=(const llama_hparams_lora & other) const {
        return memcmp(this, &other, sizeof(llama_hparams_lora)) != 0;
    }
};
struct llama_layer {
    // normalization
    struct ggml_tensor * attention_norm;

    // attention
    struct ggml_tensor * wq;
    struct ggml_tensor * wk;
    struct ggml_tensor * wv;
    struct ggml_tensor * wo;

    // normalization
    struct ggml_tensor * ffn_norm;

    // ff
    struct ggml_tensor * w1;
    struct ggml_tensor * w2;
    struct ggml_tensor * w3;
};

struct llama_layer_lora {
    // normalization
    struct ggml_tensor * attention_norm;

    // attention
    struct ggml_tensor * wqa;
    struct ggml_tensor * wqb;
    struct ggml_tensor * wka;
    struct ggml_tensor * wkb;
    struct ggml_tensor * wva;
    struct ggml_tensor * wvb;
    struct ggml_tensor * woa;
    struct ggml_tensor * wob;

    // normalization
    struct ggml_tensor * ffn_norm;

    // ff
    struct ggml_tensor * w1;
    struct ggml_tensor * w2;
    struct ggml_tensor * w3;
};

struct llama_kv_cache {
    struct ggml_context * ctx = NULL;

    struct ggml_tensor * k;
    struct ggml_tensor * v;

    // llama_ctx_buffer buf;

    int n; // number of tokens currently in the cache
};

struct llama_model {
    struct ggml_context * ctx = NULL;

    llama_hparams hparams;

    struct ggml_tensor * tok_embeddings;

    struct ggml_tensor * norm;
    struct ggml_tensor * output;

    std::vector<llama_layer> layers;
};

struct llama_model_lora {
    struct ggml_context * ctx = NULL;

    llama_hparams_lora hparams;

    struct ggml_tensor * tok_embeddings;

    struct ggml_tensor * norm;
    struct ggml_tensor * outputa;
    struct ggml_tensor * outputb;

    std::vector<llama_layer_lora> layers;
};
void init_model(struct llama_model * model) {
    const auto & hparams = model->hparams;

    const uint32_t n_embd  = hparams.n_embd;
    const uint32_t n_layer = hparams.n_layer;
    const uint32_t n_vocab = hparams.n_vocab;

    const uint32_t n_ff = get_n_ff(&hparams);

    struct ggml_context * ctx = model->ctx;

    model->tok_embeddings = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab); // ("tok_embeddings.weight", {n_embd, n_vocab});
    model->norm           = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);          // ("norm.weight",           {n_embd});
    model->output         = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab); // ("output.weight",         {n_embd, n_vocab});

    model->layers.resize(n_layer);
    for (uint32_t i = 0; i < n_layer; ++i) {
        auto & layer = model->layers[i];

        // std::string layers_i = "layers." + std::to_string(i);

        layer.attention_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);         // (layers_i + ".attention_norm.weight", {n_embd});

        layer.wq = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd);             // (layers_i + ".attention.wq.weight", {n_embd, n_embd});
        layer.wk = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd);             // (layers_i + ".attention.wk.weight", {n_embd, n_embd});
        layer.wv = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd);             // (layers_i + ".attention.wv.weight", {n_embd, n_embd});
        layer.wo = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd);             // (layers_i + ".attention.wo.weight", {n_embd, n_embd});

        layer.ffn_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);               // (layers_i + ".ffn_norm.weight", {n_embd});

        layer.w1 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ff);               // (layers_i + ".feed_forward.w1.weight", {n_embd, n_ff});
        layer.w2 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_ff,   n_embd);             // (layers_i + ".feed_forward.w2.weight", {  n_ff, n_embd});
        layer.w3 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ff);               // (layers_i + ".feed_forward.w3.weight", {n_embd, n_ff});
    }
}

void init_model_lora(struct llama_model_lora * model) {
    const auto & hparams = model->hparams;

    const uint32_t n_embd  = hparams.n_embd;
    const uint32_t n_mult  = hparams.n_mult;
    const uint32_t n_layer = hparams.n_layer;
    const uint32_t n_vocab = hparams.n_vocab;
    const uint32_t n_lora  = hparams.n_lora;

    const uint32_t n_ff = ((2*(4*n_embd)/3 + n_mult - 1)/n_mult)*n_mult;

    struct ggml_context * ctx = model->ctx;

    model->tok_embeddings = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab); // ("tok_embeddings.weight", {n_embd, n_vocab});
    model->norm           = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);          // ("norm.weight",           {n_embd});
    model->outputa        = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_lora, n_vocab); // ("output.weight",         {n_embd, n_vocab});
    model->outputb        = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_lora);  // ("output.weight",         {n_embd, n_vocab});

    model->layers.resize(n_layer);
    for (uint32_t i = 0; i < n_layer; ++i) {
        auto & layer = model->layers[i];

        // std::string layers_i = "layers." + std::to_string(i);

        layer.attention_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);         // (layers_i + ".attention_norm.weight", {n_embd});

        layer.wqa = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_lora, n_embd);            // (layers_i + ".attention.wq.weight", {n_embd, n_embd});
        layer.wqb = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_lora);            // (layers_i + ".attention.wq.weight", {n_embd, n_embd});
        layer.wka = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_lora, n_embd);            // (layers_i + ".attention.wk.weight", {n_embd, n_embd});
        layer.wkb = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_lora);            // (layers_i + ".attention.wk.weight", {n_embd, n_embd});
        layer.wva = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_lora, n_embd);            // (layers_i + ".attention.wv.weight", {n_embd, n_embd});
        layer.wvb = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_lora);            // (layers_i + ".attention.wv.weight", {n_embd, n_embd});
        layer.woa = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_lora, n_embd);            // (layers_i + ".attention.wo.weight", {n_embd, n_embd});
        layer.wob = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_lora);            // (layers_i + ".attention.wo.weight", {n_embd, n_embd});

        layer.ffn_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);               // (layers_i + ".ffn_norm.weight", {n_embd});

        layer.w1 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ff);               // (layers_i + ".feed_forward.w1.weight", {n_embd, n_ff});
        layer.w2 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_ff,   n_embd);             // (layers_i + ".feed_forward.w2.weight", {  n_ff, n_embd});
        layer.w3 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ff);               // (layers_i + ".feed_forward.w3.weight", {n_embd, n_ff});
    }
}
void set_param_model(struct llama_model * model) {
    const auto& hparams = model->hparams;

    const uint32_t n_layer = hparams.n_layer;

    struct ggml_context* ctx = model->ctx;

    ggml_set_param(ctx, model->tok_embeddings);
    ggml_set_param(ctx, model->norm);
    ggml_set_param(ctx, model->output);

    for (uint32_t i = 0; i < n_layer; ++i) {
        auto & layer = model->layers[i];

        ggml_set_param(ctx, layer.attention_norm);
        ggml_set_param(ctx, layer.wq);
        ggml_set_param(ctx, layer.wk);
        ggml_set_param(ctx, layer.wv);
        ggml_set_param(ctx, layer.wo);
        ggml_set_param(ctx, layer.ffn_norm);
        ggml_set_param(ctx, layer.w1);
        ggml_set_param(ctx, layer.w2);
        ggml_set_param(ctx, layer.w3);
    }
}

void set_param_model_lora(struct llama_model_lora * model) {
    const auto& hparams = model->hparams;

    const uint32_t n_layer = hparams.n_layer;

    struct ggml_context* ctx = model->ctx;

    ggml_set_param(ctx, model->tok_embeddings);
    ggml_set_param(ctx, model->norm);
    ggml_set_param(ctx, model->outputa);
    ggml_set_param(ctx, model->outputb);

    for (uint32_t i = 0; i < n_layer; ++i) {
        auto & layer = model->layers[i];

        ggml_set_param(ctx, layer.attention_norm);
        ggml_set_param(ctx, layer.wqa);
        ggml_set_param(ctx, layer.wqb);
        ggml_set_param(ctx, layer.wka);
        ggml_set_param(ctx, layer.wkb);
        ggml_set_param(ctx, layer.wva);
        ggml_set_param(ctx, layer.wvb);
        ggml_set_param(ctx, layer.woa);
        ggml_set_param(ctx, layer.wob);
        ggml_set_param(ctx, layer.ffn_norm);
        ggml_set_param(ctx, layer.w1);
        ggml_set_param(ctx, layer.w2);
        ggml_set_param(ctx, layer.w3);
    }
}
void randomize_model(struct llama_model * model, int seed, float mean, float std, float min, float max) {
    const auto & hparams = model->hparams;

    const uint32_t n_layer = hparams.n_layer;

    struct random_normal_distribution rnd;
    init_random_normal_distribution(&rnd, seed, mean, std, min, max);

    randomize_tensor_normal(model->tok_embeddings, model->tok_embeddings->n_dims, model->tok_embeddings->ne, &rnd);
    randomize_tensor_normal(model->norm,           model->norm->n_dims,           model->norm->ne,           &rnd);
    randomize_tensor_normal(model->output,         model->output->n_dims,         model->output->ne,         &rnd);

    for (uint32_t i = 0; i < n_layer; ++i) {
        auto & layer = model->layers[i];

        randomize_tensor_normal(layer.attention_norm, layer.attention_norm->n_dims, layer.attention_norm->ne, &rnd);

        randomize_tensor_normal(layer.wq, layer.wq->n_dims, layer.wq->ne, &rnd);
        randomize_tensor_normal(layer.wk, layer.wk->n_dims, layer.wk->ne, &rnd);
        randomize_tensor_normal(layer.wv, layer.wv->n_dims, layer.wv->ne, &rnd);
        randomize_tensor_normal(layer.wo, layer.wo->n_dims, layer.wo->ne, &rnd);

        randomize_tensor_normal(layer.ffn_norm, layer.ffn_norm->n_dims, layer.ffn_norm->ne, &rnd);

        randomize_tensor_normal(layer.w1, layer.w1->n_dims, layer.w1->ne, &rnd);
        randomize_tensor_normal(layer.w2, layer.w2->n_dims, layer.w2->ne, &rnd);
        randomize_tensor_normal(layer.w3, layer.w3->n_dims, layer.w3->ne, &rnd);
    }
}

void randomize_model_lora(struct llama_model_lora * model, int seed, float mean, float std, float min, float max) {
    const auto & hparams = model->hparams;

    const uint32_t n_layer = hparams.n_layer;

    struct random_normal_distribution rnd;
    init_random_normal_distribution(&rnd, seed, mean, std, min, max);

    randomize_tensor_normal(model->tok_embeddings, model->tok_embeddings->n_dims, model->tok_embeddings->ne, &rnd);
    randomize_tensor_normal(model->norm,           model->norm->n_dims,           model->norm->ne,           &rnd);
    randomize_tensor_normal(model->outputa,        model->outputa->n_dims,        model->outputa->ne,        &rnd);
    randomize_tensor_normal(model->outputb,        model->outputb->n_dims,        model->outputb->ne,        &rnd);

    for (uint32_t i = 0; i < n_layer; ++i) {
        auto & layer = model->layers[i];

        randomize_tensor_normal(layer.attention_norm, layer.attention_norm->n_dims, layer.attention_norm->ne, &rnd);

        randomize_tensor_normal(layer.wqa, layer.wqa->n_dims, layer.wqa->ne, &rnd);
        randomize_tensor_normal(layer.wqb, layer.wqb->n_dims, layer.wqb->ne, &rnd);
        randomize_tensor_normal(layer.wka, layer.wka->n_dims, layer.wka->ne, &rnd);
        randomize_tensor_normal(layer.wkb, layer.wkb->n_dims, layer.wkb->ne, &rnd);
        randomize_tensor_normal(layer.wva, layer.wva->n_dims, layer.wva->ne, &rnd);
        randomize_tensor_normal(layer.wvb, layer.wvb->n_dims, layer.wvb->ne, &rnd);
        randomize_tensor_normal(layer.woa, layer.woa->n_dims, layer.woa->ne, &rnd);
        randomize_tensor_normal(layer.wob, layer.wob->n_dims, layer.wob->ne, &rnd);

        randomize_tensor_normal(layer.ffn_norm, layer.ffn_norm->n_dims, layer.ffn_norm->ne, &rnd);

        randomize_tensor_normal(layer.w1, layer.w1->n_dims, layer.w1->ne, &rnd);
        randomize_tensor_normal(layer.w2, layer.w2->n_dims, layer.w2->ne, &rnd);
        randomize_tensor_normal(layer.w3, layer.w3->n_dims, layer.w3->ne, &rnd);
    }
}
bool init_kv_cache(struct llama_kv_cache* cache, struct llama_model * model, int n_batch) {
    const auto & hparams = model->hparams;

    const uint32_t n_ctx   = hparams.n_ctx;
    const uint32_t n_embd  = hparams.n_embd;
    const uint32_t n_layer = hparams.n_layer;

    const int64_t n_mem      = n_layer*n_ctx*n_batch;
    const int64_t n_elements = n_embd*n_mem;

    // cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB);

    // struct ggml_init_params params;
    // params.mem_size   = cache.buf.size;
    // params.mem_buffer = cache.buf.addr;
    // params.no_alloc   = false;
    if (!cache->ctx) {
        struct ggml_init_params params;
        params.mem_size   = 2u*n_elements*ggml_type_size(GGML_TYPE_F32) + 2u*1024*1024;
        params.mem_buffer = NULL;
        params.no_alloc   = false;

        cache->ctx = ggml_init(params);

        if (!cache->ctx) {
            fprintf(stderr, "%s: failed to allocate memory for kv cache\n", __func__);
            return false;
        }
    }

    cache->k = ggml_new_tensor_1d(cache->ctx, GGML_TYPE_F32, n_elements);
    cache->v = ggml_new_tensor_1d(cache->ctx, GGML_TYPE_F32, n_elements);

    return true;
}

bool init_kv_cache_lora(struct llama_kv_cache* cache, struct llama_model_lora * model, int n_batch) {
    const auto & hparams = model->hparams;

    const uint32_t n_ctx   = hparams.n_ctx;
    const uint32_t n_embd  = hparams.n_embd;
    const uint32_t n_layer = hparams.n_layer;

    const int64_t n_mem      = n_layer*n_ctx*n_batch;
    const int64_t n_elements = n_embd*n_mem;

    // cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB);

    // struct ggml_init_params params;
    // params.mem_size   = cache.buf.size;
    // params.mem_buffer = cache.buf.addr;
    // params.no_alloc   = false;
    if (!cache->ctx) {
        struct ggml_init_params params;
        params.mem_size   = 2u*n_elements*ggml_type_size(GGML_TYPE_F32) + 2u*1024*1024;
        params.mem_buffer = NULL;
        params.no_alloc   = false;

        cache->ctx = ggml_init(params);

        if (!cache->ctx) {
            fprintf(stderr, "%s: failed to allocate memory for kv cache\n", __func__);
            return false;
        }
    }

    cache->k = ggml_new_tensor_1d(cache->ctx, GGML_TYPE_F32, n_elements);
    cache->v = ggml_new_tensor_1d(cache->ctx, GGML_TYPE_F32, n_elements);

    return true;
}
struct ggml_tensor * forward(
        struct llama_model    * model,
        struct llama_kv_cache * cache,
        struct ggml_context   * ctx0,
        struct ggml_cgraph    * gf,
        struct ggml_tensor    * tokens_input,
        const  int              n_tokens,
        const  int              n_past) {
    const int N = n_tokens;

    struct llama_kv_cache& kv_self = *cache;
    const auto & hparams = model->hparams;
    const int n_ctx   = hparams.n_ctx;
    const int n_embd  = hparams.n_embd;
    const int n_layer = hparams.n_layer;
    const int n_head  = hparams.n_head;
    const int n_rot   = hparams.n_rot;

    struct ggml_tensor * tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
    memcpy(tokens->data, tokens_input->data, N*ggml_element_size(tokens));

    struct ggml_tensor * kc = kv_self.k;
    struct ggml_tensor * vc = kv_self.v;

    // inpL shape [n_embd,N,1,1]
    struct ggml_tensor * inpL = ggml_get_rows(ctx0, model->tok_embeddings, tokens);
    for (int il = 0; il < n_layer; ++il) {
        struct ggml_tensor * inpSA = inpL;

        struct ggml_tensor * cur;

        // lctx.use_buf(ctx0, 0);

        // norm
        {
            // cur shape [n_embd,N,1,1]
            cur = ggml_rms_norm(ctx0, inpL);

            // cur = attention_norm*cur
            cur = ggml_mul(ctx0,
                    ggml_repeat(ctx0, model->layers[il].attention_norm, cur),
                    cur);
        }

        // self-attention
        {
            // compute Q and K and RoPE them
            // wq   shape [n_embd, n_embd, 1, 1]
            // wk   shape [n_embd, n_embd, 1, 1]
            // Qcur shape [n_embd/n_head, n_head, N, 1]
            // Kcur shape [n_embd/n_head, n_head, N, 1]
            struct ggml_tensor * Qcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0, 0);
            struct ggml_tensor * Kcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0, 0);

            // store key and value to memory
            {
                // compute the transposed [N, n_embd] V matrix
                // wv   shape [n_embd, n_embd, 1, 1]
                // Vcur shape [n_embd, N, 1, 1]
                struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wv, cur), n_embd, N)));

                // kv_self.k shape [n_embd * n_ctx * n_layer, 1]
                // kv_self.v shape [n_embd * n_ctx * n_layer, 1]
                // k         shape [n_embd * N, 1]   == kv_self.k[:,n_past:n_past+N,il,0]
                // v         shape [N, n_embd, 1, 1] == kv_self.v[:,n_past:n_past+N,il,0]

                /* {
                    struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past));
                    struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd,
                            (   n_ctx)*ggml_element_size(kv_self.v),
                            (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v));

                    // important: storing RoPE-ed version of K in the KV cache!
                    ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
                    ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
                } //*/

                kc = ggml_set_1d(ctx0, kc, ggml_reshape_1d(ctx0, Kcur, n_embd*N), (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past));
                vc = ggml_set_2d(ctx0, vc, Vcur, (   n_ctx)*ggml_element_size(kv_self.v),
                        (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v));
            }

            // Qcur shape [n_embd/n_head, n_head, N, 1]
            // Q shape    [n_embd/n_head, N, n_head, 1]
            struct ggml_tensor * Q =
                ggml_permute(ctx0,
                        Qcur,
                        0, 2, 1, 3);

            // kv_self.k shape [n_embd * n_ctx * n_layer, 1]
            // K shape [n_embd/n_head, n_past + N, n_head, 1]
            struct ggml_tensor * K =
                ggml_permute(ctx0,
                        ggml_reshape_3d(ctx0,
                            ggml_view_1d(ctx0, kc, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(kc)*n_embd),
                            n_embd/n_head, n_head, n_past + N),
                        0, 2, 1, 3);

            // K * Q
            // KQ shape [n_past + N, N, n_head, 1]
            struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);

            // KQ_scaled = KQ / sqrt(n_embd/n_head)
            // KQ_scaled shape [n_past + N, N, n_head, 1]
            struct ggml_tensor * KQ_scaled =
                ggml_scale(ctx0,
                        KQ,
                        ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head)));

            // KQ_masked = mask_past(KQ_scaled)
            // KQ_masked shape [n_past + N, N, n_head, 1]
            struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past);

            // KQ = soft_max(KQ_masked)
            // KQ_soft_max shape [n_past + N, N, n_head, 1]
            struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);

            // split cached V into n_head heads
            //// V shape [n_past + N, n_embd/n_head, n_head, 1]
            // V shape [n_past + N, n_embd/n_head, n_head, 1] == kv_self.v[:,:(n_past+N),il,1]
            struct ggml_tensor * V =
                ggml_view_3d(ctx0, vc,
                        n_past + N, n_embd/n_head, n_head,
                        n_ctx*ggml_element_size(vc),
                        n_ctx*ggml_element_size(vc)*n_embd/n_head,
                        il*n_ctx*ggml_element_size(vc)*n_embd);

            // KQV shape [n_embd/n_head, N, n_head, 1]
            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);

            // KQV_merged = KQV.permute(0, 2, 1, 3)
            // KQV_merged shape [n_embd/n_head, n_head, N, 1]
            struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
            // KQV_merged shape

            // cur = KQV_merged.contiguous().view(n_embd, N)
            // cur shape [n_embd,N,1,1]
            cur = ggml_reshape_2d(ctx0, ggml_cont(ctx0, KQV_merged), n_embd, N);
            // cur = ggml_cpy(ctx0,
            //         KQV_merged,
            //         ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));

            // projection (no bias)
            // cur shape [n_embd,N,1,1]
            cur = ggml_mul_mat(ctx0,
                    model->layers[il].wo,
                    cur);
        }

        // lctx.use_buf(ctx0, 1);

        // inpFF shape [n_embd,N,1,1]
        struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA);

        // feed-forward network
        {
            // norm
            {
                // cur shape [n_embd,N,1,1]
                cur = ggml_rms_norm(ctx0, inpFF);

                // cur = ffn_norm*cur
                // cur shape [n_embd,N,1,1]
                cur = ggml_mul(ctx0,
                        ggml_repeat(ctx0, model->layers[il].ffn_norm, cur),
                        cur);
            }

            // tmp shape [n_ff,N,1,1]
            struct ggml_tensor * tmp = ggml_mul_mat(ctx0,
                    model->layers[il].w3,
                    cur);

            // cur shape [n_ff,N,1,1]
            cur = ggml_mul_mat(ctx0,
                    model->layers[il].w1,
                    cur);

            // SILU activation
            // cur shape [n_ff,N,1,1]
            cur = ggml_silu(ctx0, cur);

            // cur shape [n_ff,N,1,1]
            cur = ggml_mul(ctx0, cur, tmp);

            // cur shape [n_embd,N,1,1]
            cur = ggml_mul_mat(ctx0,
                    model->layers[il].w2,
                    cur);
        }

        // cur shape [n_embd,N,1,1]
        cur = ggml_add(ctx0, cur, inpFF);

        // input for next layer
        // inpL shape [n_embd,N,1,1]
        inpL = cur;
    }

    // norm
    {
        // inpL shape [n_embd,N,1,1]
        inpL = ggml_rms_norm(ctx0, inpL);

        // inpL = norm*inpL
        // inpL shape [n_embd,N,1,1]
        inpL = ggml_mul(ctx0,
                ggml_repeat(ctx0, model->norm, inpL),
                inpL);

        //embeddings = inpL;
    }

    // lm_head
    // inpL shape [n_vocab,N,1,1]
    inpL = ggml_mul_mat(ctx0, model->output, inpL);

    // run the computation
    ggml_build_forward_expand(gf, inpL);

    return inpL;
}
void assert_shape_1d(struct ggml_tensor * tensor, int64_t ne0) {
    GGML_ASSERT(tensor->n_dims == 1);
    GGML_ASSERT(tensor->ne[0] == ne0);
}

void assert_shape_2d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1) {
    GGML_ASSERT(tensor->n_dims == 2);
    GGML_ASSERT(tensor->ne[0] == ne0);
    GGML_ASSERT(tensor->ne[1] == ne1);
}

void assert_shape_3d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2) {
    GGML_ASSERT(tensor->n_dims == 3);
    GGML_ASSERT(tensor->ne[0] == ne0);
    GGML_ASSERT(tensor->ne[1] == ne1);
    GGML_ASSERT(tensor->ne[2] == ne2);
}

void assert_shape_4d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3) {
    GGML_ASSERT(tensor->n_dims == 4);
    GGML_ASSERT(tensor->ne[0] == ne0);
    GGML_ASSERT(tensor->ne[1] == ne1);
    GGML_ASSERT(tensor->ne[2] == ne2);
    GGML_ASSERT(tensor->ne[3] == ne3);
}
struct ggml_tensor * forward_batch(
        struct llama_model    * model,
        struct llama_kv_cache * cache,
        struct ggml_context   * ctx0,
        struct ggml_cgraph    * gf,
        struct ggml_tensor    * tokens_input,
        const  int              n_tokens,
        const  int              n_past,
        const  int              n_batch) {
    const int N = n_tokens;

    struct llama_kv_cache& kv_self = *cache;
    const auto & hparams = model->hparams;
    const int n_ctx   = hparams.n_ctx;
    const int n_vocab = hparams.n_vocab;
    const int n_embd  = hparams.n_embd;
    const int n_layer = hparams.n_layer;
    const int n_head  = hparams.n_head;
    const int n_rot   = hparams.n_rot;
    const int n_ff    = get_n_ff(&hparams);

    struct ggml_tensor * tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N*n_batch);
    memcpy(tokens->data, tokens_input->data, ggml_element_size(tokens)*N*n_batch);

    struct ggml_tensor * kc = kv_self.k;
    struct ggml_tensor * vc = kv_self.v;

    // inpL shape [n_embd,N*n_batch,1]
    struct ggml_tensor * inpL = ggml_get_rows(ctx0, model->tok_embeddings, tokens);
    assert_shape_2d(inpL, n_embd, N*n_batch);
    for (int il = 0; il < n_layer; ++il) {
        struct ggml_tensor * inpSA = inpL;

        struct ggml_tensor * cur;

        // lctx.use_buf(ctx0, 0);

        // norm
        {
            // cur shape [n_embd,N*n_batch,1,1]
            cur = ggml_rms_norm(ctx0, inpL);
            assert_shape_2d(cur, n_embd, N*n_batch);

            // cur = attention_norm*cur
            cur = ggml_mul(ctx0,
                    ggml_repeat(ctx0, model->layers[il].attention_norm, cur),
                    cur);
            assert_shape_2d(cur, n_embd, N*n_batch);
        }

        // self-attention
        {
            // compute Q and K and RoPE them
            // wq   shape [n_embd, n_embd, 1, 1]
            // wk   shape [n_embd, n_embd, 1, 1]
            // Qcur shape [n_embd/n_head, n_head, N, n_batch]
            // Kcur shape [n_embd/n_head, n_head, N, n_batch]
            struct ggml_tensor * Qcur = ggml_rope(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0, 0);
            struct ggml_tensor * Kcur = ggml_rope(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0, 0);
            assert_shape_4d(Qcur, n_embd/n_head, n_head, N, n_batch);
            assert_shape_4d(Kcur, n_embd/n_head, n_head, N, n_batch);

            // store key and value to memory
            {
                // compute the transposed [N, n_embd] V matrix
                // wv   shape [n_embd, n_embd, 1, 1]
                // Vcur shape [N, n_embd, n_batch, 1]
                struct ggml_tensor * Vcur = ggml_cont(ctx0,
                    ggml_permute(ctx0,
                        ggml_reshape_3d(ctx0,
                            ggml_mul_mat(ctx0,
                                model->layers[il].wv,
                                cur),
                            n_embd, N, n_batch),
                        1, 0, 2, 3));
                assert_shape_3d(Vcur, N, n_embd, n_batch);

                // kv_self.k shape [n_embd * n_ctx * n_batch * n_layer]
                // kv_self.v shape [n_ctx * n_embd * n_batch * n_layer]
                // k         shape [n_embd * N, n_batch]    == kv_self.k[:,n_past:n_past+N,:,il]
                // v         shape [N, n_embd, n_batch, 1]  == kv_self.v[:,n_past:n_past+N,:,il]

                /* {
                    struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past));
                    struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd,
                            (   n_ctx)*ggml_element_size(kv_self.v),
                            (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v));

                    // important: storing RoPE-ed version of K in the KV cache!
                    ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
                    ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
                } //*/

                kc = ggml_set_2d(ctx0, kc,
                        ggml_reshape_2d(ctx0, Kcur, n_embd*N, n_batch),
                        ggml_element_size(kc)*n_embd*n_ctx,
                        (ggml_element_size(kc)*n_embd)*(il*n_batch*n_ctx + n_past));
                vc = ggml_set_2d(ctx0, vc,
                        ggml_reshape_2d(ctx0, Vcur, N*n_embd, n_batch),
                        ggml_element_size(vc)*n_ctx*n_embd,
                        ggml_element_size(vc)*(n_past + il*n_embd*n_batch*n_ctx));

                assert_shape_1d(kc, n_embd * n_ctx * n_batch * n_layer);
                assert_shape_1d(vc, n_embd * n_ctx * n_batch * n_layer);
            }

            // Qcur shape [n_embd/n_head, n_head, N, n_batch]
            // Q shape    [n_embd/n_head, N, n_head, n_batch]
            struct ggml_tensor * Q =
                ggml_permute(ctx0,
                        Qcur,
                        0, 2, 1, 3);
            assert_shape_4d(Q, n_embd/n_head, N, n_head, n_batch);

            // kv_self.k shape [n_embd * n_ctx * n_batch * n_layer]
            // K shape [n_embd/n_head, n_past + N, n_head, n_batch]
            struct ggml_tensor * K =
                ggml_permute(ctx0,
                        ggml_reshape_4d(ctx0,
                            ggml_view_3d(ctx0,
                                kc,
                                n_embd,
                                (n_past + N),
                                n_batch,
                                n_embd*ggml_element_size(kc),
                                n_ctx*n_embd*ggml_element_size(kc),
                                il*n_batch*n_ctx*n_embd*ggml_element_size(kc)),
                            n_embd/n_head, n_head, n_past + N, n_batch),
                        0, 2, 1, 3);
            assert_shape_4d(K, n_embd/n_head, n_past + N, n_head, n_batch);

            // K * Q
            // KQ shape [n_past + N, N, n_head, n_batch]
            struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
            assert_shape_4d(KQ, n_past + N, N, n_head, n_batch);

            // KQ_scaled = KQ / sqrt(n_embd/n_head)
            // KQ_scaled shape [n_past + N, N, n_head, n_batch]
            struct ggml_tensor * KQ_scaled =
                ggml_scale(ctx0,
                        KQ,
                        ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head)));
            assert_shape_4d(KQ_scaled, n_past + N, N, n_head, n_batch);

            // KQ_masked = mask_past(KQ_scaled)
            // KQ_masked shape [n_past + N, N, n_head, n_batch]
            struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past);
            assert_shape_4d(KQ_masked, n_past + N, N, n_head, n_batch);

            // KQ = soft_max(KQ_masked)
            // KQ_soft_max shape [n_past + N, N, n_head, n_batch]
            struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
            assert_shape_4d(KQ_soft_max, n_past + N, N, n_head, n_batch);

            // split cached V into n_head heads
            // kv_self.v shape [n_ctx * n_embd * n_batch * n_layer]
            // V shape [n_past + N, n_embd/n_head, n_head, n_batch] == kv_self.v[:(n_past+N),:,:,il]
            struct ggml_tensor * V =
                ggml_view_4d(ctx0, vc,
                        n_past + N, n_embd/n_head, n_head, n_batch,
                        ggml_element_size(vc)*n_ctx,
                        ggml_element_size(vc)*n_ctx*n_embd/n_head,
                        ggml_element_size(vc)*n_ctx*n_embd,
                        il*n_batch*n_ctx*n_embd*ggml_element_size(vc));
            assert_shape_4d(V, n_past + N, n_embd/n_head, n_head, n_batch);

            // KQV shape [n_embd/n_head, N, n_head, n_batch]
            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
            assert_shape_4d(KQV, n_embd/n_head, N, n_head, n_batch);

            // KQV_merged = KQV.permute(0, 2, 1, 3)
            // KQV_merged shape [n_embd/n_head, n_head, N, n_batch]
            struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
            assert_shape_4d(KQV_merged, n_embd/n_head, n_head, N, n_batch);
            // KQV_merged shape

            // cur = KQV_merged.contiguous().view(n_embd, N)
            // cur shape [n_embd,N*n_batch,1,1]
            cur = ggml_reshape_2d(ctx0, ggml_cont(ctx0, KQV_merged), n_embd, N*n_batch);
            assert_shape_2d(cur, n_embd, N*n_batch);
            // cur = ggml_cpy(ctx0,
            //         KQV_merged,
            //         ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));

            // projection (no bias)
            // cur shape [n_embd,N*n_batch,1,1]
            cur = ggml_mul_mat(ctx0,
                    model->layers[il].wo,
                    cur);
            assert_shape_2d(cur, n_embd, N*n_batch);
        }

        // lctx.use_buf(ctx0, 1);

        // inpFF shape [n_embd,N*n_batch,1,1]
        struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA);
        assert_shape_2d(inpFF, n_embd, N*n_batch);

        // feed-forward network
        {
            // norm
            {
                // cur shape [n_embd,N*n_batch,1,1]
                cur = ggml_rms_norm(ctx0, inpFF);
                assert_shape_2d(cur, n_embd, N*n_batch);

                // cur = ffn_norm*cur
                // cur shape [n_embd,N*n_batch,1,1]
                cur = ggml_mul(ctx0,
                        ggml_repeat(ctx0, model->layers[il].ffn_norm, cur),
                        cur);
                assert_shape_2d(cur, n_embd, N*n_batch);
            }

            // tmp shape [n_ff,N*n_batch,1,1]
            struct ggml_tensor * tmp = ggml_mul_mat(ctx0,
                    model->layers[il].w3,
                    cur);
            assert_shape_2d(tmp, n_ff, N*n_batch);

            // cur shape [n_ff,N*n_batch,1,1]
            cur = ggml_mul_mat(ctx0,
                    model->layers[il].w1,
                    cur);
            assert_shape_2d(cur, n_ff, N*n_batch);

            // SILU activation
            // cur shape [n_ff,N*n_batch,1,1]
            cur = ggml_silu(ctx0, cur);
            assert_shape_2d(cur, n_ff, N*n_batch);

            // cur shape [n_ff,N*n_batch,1,1]
            cur = ggml_mul(ctx0, cur, tmp);
            assert_shape_2d(cur, n_ff, N*n_batch);

            // cur shape [n_embd,N*n_batch,1,1]
            cur = ggml_mul_mat(ctx0,
                    model->layers[il].w2,
                    cur);
            assert_shape_2d(cur, n_embd, N*n_batch);
        }

        // cur shape [n_embd,N*n_batch,1,1]
        cur = ggml_add(ctx0, cur, inpFF);
        assert_shape_2d(cur, n_embd, N*n_batch);

        // input for next layer
        // inpL shape [n_embd,N*n_batch,1,1]
        inpL = cur;
        assert_shape_2d(inpL, n_embd, N*n_batch);
    }

    // norm
    {
        // inpL shape [n_embd,N*n_batch,1,1]
        inpL = ggml_rms_norm(ctx0, inpL);
        assert_shape_2d(inpL, n_embd, N*n_batch);

        // inpL = norm*inpL
        // inpL shape [n_embd,N*n_batch,1,1]
        inpL = ggml_mul(ctx0,
                ggml_repeat(ctx0, model->norm, inpL),
                inpL);
        assert_shape_2d(inpL, n_embd, N*n_batch);

        //embeddings = inpL;
    }

    // lm_head
    // inpL shape [n_vocab,N*n_batch,1,1]
    inpL = ggml_mul_mat(ctx0, model->output, inpL);
    assert_shape_2d(inpL, n_vocab, N*n_batch);

    {
        // inpL shape [n_vocab,N,n_batch,1]
        inpL = ggml_reshape_3d(ctx0,
                inpL,
                n_vocab, N, n_batch);
        assert_shape_3d(inpL, n_vocab, N, n_batch);
    }

    // run the computation
    ggml_build_forward_expand(gf, inpL);

    return inpL;
}
struct ggml_tensor * forward_lora(
        struct llama_model_lora * model,
        struct llama_kv_cache   * cache,
        struct ggml_context     * ctx0,
        struct ggml_cgraph      * gf,
        struct ggml_tensor      * tokens_input,
        const  int                n_tokens,
        const  int                n_past) {
    const int N = n_tokens;

    struct llama_kv_cache& kv_self = *cache;
    const auto & hparams = model->hparams;

    const int n_ctx   = hparams.n_ctx;
    const int n_embd  = hparams.n_embd;
    const int n_layer = hparams.n_layer;
    const int n_head  = hparams.n_head;
    const int n_rot   = hparams.n_rot;

    struct ggml_tensor * tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
    memcpy(tokens->data, tokens_input->data, N*ggml_element_size(tokens));

    struct ggml_tensor * kc = kv_self.k;
    struct ggml_tensor * vc = kv_self.v;

    // inpL shape [n_embd,N,1,1]
    struct ggml_tensor * inpL = ggml_get_rows(ctx0, model->tok_embeddings, tokens);
    for (int il = 0; il < n_layer; ++il) {
        struct ggml_tensor * inpSA = inpL;

        struct ggml_tensor * cur;

        // norm
        {
            // cur shape [n_embd,N,1,1]
            cur = ggml_rms_norm(ctx0, inpL);

            // cur = attention_norm*cur
            cur = ggml_mul(ctx0,
                    ggml_repeat(ctx0, model->layers[il].attention_norm, cur),
                    cur);
        }

        // self-attention
        {
            // compute Q and K and RoPE them
            // wq   shape [n_embd, n_embd, 1, 1]
            // wk   shape [n_embd, n_embd, 1, 1]
            // Qcur shape [n_embd/n_head, n_head, N, 1]
            // Kcur shape [n_embd/n_head, n_head, N, 1]
            struct ggml_tensor * Qcur = ggml_rope(ctx0,
                    ggml_reshape_3d(ctx0,
                        ggml_mul_mat(ctx0,
                            model->layers[il].wqa,
                            ggml_mul_mat(ctx0,
                                model->layers[il].wqb,
                                cur)),
                        n_embd/n_head, n_head, N),
                    n_past, n_rot, 0, 0);
            struct ggml_tensor * Kcur = ggml_rope(ctx0,
                    ggml_reshape_3d(ctx0,
                        ggml_mul_mat(ctx0,
                            model->layers[il].wka,
                            ggml_mul_mat(ctx0,
                                model->layers[il].wkb,
                                cur)),
                        n_embd/n_head, n_head, N),
                    n_past, n_rot, 0, 0);

            // store key and value to memory
            {
                // compute the transposed [N, n_embd] V matrix
                // wv   shape [n_embd, n_embd, 1, 1]
                // Vcur shape [n_embd, N, 1, 1]
                struct ggml_tensor * Vcur = ggml_cont(ctx0,
                        ggml_transpose(ctx0,
                            ggml_reshape_2d(ctx0,
                                ggml_mul_mat(ctx0,
                                    model->layers[il].wva,
                                    ggml_mul_mat(ctx0,
                                        model->layers[il].wvb,
                                        cur)),
                                n_embd, N)));

                // kv_self.k shape [n_embd * n_ctx * n_layer, 1]
                // kv_self.v shape [n_embd * n_ctx * n_layer, 1]
                // k         shape [n_embd * N, 1]   == kv_self.k[:,n_past:n_past+N,il,0]
                // v         shape [N, n_embd, 1, 1] == kv_self.v[:,n_past:n_past+N,il,0]

                /* {
                    struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past));
                    struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd,
                            (   n_ctx)*ggml_element_size(kv_self.v),
                            (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v));

                    // important: storing RoPE-ed version of K in the KV cache!
                    ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
                    ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
                } //*/

                kc = ggml_set_1d(ctx0, kc, ggml_reshape_1d(ctx0, Kcur, n_embd*N), (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past));
                vc = ggml_set_2d(ctx0, vc, Vcur, (   n_ctx)*ggml_element_size(kv_self.v),
                        (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v));
            }

            // Qcur shape [n_embd/n_head, n_head, N, 1]
            // Q shape    [n_embd/n_head, N, n_head, 1]
            struct ggml_tensor * Q =
                ggml_permute(ctx0,
                        Qcur,
                        0, 2, 1, 3);

            // kv_self.k shape [n_embd * n_ctx * n_layer, 1]
            // K shape [n_embd/n_head, n_past + N, n_head, 1]
            struct ggml_tensor * K =
                ggml_permute(ctx0,
                        ggml_reshape_3d(ctx0,
                            ggml_view_1d(ctx0, kc, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(kc)*n_embd),
                            n_embd/n_head, n_head, n_past + N),
                        0, 2, 1, 3);

            // K * Q
            // KQ shape [n_past + N, N, n_head, 1]
            struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);

            // KQ_scaled = KQ / sqrt(n_embd/n_head)
            // KQ_scaled shape [n_past + N, N, n_head, 1]
            struct ggml_tensor * KQ_scaled =
                ggml_scale(ctx0,
                        KQ,
                        ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head)));

            // KQ_masked = mask_past(KQ_scaled)
            // KQ_masked shape [n_past + N, N, n_head, 1]
            struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past);

            // KQ = soft_max(KQ_masked)
            // KQ_soft_max shape [n_past + N, N, n_head, 1]
            struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);

            // split cached V into n_head heads
            //// V shape [n_past + N, n_embd/n_head, n_head, 1]
            // V shape [n_past + N, n_embd/n_head, n_head, 1] == kv_self.v[:,:(n_past+N),il,1]
            struct ggml_tensor * V =
                ggml_view_3d(ctx0, vc,
                        n_past + N, n_embd/n_head, n_head,
                        n_ctx*ggml_element_size(vc),
                        n_ctx*ggml_element_size(vc)*n_embd/n_head,
                        il*n_ctx*ggml_element_size(vc)*n_embd);

            // KQV shape [n_embd/n_head, N, n_head, 1]
            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);

            // KQV_merged = KQV.permute(0, 2, 1, 3)
            // KQV_merged shape [n_embd/n_head, n_head, N, 1]
            struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
            // KQV_merged shape

            // cur = KQV_merged.contiguous().view(n_embd, N)
            // cur shape [n_embd,N,1,1]
            cur = ggml_reshape_2d(ctx0, ggml_cont(ctx0, KQV_merged), n_embd, N);
            // cur = ggml_cpy(ctx0,
            //         KQV_merged,
            //         ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));

            // projection (no bias)
            // cur shape [n_embd,N,1,1]
            cur = ggml_mul_mat(ctx0,
                    model->layers[il].woa,
                    ggml_mul_mat(ctx0,
                        model->layers[il].wob,
                        cur));
        }

        // inpFF shape [n_embd,N,1,1]
        struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA);

        // feed-forward network
        {
            // norm
            {
                // cur shape [n_embd,N,1,1]
                cur = ggml_rms_norm(ctx0, inpFF);

                // cur = ffn_norm*cur
                // cur shape [n_embd,N,1,1]
                cur = ggml_mul(ctx0,
                        ggml_repeat(ctx0, model->layers[il].ffn_norm, cur),
                        cur);
            }

            // tmp shape [n_ff,N,1,1]
            struct ggml_tensor * tmp = ggml_mul_mat(ctx0,
                    model->layers[il].w3,
                    cur);

            // cur shape [n_ff,N,1,1]
            cur = ggml_mul_mat(ctx0,
                    model->layers[il].w1,
                    cur);

            // SILU activation
            // cur shape [n_ff,N,1,1]
            cur = ggml_silu(ctx0, cur);

            // cur shape [n_ff,N,1,1]
            cur = ggml_mul(ctx0, cur, tmp);

            // cur shape [n_embd,N,1,1]
            cur = ggml_mul_mat(ctx0,
                    model->layers[il].w2,
                    cur);
        }

        // cur shape [n_embd,N,1,1]
        cur = ggml_add(ctx0, cur, inpFF);

        // input for next layer
        // inpL shape [n_embd,N,1,1]
        inpL = cur;
    }

    // norm
    {
        // inpL shape [n_embd,N,1,1]
        inpL = ggml_rms_norm(ctx0, inpL);

        // inpL = norm*inpL
        // inpL shape [n_embd,N,1,1]
        inpL = ggml_mul(ctx0,
                ggml_repeat(ctx0, model->norm, inpL),
                inpL);

        //embeddings = inpL;
    }

    // lm_head
    // inpL shape [n_vocab,N,1,1]
    inpL = ggml_mul_mat(ctx0,
            model->outputa,
            ggml_mul_mat(ctx0,
                model->outputb,
                inpL));

    // ggml_set_scratch(ctx0, { 0, 0, nullptr, });

    // run the computation
    ggml_build_forward_expand(gf, inpL);

    return inpL;
}
void sample_softmax(struct ggml_tensor * logits, struct ggml_tensor * probs, struct ggml_tensor * best_samples) {
    assert(logits->n_dims == 2);
    assert(probs->n_dims == 2);
    assert(best_samples->n_dims == 1);
    assert(logits->ne[1] == best_samples->ne[0]);
    assert(logits->ne[0] == probs->ne[0]);
    assert(logits->ne[1] == probs->ne[1]);
    for (int i = 0; i < logits->ne[1]; ++i) {
        float max_logit = ggml_get_f32_1d(logits, i * logits->ne[0]);
        ggml_set_i32_1d(best_samples, i, 0);
        for (int k = 0; k < logits->ne[0]; ++k) {
            float logit = ggml_get_f32_1d(logits, i * logits->ne[0] + k);
            if (logit > max_logit) {
                max_logit = logit;
                ggml_set_i32_1d(best_samples, i, k);
            }
        }
        float psum = 0;
        for (int k = 0; k < logits->ne[0]; ++k) {
            float logit = ggml_get_f32_1d(logits, i * logits->ne[0] + k);
            float p = (logit == -INFINITY) ? 0 : expf(logit - max_logit);
            psum += p;
            ggml_set_f32_1d(probs, i * probs->ne[0] + k, p);
        }
        for (int k = 0; k < logits->ne[0]; ++k) {
            float p = ggml_get_f32_1d(probs, i*probs->ne[0] + k);
            ggml_set_f32_1d(probs, i * probs->ne[0] + k, p / psum);
        }
    }
}

void sample_softmax_batch(struct ggml_context * ctx, struct ggml_tensor * logits, struct ggml_tensor * probs, struct ggml_tensor * best_samples) {
    GGML_ASSERT(best_samples->n_dims == 2);
    GGML_ASSERT(logits->n_dims == 3);
    GGML_ASSERT(probs->n_dims == 3);
    int n_tokens = best_samples->ne[0];
    int n_batch  = best_samples->ne[1];
    int n_vocab  = logits->ne[0];
    GGML_ASSERT(n_tokens == logits->ne[1]);
    GGML_ASSERT(n_batch  == logits->ne[2]);
    GGML_ASSERT(n_vocab  == probs->ne[0]);
    GGML_ASSERT(n_tokens == probs->ne[1]);
    GGML_ASSERT(n_batch  == probs->ne[2]);

    for (int k = 0; k < n_batch; ++k) {
        struct ggml_tensor * best_samples_k = ggml_view_1d(ctx,
                best_samples,
                best_samples->ne[0],
                k*best_samples->nb[1]);
        struct ggml_tensor * logits_k = ggml_view_2d(ctx,
                logits,
                logits->ne[0],
                logits->ne[1],
                logits->nb[1],
                k*logits->nb[2]);
        struct ggml_tensor * probs_k = ggml_view_2d(ctx,
                probs,
                probs->ne[0],
                probs->ne[1],
                probs->nb[1],
                k*probs->nb[2]);
        sample_softmax(logits_k, probs_k, best_samples_k);
    }
}
void print_row(struct ggml_tensor * probs, int i) {
    for (int k = 0; k < probs->ne[0]; ++k) {
        float p = ggml_get_f32_1d(probs, i*probs->ne[0] + k);
        printf(" %.2f", p);
    }
    printf("\n");
}

void print_matrix(struct ggml_tensor * probs) {
    assert(probs->n_dims == 2);
    for (int i = 0; i < probs->ne[1]; ++i) {
        for (int k = 0; k < probs->ne[0]; ++k) {
            float p = ggml_get_f32_1d(probs, i*probs->ne[0] + k);
            printf(" %.2f", p);
        }
        printf("\n");
    }
}

void print_token(int token, int n_vocab) {
    for (int k = 0; k < token; ++k) {
        printf(" ");
    }
    printf("X");
    for (int k = token+1; k < n_vocab; ++k) {
        printf(" ");
    }
    printf("\n");
}

void print_tokens(struct ggml_tensor * tokens, int n_vocab) {
    for (int i=0; i<tokens->ne[0]; ++i) {
        int token = ggml_get_i32_1d(tokens, i);
        print_token(token, n_vocab);
    }
}
  1166. void get_example_targets(int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * targets) {
  1167. int n_tokens = tokens_input->ne[0];
  1168. int n_vocab = targets->ne[0];
  1169. float randomness = 0.0f;
  1170. // ggml_set_zero(targets);
  1171. ggml_set_f32(targets, -1.0f);
  1172. ggml_set_i32_1d(tokens_input, 0, 0);
  1173. for (int i=1; i<n_tokens+1; ++i) {
  1174. float x = example_id + i * 3.14159f * 2.0f * 1.0f * 0.5f / n_tokens;
  1175. float y = sinf(x);//*cosf(x*1.1f+1.0f);
  1176. float z = (y+1.0f)*0.5f; // scale to [0..1]
  1177. z += (frand()-0.5f)*(randomness/n_vocab);
  1178. z = (z < 0.0f) ? 0.0f : (z > 1.0f) ? 1.0f : z; // clamp to [0..1]
  1179. int token = std::max(1,std::min(1+(int)(z*(float)(n_vocab-1)), n_vocab-1));
  1180. ggml_set_f32_1d(targets, (i-1)*n_vocab + token, +1.0f);
  1181. if (i<n_tokens) {
  1182. ggml_set_i32_1d(tokens_input, i, token);
  1183. }
  1184. }
  1185. }
void get_example_targets_batch(struct ggml_context * ctx, int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * targets) {
    GGML_ASSERT(tokens_input->n_dims == 2);
    GGML_ASSERT(     targets->n_dims == 3);
    int n_tokens = tokens_input->ne[0];
    int n_batch  = tokens_input->ne[1];
    GGML_ASSERT(n_tokens == targets->ne[1]);
    GGML_ASSERT(n_batch  == targets->ne[2]);

    for (int k=0; k<n_batch; ++k) {
        struct ggml_tensor * tokens_input_k = ggml_view_1d(ctx,
            tokens_input,
            tokens_input->ne[0],
            k*tokens_input->nb[1]);
        struct ggml_tensor * targets_k = ggml_view_2d(ctx,
            targets,
            targets->ne[0],
            targets->ne[1],
            targets->nb[1],
            k*targets->nb[2]);
        get_example_targets(example_id*n_batch + k, tokens_input_k, targets_k);
    }
}
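
// shift tokens_input and the corresponding target rows n_shift positions to the left;
// used during generation below to make room for newly sampled tokens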
void lshift_examples(struct ggml_tensor * tokens_input, struct ggml_tensor * targets, int n_shift) {
    int n_tokens = tokens_input->ne[0];
    int n_vocab  = targets->ne[0];
    for (int i=0; i<n_tokens-n_shift; ++i) {
        ggml_set_i32_1d(tokens_input, i, ggml_get_i32_1d(tokens_input, i + n_shift));
        for (int k=0; k<n_vocab; ++k) {
            ggml_set_f32_1d(targets, i*n_vocab + k, ggml_get_f32_1d(targets, (i + n_shift)*n_vocab + k));
        }
    }
}
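
// sum of squared differences between a and b over all elements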
struct ggml_tensor * square_error_loss(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b) {
    // todo: instead of a-b: a[1:]-b[:-1]
    return ggml_sum(ctx, ggml_sqr(ctx, ggml_sub(ctx, a, b)));
}
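
// cross-entropy-style loss between two sets of logits:
//   loss = -sum( softmax(a) * log(softmax(b) + eps) )
// summed over all rows and batch entries; eps keeps the log away from log(0)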
struct ggml_tensor * cross_entropy_loss(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b) {
    const float eps = 1e-3f;
    return
        ggml_sum(ctx,
            ggml_neg(ctx,
                ggml_sum_rows(ctx,
                    ggml_mul(ctx,
                        ggml_soft_max(ctx, a),
                        ggml_log(ctx,
                            ggml_add1(ctx,
                                ggml_soft_max(ctx, b),
                                ggml_new_f32(ctx, eps)))))));
}
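
// train a tiny, randomly initialized llama model on the synthetic sine-wave token task
// defined above, then generate tokens from it; all graphs are evaluated on the CPU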
int main(int argc, char ** argv) {
    if (argc < 1) {
        fprintf(stderr, "usage: %s\n", argv[0]);
        return 1;
    }

    struct ggml_init_params lcparams;
    lcparams.mem_size   = 1024ll*1024ll*1024ll;
    lcparams.mem_buffer = NULL;
    lcparams.no_alloc   = false;

    struct llama_model model;
    model.hparams.n_vocab = 8;
    model.hparams.n_ctx   = 8;
    model.hparams.n_embd  = 32;
    model.hparams.n_mult  = 2;
    model.hparams.n_head  = 8;
    model.hparams.n_layer = 1;
    model.hparams.n_rot   = std::min(16u, model.hparams.n_embd / model.hparams.n_head);

    // model.hparams.n_embd  = 32;
    // model.hparams.n_mult  = 2;
    // model.hparams.n_head  = 4;
    // model.hparams.n_layer = 8;
    // model.hparams.n_rot   = 8;

    model.ctx = ggml_init(lcparams);
    printf("init model\n");
    init_model(&model);
    set_param_model(&model);

    randomize_model(&model, 1337, 0.0f, 1.0f, -1.0f, +1.0f);
/*
    struct llama_model_lora model_lora;
    // model.hparams.n_vocab = 6;
    // model.hparams.n_ctx   = 64;
    // model.hparams.n_embd  = 128;
    // model.hparams.n_mult  = 2;
    // model.hparams.n_head  = 8;
    // model.hparams.n_layer = 6;
    // model.hparams.n_rot   = model.hparams.n_embd / model.hparams.n_head;
    model_lora.hparams.n_vocab = 16;
    model_lora.hparams.n_ctx   = 32;
    model_lora.hparams.n_embd  = 256;
    model_lora.hparams.n_mult  = 2;
    model_lora.hparams.n_head  = 16;
    model_lora.hparams.n_layer = 1;
    model_lora.hparams.n_lora  = 64;
    model_lora.hparams.n_rot   = MIN(16, model_lora.hparams.n_embd / model_lora.hparams.n_head);
    // model.hparams.n_rot = (model.hparams.n_embd / model.hparams.n_head) / 2;

    // model.hparams.n_embd  = 32;
    // model.hparams.n_mult  = 2;
    // model.hparams.n_head  = 4;
    // model.hparams.n_layer = 8;
    // model.hparams.n_rot   = 8;

    model_lora.ctx = ggml_init(lcparams);
    printf("init model_lora\n");
    init_model_lora(&model_lora);
    set_param_model_lora(&model_lora);

    randomize_model_lora(&model_lora, 1337, 0.0f, 1.0f, -1.0f, +1.0f);
*/
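
    // training setup: a KV cache that shares the model context, a 1 GiB scratch buffer
    // reused as the ggml compute context for every example, and problem sizes taken
    // from the hyperparameters above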
    int n_batch = 8;

    // key + value cache for the self attention
    struct llama_kv_cache kv_self;
    printf("init_kv_cache\n");
    kv_self.ctx = model.ctx;
    init_kv_cache(&kv_self, &model, n_batch);
    //init_kv_cache_lora(&kv_self, &model_lora);

    size_t    compute_size = 1024ll*1024ll*1024ll;
    uint8_t * compute_addr = new uint8_t[compute_size];

    int n_examples = 256;
    int n_tokens   = model.hparams.n_ctx;
    int n_vocab    = model.hparams.n_vocab;

    std::vector<uint8_t> work_buffer;
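
    // training loop: for each example, build a fresh compute graph in ctx0, run the
    // forward pass to obtain the loss, let ggml_opt() (L-BFGS here) update the model
    // parameters, then re-evaluate the graph to report the loss after optimization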
    for (int ex=0; ex<n_examples; ++ex) {
        struct ggml_init_params params = {
            /*.mem_size   =*/ compute_size,
            /*.mem_buffer =*/ compute_addr,
            /*.no_alloc   =*/ false,
        };

        struct ggml_context * ctx0 = ggml_init(params);

        struct ggml_tensor * after_opt_best_samples = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, n_tokens, n_batch);
        struct ggml_tensor * after_opt_probs        = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_vocab, n_tokens, n_batch);
        struct ggml_tensor * tokens_input           = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, n_tokens, n_batch);
        struct ggml_tensor * targets                = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_vocab, n_tokens, n_batch);

        int n_past = 0;

        ggml_cgraph gf = {};

        get_example_targets_batch(ctx0, 64*ex+0, tokens_input, targets);

        struct ggml_tensor * logits = forward_batch(&model, &kv_self, ctx0, &gf, tokens_input, n_tokens, n_past, n_batch);
        // struct ggml_tensor * e = cross_entropy_loss(ctx0, targets, logits);
        struct ggml_tensor * e = square_error_loss(ctx0, targets, logits);

        ggml_build_forward_expand(&gf, e);
        ggml_graph_compute_helper(work_buffer, &gf, /*n_threads*/ 1);

        float error_before_opt = ggml_get_f32_1d(e, 0);

        struct ggml_opt_params opt_params_adam  = ggml_opt_default_params(GGML_OPT_ADAM);
        struct ggml_opt_params opt_params_lbfgs = ggml_opt_default_params(GGML_OPT_LBFGS);
        opt_params_adam.print_forward_graph   = false;
        opt_params_adam.print_backward_graph  = false;
        opt_params_lbfgs.print_forward_graph  = false;
        opt_params_lbfgs.print_backward_graph = false;
        opt_params_adam.adam.n_iter   = 16;
        opt_params_lbfgs.lbfgs.n_iter = 16;
        // ggml_opt(ctx0, opt_params_adam, e);
        ggml_opt(ctx0, opt_params_lbfgs, e);
        //
        ggml_build_forward_expand(&gf, e);
        ggml_graph_compute_helper(work_buffer, &gf, /*n_threads*/ 1);

        float error_after_opt = ggml_get_f32_1d(e, 0);

        if (ex % 8 == 0) {
            printf("Example %d\n", (ex+1));
            printf("error_before_opt: %.2f\n", error_before_opt);
            printf("error_after_opt: %.2f\n", error_after_opt);
        }

        if (ex % 64 == 0) {
            sample_softmax_batch(ctx0, logits, after_opt_probs, after_opt_best_samples);
            // printf("probabilities after optimization:\n");
            // print_matrix(after_opt_probs);
            printf("best samples after optimization:\n");
            print_tokens(after_opt_best_samples, n_vocab);
        }

        ggml_free(ctx0);
    }
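
    // generation: seed the context with one synthetic example, then repeatedly run the
    // forward pass over the first sample_ctx tokens, take the token that sample_softmax()
    // reports as best at the last position, print it, and shift it into the context window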
    {
        int n_gen = 128;
        int sample_ctx = n_tokens-n_tokens/8;

        printf("Generating %d tokens.\n", n_gen);

        struct ggml_tensor * tokens_input = ggml_new_tensor_1d(model.ctx, GGML_TYPE_I32, n_tokens);
        struct ggml_tensor * targets      = ggml_new_tensor_2d(model.ctx, GGML_TYPE_F32, n_vocab, n_tokens);

        get_example_targets(137, tokens_input, targets);
        for (int i=sample_ctx; i<n_tokens; ++i) {
            ggml_set_i32_1d(tokens_input, i, n_vocab/2);
        }

        for (int i=0; i<sample_ctx-1; ++i) {
            print_token(ggml_get_i32_1d(tokens_input, i), n_vocab);
        }
        printf("---\n");

        for (int i=0; i<n_gen; ++i) {
            struct ggml_init_params params = {
                /*.mem_size   =*/ compute_size,
                /*.mem_buffer =*/ compute_addr,
                /*.no_alloc   =*/ false,
            };
            struct ggml_context * ctx0 = ggml_init(params);

            ggml_cgraph gf = {};

            int n_past = 0;

            struct ggml_tensor * logits = forward(&model, &kv_self, ctx0, &gf, tokens_input, sample_ctx, n_past);

            ggml_build_forward_expand(&gf, logits);
            ggml_graph_compute_helper(work_buffer, &gf, /*n_threads*/ 1);

            struct ggml_tensor * best_samples = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, sample_ctx);
            struct ggml_tensor * probs        = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_vocab, sample_ctx);

            sample_softmax(logits, probs, best_samples);

            // int sample_at = n_tokens-1;
            int token = ggml_get_i32_1d(best_samples, sample_ctx-1);

            // print_row(probs, sample_at);
            print_token(token, n_vocab);

            lshift_examples(tokens_input, targets, 1);
            ggml_set_i32_1d(tokens_input, 0, 0);
            ggml_set_i32_1d(tokens_input, sample_ctx-1, token);

            ggml_free(ctx0);
        }
    }
    print_matrix(model.tok_embeddings);

    printf("done\n");

    // ggml_free(kv_self.ctx);
    // ggml_free(model_lora.ctx);
    ggml_free(model.ctx);

    // release the manually allocated compute buffer
    delete[] compute_addr;

    return 0;
}