// baby-llama.cpp
#include "ggml.h"
#include <vector>
#include <cassert>
#include <random>
#include <cstring>
#include <cmath>     // sqrtf, sinf, expf
#include <cstdio>    // printf, fprintf
#include <cstdlib>   // rand, RAND_MAX
#include <algorithm> // std::min, std::max

float frand() {
    return (float)rand()/(float)RAND_MAX;
}
struct random_normal_distribution {
    std::mt19937 gen;
    std::normal_distribution<float> nd;
    float min;
    float max;
};

void init_random_normal_distribution(struct random_normal_distribution * rnd, int seed, float mean, float std, float min, float max) {
    rnd->gen = std::mt19937(seed);
    rnd->nd = std::normal_distribution<float>{mean, std};
    rnd->min = min;
    rnd->max = max;
}

float frand_normal(struct random_normal_distribution * rnd) {
    const float r = rnd->nd(rnd->gen);
    return ((r < rnd->min) ? (rnd->min) : (r > rnd->max) ? (rnd->max) : r);
}
struct ggml_tensor * randomize_tensor(
        struct ggml_tensor * tensor,
        int ndims,
        const int64_t ne[],
        float fmin,
        float fmax) {
    switch (ndims) {
        case 1:
            for (int i0 = 0; i0 < ne[0]; i0++) {
                ((float *)tensor->data)[i0] = frand()*(fmax - fmin) + fmin;
            }
            break;
        case 2:
            for (int i1 = 0; i1 < ne[1]; i1++) {
                for (int i0 = 0; i0 < ne[0]; i0++) {
                    ((float *)tensor->data)[i1*ne[0] + i0] = frand()*(fmax - fmin) + fmin;
                }
            }
            break;
        case 3:
            for (int i2 = 0; i2 < ne[2]; i2++) {
                for (int i1 = 0; i1 < ne[1]; i1++) {
                    for (int i0 = 0; i0 < ne[0]; i0++) {
                        ((float *)tensor->data)[i2*ne[1]*ne[0] + i1*ne[0] + i0] = frand()*(fmax - fmin) + fmin;
                    }
                }
            }
            break;
        case 4:
            for (int i3 = 0; i3 < ne[3]; i3++) {
                for (int i2 = 0; i2 < ne[2]; i2++) {
                    for (int i1 = 0; i1 < ne[1]; i1++) {
                        for (int i0 = 0; i0 < ne[0]; i0++) {
                            ((float *)tensor->data)[i3*ne[2]*ne[1]*ne[0] + i2*ne[1]*ne[0] + i1*ne[0] + i0] = frand()*(fmax - fmin) + fmin;
                        }
                    }
                }
            }
            break;
        default:
            assert(false);
    };
    return tensor;
}
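
// Note: randomize_tensor_normal below draws from a clipped normal distribution
// and applies a Xavier-style scale of 1/sqrt(fan), using ne[0] for 1-D tensors
// and ne[0]+ne[1] for higher-dimensional tensors, as written in the switch below.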
struct ggml_tensor * randomize_tensor_normal(
        struct ggml_tensor * tensor,
        int ndims,
        const int64_t ne[],
        struct random_normal_distribution * rnd) {
    float scale = 1.0; // xavier
    switch (ndims) {
        case 1:
            scale /= sqrtf(ne[0]);
            for (int i0 = 0; i0 < ne[0]; i0++) {
                ((float *)tensor->data)[i0] = scale * frand_normal(rnd);
            }
            break;
        case 2:
            scale /= sqrtf(ne[0]+ne[1]);
            for (int i1 = 0; i1 < ne[1]; i1++) {
                for (int i0 = 0; i0 < ne[0]; i0++) {
                    ((float *)tensor->data)[i1*ne[0] + i0] = scale * frand_normal(rnd);
                }
            }
            break;
        case 3:
            scale /= sqrtf(ne[0]+ne[1]);
            for (int i2 = 0; i2 < ne[2]; i2++) {
                for (int i1 = 0; i1 < ne[1]; i1++) {
                    for (int i0 = 0; i0 < ne[0]; i0++) {
                        ((float *)tensor->data)[i2*ne[1]*ne[0] + i1*ne[0] + i0] = scale * frand_normal(rnd);
                    }
                }
            }
            break;
        case 4:
            scale /= sqrtf(ne[0]+ne[1]);
            for (int i3 = 0; i3 < ne[3]; i3++) {
                for (int i2 = 0; i2 < ne[2]; i2++) {
                    for (int i1 = 0; i1 < ne[1]; i1++) {
                        for (int i0 = 0; i0 < ne[0]; i0++) {
                            ((float *)tensor->data)[i3*ne[2]*ne[1]*ne[0] + i2*ne[1]*ne[0] + i1*ne[0] + i0] = scale * frand_normal(rnd);
                        }
                    }
                }
            }
            break;
        default:
            assert(false);
    };
    return tensor;
}
struct llama_hparams {
    uint32_t n_vocab = 32000;
    uint32_t n_ctx   = 512;   // this is provided as user input?
    uint32_t n_embd  = 4096;
    uint32_t n_mult  = 4;
    uint32_t n_head  = 32;
    uint32_t n_layer = 32;
    uint32_t n_rot   = 64;

    bool operator!=(const llama_hparams & other) const {
        return memcmp(this, &other, sizeof(llama_hparams)) != 0;
    }
};

uint32_t get_n_ff(const struct llama_hparams* hparams) {
    const uint32_t n_ff = ((2*(4*hparams->n_embd)/3 + hparams->n_mult - 1)/hparams->n_mult)*hparams->n_mult;
    return n_ff;
}

struct llama_hparams_lora {
    uint32_t n_vocab = 32000;
    uint32_t n_ctx   = 512;   // this is provided as user input?
    uint32_t n_embd  = 4096;
    uint32_t n_mult  = 4;
    uint32_t n_head  = 32;
    uint32_t n_layer = 32;
    uint32_t n_rot   = 64;
    uint32_t n_lora  = 64;

    bool operator!=(const llama_hparams_lora & other) const {
        return memcmp(this, &other, sizeof(llama_hparams_lora)) != 0;
    }
};
struct llama_layer {
    // normalization
    struct ggml_tensor * attention_norm;

    // attention
    struct ggml_tensor * wq;
    struct ggml_tensor * wk;
    struct ggml_tensor * wv;
    struct ggml_tensor * wo;

    // normalization
    struct ggml_tensor * ffn_norm;

    // ff
    struct ggml_tensor * w1;
    struct ggml_tensor * w2;
    struct ggml_tensor * w3;
};

struct llama_layer_lora {
    // normalization
    struct ggml_tensor * attention_norm;

    // attention
    struct ggml_tensor * wqa;
    struct ggml_tensor * wqb;
    struct ggml_tensor * wka;
    struct ggml_tensor * wkb;
    struct ggml_tensor * wva;
    struct ggml_tensor * wvb;
    struct ggml_tensor * woa;
    struct ggml_tensor * wob;

    // normalization
    struct ggml_tensor * ffn_norm;

    // ff
    struct ggml_tensor * w1;
    struct ggml_tensor * w2;
    struct ggml_tensor * w3;
};

struct llama_kv_cache {
    struct ggml_context * ctx = NULL;

    struct ggml_tensor * k;
    struct ggml_tensor * v;

    // llama_ctx_buffer buf;

    int n; // number of tokens currently in the cache
};

struct llama_model {
    struct ggml_context * ctx = NULL;

    llama_hparams hparams;

    struct ggml_tensor * tok_embeddings;

    struct ggml_tensor * norm;
    struct ggml_tensor * output;

    std::vector<llama_layer> layers;
};

struct llama_model_lora {
    struct ggml_context * ctx = NULL;

    llama_hparams_lora hparams;

    struct ggml_tensor * tok_embeddings;

    struct ggml_tensor * norm;
    struct ggml_tensor * outputa;
    struct ggml_tensor * outputb;

    std::vector<llama_layer_lora> layers;
};
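
// The init_* functions below allocate the model weights as F32 ggml tensors
// inside model->ctx. Shapes follow the LLaMA layout noted in the trailing
// comments: tok_embeddings and output are [n_embd, n_vocab], each attention
// projection is [n_embd, n_embd], and the feed-forward uses w1/w3 of
// [n_embd, n_ff] with w2 of [n_ff, n_embd].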
void init_model(struct llama_model * model) {
    const auto & hparams = model->hparams;

    const uint32_t n_embd  = hparams.n_embd;
    const uint32_t n_layer = hparams.n_layer;
    const uint32_t n_vocab = hparams.n_vocab;

    const uint32_t n_ff = get_n_ff(&hparams);

    struct ggml_context * ctx = model->ctx;

    model->tok_embeddings = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab); // ("tok_embeddings.weight", {n_embd, n_vocab});
    model->norm           = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);          // ("norm.weight", {n_embd});
    model->output         = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab); // ("output.weight", {n_embd, n_vocab});

    model->layers.resize(n_layer);
    for (uint32_t i = 0; i < n_layer; ++i) {
        auto & layer = model->layers[i];

        // std::string layers_i = "layers." + std::to_string(i);

        layer.attention_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); // (layers_i + ".attention_norm.weight", {n_embd});

        layer.wq = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd); // (layers_i + ".attention.wq.weight", {n_embd, n_embd});
        layer.wk = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd); // (layers_i + ".attention.wk.weight", {n_embd, n_embd});
        layer.wv = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd); // (layers_i + ".attention.wv.weight", {n_embd, n_embd});
        layer.wo = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd); // (layers_i + ".attention.wo.weight", {n_embd, n_embd});

        layer.ffn_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); // (layers_i + ".ffn_norm.weight", {n_embd});

        layer.w1 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ff); // (layers_i + ".feed_forward.w1.weight", {n_embd, n_ff});
        layer.w2 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_ff, n_embd); // (layers_i + ".feed_forward.w2.weight", { n_ff, n_embd});
        layer.w3 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ff); // (layers_i + ".feed_forward.w3.weight", {n_embd, n_ff});
    }
}
void init_model_lora(struct llama_model_lora * model) {
    const auto & hparams = model->hparams;

    const uint32_t n_embd  = hparams.n_embd;
    const uint32_t n_mult  = hparams.n_mult;
    const uint32_t n_layer = hparams.n_layer;
    const uint32_t n_vocab = hparams.n_vocab;
    const uint32_t n_lora  = hparams.n_lora;

    const uint32_t n_ff = ((2*(4*n_embd)/3 + n_mult - 1)/n_mult)*n_mult;

    struct ggml_context * ctx = model->ctx;

    model->tok_embeddings = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab); // ("tok_embeddings.weight", {n_embd, n_vocab});
    model->norm           = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);          // ("norm.weight", {n_embd});
    model->outputa        = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_lora, n_vocab); // ("output.weight", {n_embd, n_vocab});
    model->outputb        = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_lora);  // ("output.weight", {n_embd, n_vocab});

    model->layers.resize(n_layer);
    for (uint32_t i = 0; i < n_layer; ++i) {
        auto & layer = model->layers[i];

        // std::string layers_i = "layers." + std::to_string(i);

        layer.attention_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); // (layers_i + ".attention_norm.weight", {n_embd});

        layer.wqa = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_lora, n_embd); // (layers_i + ".attention.wq.weight", {n_embd, n_embd});
        layer.wqb = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_lora); // (layers_i + ".attention.wq.weight", {n_embd, n_embd});
        layer.wka = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_lora, n_embd); // (layers_i + ".attention.wk.weight", {n_embd, n_embd});
        layer.wkb = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_lora); // (layers_i + ".attention.wk.weight", {n_embd, n_embd});
        layer.wva = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_lora, n_embd); // (layers_i + ".attention.wv.weight", {n_embd, n_embd});
        layer.wvb = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_lora); // (layers_i + ".attention.wv.weight", {n_embd, n_embd});
        layer.woa = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_lora, n_embd); // (layers_i + ".attention.wo.weight", {n_embd, n_embd});
        layer.wob = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_lora); // (layers_i + ".attention.wo.weight", {n_embd, n_embd});

        layer.ffn_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); // (layers_i + ".ffn_norm.weight", {n_embd});

        layer.w1 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ff); // (layers_i + ".feed_forward.w1.weight", {n_embd, n_ff});
        layer.w2 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_ff, n_embd); // (layers_i + ".feed_forward.w2.weight", { n_ff, n_embd});
        layer.w3 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ff); // (layers_i + ".feed_forward.w3.weight", {n_embd, n_ff});
    }
}
void set_param_model(struct llama_model * model) {
    const auto & hparams = model->hparams;

    const uint32_t n_layer = hparams.n_layer;

    struct ggml_context * ctx = model->ctx;

    ggml_set_param(ctx, model->tok_embeddings);
    ggml_set_param(ctx, model->norm);
    ggml_set_param(ctx, model->output);

    for (uint32_t i = 0; i < n_layer; ++i) {
        auto & layer = model->layers[i];

        ggml_set_param(ctx, layer.attention_norm);
        ggml_set_param(ctx, layer.wq);
        ggml_set_param(ctx, layer.wk);
        ggml_set_param(ctx, layer.wv);
        ggml_set_param(ctx, layer.wo);
        ggml_set_param(ctx, layer.ffn_norm);
        ggml_set_param(ctx, layer.w1);
        ggml_set_param(ctx, layer.w2);
        ggml_set_param(ctx, layer.w3);
    }
}

void set_param_model_lora(struct llama_model_lora * model) {
    const auto & hparams = model->hparams;

    const uint32_t n_layer = hparams.n_layer;

    struct ggml_context * ctx = model->ctx;

    ggml_set_param(ctx, model->tok_embeddings);
    ggml_set_param(ctx, model->norm);
    ggml_set_param(ctx, model->outputa);
    ggml_set_param(ctx, model->outputb);

    for (uint32_t i = 0; i < n_layer; ++i) {
        auto & layer = model->layers[i];

        ggml_set_param(ctx, layer.attention_norm);
        ggml_set_param(ctx, layer.wqa);
        ggml_set_param(ctx, layer.wqb);
        ggml_set_param(ctx, layer.wka);
        ggml_set_param(ctx, layer.wkb);
        ggml_set_param(ctx, layer.wva);
        ggml_set_param(ctx, layer.wvb);
        ggml_set_param(ctx, layer.woa);
        ggml_set_param(ctx, layer.wob);
        ggml_set_param(ctx, layer.ffn_norm);
        ggml_set_param(ctx, layer.w1);
        ggml_set_param(ctx, layer.w2);
        ggml_set_param(ctx, layer.w3);
    }
}
void randomize_model(struct llama_model * model, int seed, float mean, float std, float min, float max) {
    const auto & hparams = model->hparams;

    const uint32_t n_layer = hparams.n_layer;

    struct random_normal_distribution rnd;
    init_random_normal_distribution(&rnd, seed, mean, std, min, max);
    randomize_tensor_normal(model->tok_embeddings, model->tok_embeddings->n_dims, model->tok_embeddings->ne, &rnd);
    randomize_tensor_normal(model->norm,           model->norm->n_dims,           model->norm->ne,           &rnd);
    randomize_tensor_normal(model->output,         model->output->n_dims,         model->output->ne,         &rnd);

    for (uint32_t i = 0; i < n_layer; ++i) {
        auto & layer = model->layers[i];
        randomize_tensor_normal(layer.attention_norm, layer.attention_norm->n_dims, layer.attention_norm->ne, &rnd);

        randomize_tensor_normal(layer.wq, layer.wq->n_dims, layer.wq->ne, &rnd);
        randomize_tensor_normal(layer.wk, layer.wk->n_dims, layer.wk->ne, &rnd);
        randomize_tensor_normal(layer.wv, layer.wv->n_dims, layer.wv->ne, &rnd);
        randomize_tensor_normal(layer.wo, layer.wo->n_dims, layer.wo->ne, &rnd);

        randomize_tensor_normal(layer.ffn_norm, layer.ffn_norm->n_dims, layer.ffn_norm->ne, &rnd);

        randomize_tensor_normal(layer.w1, layer.w1->n_dims, layer.w1->ne, &rnd);
        randomize_tensor_normal(layer.w2, layer.w2->n_dims, layer.w2->ne, &rnd);
        randomize_tensor_normal(layer.w3, layer.w3->n_dims, layer.w3->ne, &rnd);
    }
}

void randomize_model_lora(struct llama_model_lora * model, int seed, float mean, float std, float min, float max) {
    const auto & hparams = model->hparams;

    const uint32_t n_layer = hparams.n_layer;

    struct random_normal_distribution rnd;
    init_random_normal_distribution(&rnd, seed, mean, std, min, max);
    randomize_tensor_normal(model->tok_embeddings, model->tok_embeddings->n_dims, model->tok_embeddings->ne, &rnd);
    randomize_tensor_normal(model->norm,           model->norm->n_dims,           model->norm->ne,           &rnd);
    randomize_tensor_normal(model->outputa,        model->outputa->n_dims,        model->outputa->ne,        &rnd);
    randomize_tensor_normal(model->outputb,        model->outputb->n_dims,        model->outputb->ne,        &rnd);

    for (uint32_t i = 0; i < n_layer; ++i) {
        auto & layer = model->layers[i];
        randomize_tensor_normal(layer.attention_norm, layer.attention_norm->n_dims, layer.attention_norm->ne, &rnd);

        randomize_tensor_normal(layer.wqa, layer.wqa->n_dims, layer.wqa->ne, &rnd);
        randomize_tensor_normal(layer.wqb, layer.wqb->n_dims, layer.wqb->ne, &rnd);
        randomize_tensor_normal(layer.wka, layer.wka->n_dims, layer.wka->ne, &rnd);
        randomize_tensor_normal(layer.wkb, layer.wkb->n_dims, layer.wkb->ne, &rnd);
        randomize_tensor_normal(layer.wva, layer.wva->n_dims, layer.wva->ne, &rnd);
        randomize_tensor_normal(layer.wvb, layer.wvb->n_dims, layer.wvb->ne, &rnd);
        randomize_tensor_normal(layer.woa, layer.woa->n_dims, layer.woa->ne, &rnd);
        randomize_tensor_normal(layer.wob, layer.wob->n_dims, layer.wob->ne, &rnd);

        randomize_tensor_normal(layer.ffn_norm, layer.ffn_norm->n_dims, layer.ffn_norm->ne, &rnd);

        randomize_tensor_normal(layer.w1, layer.w1->n_dims, layer.w1->ne, &rnd);
        randomize_tensor_normal(layer.w2, layer.w2->n_dims, layer.w2->ne, &rnd);
        randomize_tensor_normal(layer.w3, layer.w3->n_dims, layer.w3->ne, &rnd);
    }
}
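
// KV-cache sizing: the cache stores one K and one V vector of n_embd floats per
// token, per layer, per batch row, so
//   n_elements = n_embd * (n_layer * n_ctx * n_batch)
// and the context below reserves 2*n_elements F32 values plus ~2 MB of slack.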
bool init_kv_cache(struct llama_kv_cache* cache, struct llama_model * model, int n_batch) {
    const auto & hparams = model->hparams;

    const uint32_t n_ctx   = hparams.n_ctx;
    const uint32_t n_embd  = hparams.n_embd;
    const uint32_t n_layer = hparams.n_layer;

    const int64_t n_mem      = n_layer*n_ctx*n_batch;
    const int64_t n_elements = n_embd*n_mem;

    // cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB);

    // struct ggml_init_params params;
    // params.mem_size   = cache.buf.size;
    // params.mem_buffer = cache.buf.addr;
    // params.no_alloc   = false;
    if (!cache->ctx) {
        struct ggml_init_params params;
        params.mem_size   = 2u*n_elements*ggml_type_size(GGML_TYPE_F32) + 2u*1024*1024;
        params.mem_buffer = NULL;
        params.no_alloc   = false;

        cache->ctx = ggml_init(params);

        if (!cache->ctx) {
            fprintf(stderr, "%s: failed to allocate memory for kv cache\n", __func__);
            return false;
        }
    }

    cache->k = ggml_new_tensor_1d(cache->ctx, GGML_TYPE_F32, n_elements);
    cache->v = ggml_new_tensor_1d(cache->ctx, GGML_TYPE_F32, n_elements);

    return true;
}

bool init_kv_cache_lora(struct llama_kv_cache* cache, struct llama_model_lora * model, int n_batch) {
    const auto & hparams = model->hparams;

    const uint32_t n_ctx   = hparams.n_ctx;
    const uint32_t n_embd  = hparams.n_embd;
    const uint32_t n_layer = hparams.n_layer;

    const int64_t n_mem      = n_layer*n_ctx*n_batch;
    const int64_t n_elements = n_embd*n_mem;

    // cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB);

    // struct ggml_init_params params;
    // params.mem_size   = cache.buf.size;
    // params.mem_buffer = cache.buf.addr;
    // params.no_alloc   = false;
    if (!cache->ctx) {
        struct ggml_init_params params;
        params.mem_size   = 2u*n_elements*ggml_type_size(GGML_TYPE_F32) + 2u*1024*1024;
        params.mem_buffer = NULL;
        params.no_alloc   = false;

        cache->ctx = ggml_init(params);

        if (!cache->ctx) {
            fprintf(stderr, "%s: failed to allocate memory for kv cache\n", __func__);
            return false;
        }
    }

    cache->k = ggml_new_tensor_1d(cache->ctx, GGML_TYPE_F32, n_elements);
    cache->v = ggml_new_tensor_1d(cache->ctx, GGML_TYPE_F32, n_elements);

    return true;
}
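
// forward() builds the ggml graph for a single sequence (no batch dimension):
// for each layer it applies RMSNorm, RoPE'd Q/K projections, writes K and V into
// the cache via ggml_set_1d/ggml_set_2d, performs scaled masked attention, the
// wo output projection, and a SwiGLU-style feed-forward (silu(w1*x) * (w3*x),
// projected back with w2), followed by a final RMSNorm and the lm_head matmul
// against model->output.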
struct ggml_tensor * forward(
        struct llama_model    * model,
        struct llama_kv_cache * cache,
        struct ggml_context   * ctx0,
        struct ggml_cgraph    * gf,
        struct ggml_tensor    * tokens_input,
        const  int              n_tokens,
        const  int              n_past) {
    const int N = n_tokens;

    struct llama_kv_cache& kv_self = *cache;
    const auto & hparams = model->hparams;
    const int n_ctx   = hparams.n_ctx;
    const int n_embd  = hparams.n_embd;
    const int n_layer = hparams.n_layer;
    const int n_head  = hparams.n_head;
    const int n_rot   = hparams.n_rot;

    struct ggml_tensor * tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
    memcpy(tokens->data, tokens_input->data, N*ggml_element_size(tokens));

    struct ggml_tensor * kc = kv_self.k;
    struct ggml_tensor * vc = kv_self.v;

    // inpL shape [n_embd,N,1,1]
    struct ggml_tensor * inpL = ggml_get_rows(ctx0, model->tok_embeddings, tokens);
    for (int il = 0; il < n_layer; ++il) {
        struct ggml_tensor * inpSA = inpL;

        struct ggml_tensor * cur;

        // lctx.use_buf(ctx0, 0);

        // norm
        {
            // cur shape [n_embd,N,1,1]
            cur = ggml_rms_norm(ctx0, inpL);

            // cur = attention_norm*cur
            cur = ggml_mul(ctx0,
                        ggml_repeat(ctx0, model->layers[il].attention_norm, cur),
                        cur);
        }

        // self-attention
        {
            // compute Q and K and RoPE them
            // wq   shape [n_embd, n_embd, 1, 1]
            // wk   shape [n_embd, n_embd, 1, 1]
            // Qcur shape [n_embd/n_head, n_head, N, 1]
            // Kcur shape [n_embd/n_head, n_head, N, 1]
            struct ggml_tensor * Qcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);
            struct ggml_tensor * Kcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);

            // store key and value to memory
            {
                // compute the transposed [N, n_embd] V matrix
                // wv   shape [n_embd, n_embd, 1, 1]
                // Vcur shape [n_embd, N, 1, 1]
                struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wv, cur), n_embd, N)));

                // kv_self.k shape [n_embd * n_ctx * n_layer, 1]
                // kv_self.v shape [n_embd * n_ctx * n_layer, 1]
                // k         shape [n_embd * N, 1]   == kv_self.k[:,n_past:n_past+N,il,0]
                // v         shape [N, n_embd, 1, 1] == kv_self.v[:,n_past:n_past+N,il,0]

                /* {
                    struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past));
                    struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd,
                            (   n_ctx)*ggml_element_size(kv_self.v),
                            (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v));

                    // important: storing RoPE-ed version of K in the KV cache!
                    ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
                    ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
                } //*/

                kc = ggml_set_1d(ctx0, kc, ggml_reshape_1d(ctx0, Kcur, n_embd*N), (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past));
                vc = ggml_set_2d(ctx0, vc, Vcur, (   n_ctx)*ggml_element_size(kv_self.v),
                        (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v));
            }

            // Qcur shape [n_embd/n_head, n_head, N, 1]
            // Q shape    [n_embd/n_head, N, n_head, 1]
            struct ggml_tensor * Q =
                ggml_permute(ctx0,
                        Qcur,
                        0, 2, 1, 3);

            // kv_self.k shape [n_embd * n_ctx * n_layer, 1]
            // K shape [n_embd/n_head, n_past + N, n_head, 1]
            struct ggml_tensor * K =
                ggml_permute(ctx0,
                        ggml_reshape_3d(ctx0,
                            ggml_view_1d(ctx0, kc, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(kc)*n_embd),
                            n_embd/n_head, n_head, n_past + N),
                        0, 2, 1, 3);

            // K * Q
            // KQ shape [n_past + N, N, n_head, 1]
            struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);

            // KQ_scaled = KQ / sqrt(n_embd/n_head)
            // KQ_scaled shape [n_past + N, N, n_head, 1]
            struct ggml_tensor * KQ_scaled =
                ggml_scale(ctx0,
                        KQ,
                        ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head)));

            // KQ_masked = mask_past(KQ_scaled)
            // KQ_masked shape [n_past + N, N, n_head, 1]
            struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past);

            // KQ = soft_max(KQ_masked)
            // KQ_soft_max shape [n_past + N, N, n_head, 1]
            struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);

            // split cached V into n_head heads
            //// V shape [n_past + N, n_embd/n_head, n_head, 1]
            // V shape [n_past + N, n_embd/n_head, n_head, 1] == kv_self.v[:,:(n_past+N),il,1]
            struct ggml_tensor * V =
                ggml_view_3d(ctx0, vc,
                        n_past + N, n_embd/n_head, n_head,
                        n_ctx*ggml_element_size(vc),
                        n_ctx*ggml_element_size(vc)*n_embd/n_head,
                        il*n_ctx*ggml_element_size(vc)*n_embd);

            // KQV shape [n_embd/n_head, N, n_head, 1]
            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);

            // KQV_merged = KQV.permute(0, 2, 1, 3)
            // KQV_merged shape [n_embd/n_head, n_head, N, 1]
            struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
            // KQV_merged shape

            // cur = KQV_merged.contiguous().view(n_embd, N)
            // cur shape [n_embd,N,1,1]
            cur = ggml_reshape_2d(ctx0, ggml_cont(ctx0, KQV_merged), n_embd, N);
            // cur = ggml_cpy(ctx0,
            //         KQV_merged,
            //         ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));

            // projection (no bias)
            // cur shape [n_embd,N,1,1]
            cur = ggml_mul_mat(ctx0,
                    model->layers[il].wo,
                    cur);
        }

        // lctx.use_buf(ctx0, 1);

        // inpFF shape [n_embd,N,1,1]
        struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA);

        // feed-forward network
        {
            // norm
            {
                // cur shape [n_embd,N,1,1]
                cur = ggml_rms_norm(ctx0, inpFF);

                // cur = ffn_norm*cur
                // cur shape [n_embd,N,1,1]
                cur = ggml_mul(ctx0,
                        ggml_repeat(ctx0, model->layers[il].ffn_norm, cur),
                        cur);
            }

            // tmp shape [n_ff,N,1,1]
            struct ggml_tensor * tmp = ggml_mul_mat(ctx0,
                    model->layers[il].w3,
                    cur);

            // cur shape [n_ff,N,1,1]
            cur = ggml_mul_mat(ctx0,
                    model->layers[il].w1,
                    cur);

            // SILU activation
            // cur shape [n_ff,N,1,1]
            cur = ggml_silu(ctx0, cur);

            // cur shape [n_ff,N,1,1]
            cur = ggml_mul(ctx0, cur, tmp);

            // cur shape [n_embd,N,1,1]
            cur = ggml_mul_mat(ctx0,
                    model->layers[il].w2,
                    cur);
        }

        // cur shape [n_embd,N,1,1]
        cur = ggml_add(ctx0, cur, inpFF);

        // input for next layer
        // inpL shape [n_embd,N,1,1]
        inpL = cur;
    }

    // norm
    {
        // inpL shape [n_embd,N,1,1]
        inpL = ggml_rms_norm(ctx0, inpL);

        // inpL = norm*inpL
        // inpL shape [n_embd,N,1,1]
        inpL = ggml_mul(ctx0,
                    ggml_repeat(ctx0, model->norm, inpL),
                    inpL);

        //embeddings = inpL;
    }

    // lm_head
    // inpL shape [n_vocab,N,1,1]
    inpL = ggml_mul_mat(ctx0, model->output, inpL);

    // run the computation
    ggml_build_forward_expand(gf, inpL);

    return inpL;
}
void assert_shape_1d(struct ggml_tensor * tensor, int64_t ne0) {
    GGML_ASSERT(tensor->n_dims == 1);
    GGML_ASSERT(tensor->ne[0] == ne0);
}

void assert_shape_2d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1) {
    GGML_ASSERT(tensor->n_dims == 2);
    GGML_ASSERT(tensor->ne[0] == ne0);
    GGML_ASSERT(tensor->ne[1] == ne1);
}

void assert_shape_3d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2) {
    GGML_ASSERT(tensor->n_dims == 3);
    GGML_ASSERT(tensor->ne[0] == ne0);
    GGML_ASSERT(tensor->ne[1] == ne1);
    GGML_ASSERT(tensor->ne[2] == ne2);
}

void assert_shape_4d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3) {
    GGML_ASSERT(tensor->n_dims == 4);
    GGML_ASSERT(tensor->ne[0] == ne0);
    GGML_ASSERT(tensor->ne[1] == ne1);
    GGML_ASSERT(tensor->ne[2] == ne2);
    GGML_ASSERT(tensor->ne[3] == ne3);
}
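
// forward_batch() is the same computation as forward() with an extra batch
// dimension: activations are [n_embd, N*n_batch], Q/K/V carry n_batch as the
// 4th dimension, and the KV cache is indexed as if shaped
// [n_embd, n_ctx, n_batch, n_layer] for K and [n_ctx, n_embd, n_batch, n_layer]
// for V. The assert_shape_* helpers above document the expected shape at each step.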
struct ggml_tensor * forward_batch(
        struct llama_model    * model,
        struct llama_kv_cache * cache,
        struct ggml_context   * ctx0,
        struct ggml_cgraph    * gf,
        struct ggml_tensor    * tokens_input,
        const  int              n_tokens,
        const  int              n_past,
        const  int              n_batch) {
    const int N = n_tokens;

    struct llama_kv_cache& kv_self = *cache;
    const auto & hparams = model->hparams;
    const int n_ctx   = hparams.n_ctx;
    const int n_vocab = hparams.n_vocab;
    const int n_embd  = hparams.n_embd;
    const int n_layer = hparams.n_layer;
    const int n_head  = hparams.n_head;
    const int n_rot   = hparams.n_rot;
    const int n_ff    = get_n_ff(&hparams);

    struct ggml_tensor * tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N*n_batch);
    memcpy(tokens->data, tokens_input->data, ggml_element_size(tokens)*N*n_batch);

    struct ggml_tensor * kc = kv_self.k;
    struct ggml_tensor * vc = kv_self.v;

    // inpL shape [n_embd,N*n_batch,1]
    struct ggml_tensor * inpL = ggml_get_rows(ctx0, model->tok_embeddings, tokens);
    assert_shape_2d(inpL, n_embd, N*n_batch);
    for (int il = 0; il < n_layer; ++il) {
        struct ggml_tensor * inpSA = inpL;

        struct ggml_tensor * cur;

        // lctx.use_buf(ctx0, 0);

        // norm
        {
            // cur shape [n_embd,N*n_batch,1,1]
            cur = ggml_rms_norm(ctx0, inpL);
            assert_shape_2d(cur, n_embd, N*n_batch);

            // cur = attention_norm*cur
            cur = ggml_mul(ctx0,
                        ggml_repeat(ctx0, model->layers[il].attention_norm, cur),
                        cur);
            assert_shape_2d(cur, n_embd, N*n_batch);
        }

        // self-attention
        {
            // compute Q and K and RoPE them
            // wq   shape [n_embd, n_embd, 1, 1]
            // wk   shape [n_embd, n_embd, 1, 1]
            // Qcur shape [n_embd/n_head, n_head, N, n_batch]
            // Kcur shape [n_embd/n_head, n_head, N, n_batch]
            struct ggml_tensor * Qcur = ggml_rope(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0);
            struct ggml_tensor * Kcur = ggml_rope(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0);
            assert_shape_4d(Qcur, n_embd/n_head, n_head, N, n_batch);
            assert_shape_4d(Kcur, n_embd/n_head, n_head, N, n_batch);

            // store key and value to memory
            {
                // compute the transposed [N, n_embd] V matrix
                // wv   shape [n_embd, n_embd, 1, 1]
                // Vcur shape [N, n_embd, n_batch, 1]
                struct ggml_tensor * Vcur = ggml_cont(ctx0,
                    ggml_permute(ctx0,
                        ggml_reshape_3d(ctx0,
                            ggml_mul_mat(ctx0,
                                model->layers[il].wv,
                                cur),
                            n_embd, N, n_batch),
                        1, 0, 2, 3));
                assert_shape_3d(Vcur, N, n_embd, n_batch);

                // kv_self.k shape [n_embd * n_ctx * n_batch * n_layer]
                // kv_self.v shape [n_ctx * n_embd * n_batch * n_layer]
                // k         shape [n_embd * N, n_batch]    == kv_self.k[:,n_past:n_past+N,:,il]
                // v         shape [N, n_embd, n_batch, 1]  == kv_self.v[:,n_past:n_past+N,:,il]

                /* {
                    struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past));
                    struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd,
                            (   n_ctx)*ggml_element_size(kv_self.v),
                            (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v));

                    // important: storing RoPE-ed version of K in the KV cache!
                    ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
                    ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
                } //*/

                kc = ggml_set_2d(ctx0, kc,
                        ggml_reshape_2d(ctx0, Kcur, n_embd*N, n_batch),
                        ggml_element_size(kc)*n_embd*n_ctx,
                        (ggml_element_size(kc)*n_embd)*(il*n_batch*n_ctx + n_past));
                vc = ggml_set_2d(ctx0, vc,
                        ggml_reshape_2d(ctx0, Vcur, N*n_embd, n_batch),
                        ggml_element_size(vc)*n_ctx*n_embd,
                        ggml_element_size(vc)*(n_past + il*n_embd*n_batch*n_ctx));

                assert_shape_1d(kc, n_embd * n_ctx * n_batch * n_layer);
                assert_shape_1d(vc, n_embd * n_ctx * n_batch * n_layer);
            }

            // Qcur shape [n_embd/n_head, n_head, N, n_batch]
            // Q shape    [n_embd/n_head, N, n_head, n_batch]
            struct ggml_tensor * Q =
                ggml_permute(ctx0,
                        Qcur,
                        0, 2, 1, 3);
            assert_shape_4d(Q, n_embd/n_head, N, n_head, n_batch);

            // kv_self.k shape [n_embd * n_ctx * n_batch * n_layer]
            // K shape [n_embd/n_head, n_past + N, n_head, n_batch]
            struct ggml_tensor * K =
                ggml_permute(ctx0,
                        ggml_reshape_4d(ctx0,
                            ggml_view_3d(ctx0,
                                kc,
                                n_embd,
                                (n_past + N),
                                n_batch,
                                n_embd*ggml_element_size(kc),
                                n_ctx*n_embd*ggml_element_size(kc),
                                il*n_batch*n_ctx*n_embd*ggml_element_size(kc)),
                            n_embd/n_head, n_head, n_past + N, n_batch),
                        0, 2, 1, 3);
            assert_shape_4d(K, n_embd/n_head, n_past + N, n_head, n_batch);

            // K * Q
            // KQ shape [n_past + N, N, n_head, n_batch]
            struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
            assert_shape_4d(KQ, n_past + N, N, n_head, n_batch);

            // KQ_scaled = KQ / sqrt(n_embd/n_head)
            // KQ_scaled shape [n_past + N, N, n_head, n_batch]
            struct ggml_tensor * KQ_scaled =
                ggml_scale(ctx0,
                        KQ,
                        ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head)));
            assert_shape_4d(KQ_scaled, n_past + N, N, n_head, n_batch);

            // KQ_masked = mask_past(KQ_scaled)
            // KQ_masked shape [n_past + N, N, n_head, n_batch]
            struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past);
            assert_shape_4d(KQ_masked, n_past + N, N, n_head, n_batch);

            // KQ = soft_max(KQ_masked)
            // KQ_soft_max shape [n_past + N, N, n_head, n_batch]
            struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
            assert_shape_4d(KQ_soft_max, n_past + N, N, n_head, n_batch);

            // split cached V into n_head heads
            // kv_self.v shape [n_ctx * n_embd * n_batch * n_layer]
            // V shape [n_past + N, n_embd/n_head, n_head, n_batch] == kv_self.v[:(n_past+N),:,:,il]
            struct ggml_tensor * V =
                ggml_view_4d(ctx0, vc,
                        n_past + N, n_embd/n_head, n_head, n_batch,
                        ggml_element_size(vc)*n_ctx,
                        ggml_element_size(vc)*n_ctx*n_embd/n_head,
                        ggml_element_size(vc)*n_ctx*n_embd,
                        il*n_batch*n_ctx*n_embd*ggml_element_size(vc));
            assert_shape_4d(V, n_past + N, n_embd/n_head, n_head, n_batch);

            // KQV shape [n_embd/n_head, N, n_head, n_batch]
            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
            assert_shape_4d(KQV, n_embd/n_head, N, n_head, n_batch);

            // KQV_merged = KQV.permute(0, 2, 1, 3)
            // KQV_merged shape [n_embd/n_head, n_head, N, n_batch]
            struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
            assert_shape_4d(KQV_merged, n_embd/n_head, n_head, N, n_batch);
            // KQV_merged shape

            // cur = KQV_merged.contiguous().view(n_embd, N)
            // cur shape [n_embd,N*n_batch,1,1]
            cur = ggml_reshape_2d(ctx0, ggml_cont(ctx0, KQV_merged), n_embd, N*n_batch);
            assert_shape_2d(cur, n_embd, N*n_batch);
            // cur = ggml_cpy(ctx0,
            //         KQV_merged,
            //         ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));

            // projection (no bias)
            // cur shape [n_embd,N*n_batch,1,1]
            cur = ggml_mul_mat(ctx0,
                    model->layers[il].wo,
                    cur);
            assert_shape_2d(cur, n_embd, N*n_batch);
        }

        // lctx.use_buf(ctx0, 1);

        // inpFF shape [n_embd,N*n_batch,1,1]
        struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA);
        assert_shape_2d(inpFF, n_embd, N*n_batch);

        // feed-forward network
        {
            // norm
            {
                // cur shape [n_embd,N*n_batch,1,1]
                cur = ggml_rms_norm(ctx0, inpFF);
                assert_shape_2d(cur, n_embd, N*n_batch);

                // cur = ffn_norm*cur
                // cur shape [n_embd,N*n_batch,1,1]
                cur = ggml_mul(ctx0,
                        ggml_repeat(ctx0, model->layers[il].ffn_norm, cur),
                        cur);
                assert_shape_2d(cur, n_embd, N*n_batch);
            }

            // tmp shape [n_ff,N*n_batch,1,1]
            struct ggml_tensor * tmp = ggml_mul_mat(ctx0,
                    model->layers[il].w3,
                    cur);
            assert_shape_2d(tmp, n_ff, N*n_batch);

            // cur shape [n_ff,N*n_batch,1,1]
            cur = ggml_mul_mat(ctx0,
                    model->layers[il].w1,
                    cur);
            assert_shape_2d(cur, n_ff, N*n_batch);

            // SILU activation
            // cur shape [n_ff,N*n_batch,1,1]
            cur = ggml_silu(ctx0, cur);
            assert_shape_2d(cur, n_ff, N*n_batch);

            // cur shape [n_ff,N*n_batch,1,1]
            cur = ggml_mul(ctx0, cur, tmp);
            assert_shape_2d(cur, n_ff, N*n_batch);

            // cur shape [n_embd,N*n_batch,1,1]
            cur = ggml_mul_mat(ctx0,
                    model->layers[il].w2,
                    cur);
            assert_shape_2d(cur, n_embd, N*n_batch);
        }

        // cur shape [n_embd,N*n_batch,1,1]
        cur = ggml_add(ctx0, cur, inpFF);
        assert_shape_2d(cur, n_embd, N*n_batch);

        // input for next layer
        // inpL shape [n_embd,N*n_batch,1,1]
        inpL = cur;
        assert_shape_2d(inpL, n_embd, N*n_batch);
    }

    // norm
    {
        // inpL shape [n_embd,N*n_batch,1,1]
        inpL = ggml_rms_norm(ctx0, inpL);
        assert_shape_2d(inpL, n_embd, N*n_batch);

        // inpL = norm*inpL
        // inpL shape [n_embd,N*n_batch,1,1]
        inpL = ggml_mul(ctx0,
                    ggml_repeat(ctx0, model->norm, inpL),
                    inpL);
        assert_shape_2d(inpL, n_embd, N*n_batch);

        //embeddings = inpL;
    }

    // lm_head
    // inpL shape [n_vocab,N*n_batch,1,1]
    inpL = ggml_mul_mat(ctx0, model->output, inpL);
    assert_shape_2d(inpL, n_vocab, N*n_batch);

    {
        // inpL shape [n_vocab,N,n_batch,1]
        inpL = ggml_reshape_3d(ctx0,
                        inpL,
                        n_vocab, N, n_batch);
        assert_shape_3d(inpL, n_vocab, N, n_batch);
    }

    // run the computation
    ggml_build_forward_expand(gf, inpL);

    return inpL;
}
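
// forward_lora() mirrors forward(), but every attention projection is factored
// into two low-rank matrices with inner dimension n_lora: e.g. the wq matmul is
// replaced by wqa * (wqb * x), and the lm_head becomes outputa * (outputb * x).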
struct ggml_tensor * forward_lora(
        struct llama_model_lora * model,
        struct llama_kv_cache   * cache,
        struct ggml_context     * ctx0,
        struct ggml_cgraph      * gf,
        struct ggml_tensor      * tokens_input,
        const  int                n_tokens,
        const  int                n_past) {
    const int N = n_tokens;

    struct llama_kv_cache& kv_self = *cache;
    const auto & hparams = model->hparams;

    const int n_ctx   = hparams.n_ctx;
    const int n_embd  = hparams.n_embd;
    const int n_layer = hparams.n_layer;
    const int n_head  = hparams.n_head;
    const int n_rot   = hparams.n_rot;

    struct ggml_tensor * tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
    memcpy(tokens->data, tokens_input->data, N*ggml_element_size(tokens));

    struct ggml_tensor * kc = kv_self.k;
    struct ggml_tensor * vc = kv_self.v;

    // inpL shape [n_embd,N,1,1]
    struct ggml_tensor * inpL = ggml_get_rows(ctx0, model->tok_embeddings, tokens);
    for (int il = 0; il < n_layer; ++il) {
        struct ggml_tensor * inpSA = inpL;

        struct ggml_tensor * cur;

        // norm
        {
            // cur shape [n_embd,N,1,1]
            cur = ggml_rms_norm(ctx0, inpL);

            // cur = attention_norm*cur
            cur = ggml_mul(ctx0,
                        ggml_repeat(ctx0, model->layers[il].attention_norm, cur),
                        cur);
        }

        // self-attention
        {
            // compute Q and K and RoPE them
            // wq   shape [n_embd, n_embd, 1, 1]
            // wk   shape [n_embd, n_embd, 1, 1]
            // Qcur shape [n_embd/n_head, n_head, N, 1]
            // Kcur shape [n_embd/n_head, n_head, N, 1]
            struct ggml_tensor * Qcur = ggml_rope(ctx0,
                                            ggml_reshape_3d(ctx0,
                                                ggml_mul_mat(ctx0,
                                                    model->layers[il].wqa,
                                                    ggml_mul_mat(ctx0,
                                                        model->layers[il].wqb,
                                                        cur)),
                                                n_embd/n_head, n_head, N),
                                            n_past, n_rot, 0);
            struct ggml_tensor * Kcur = ggml_rope(ctx0,
                                            ggml_reshape_3d(ctx0,
                                                ggml_mul_mat(ctx0,
                                                    model->layers[il].wka,
                                                    ggml_mul_mat(ctx0,
                                                        model->layers[il].wkb,
                                                        cur)),
                                                n_embd/n_head, n_head, N),
                                            n_past, n_rot, 0);

            // store key and value to memory
            {
                // compute the transposed [N, n_embd] V matrix
                // wv   shape [n_embd, n_embd, 1, 1]
                // Vcur shape [n_embd, N, 1, 1]
                struct ggml_tensor * Vcur = ggml_cont(ctx0,
                                                ggml_transpose(ctx0,
                                                    ggml_reshape_2d(ctx0,
                                                        ggml_mul_mat(ctx0,
                                                            model->layers[il].wva,
                                                            ggml_mul_mat(ctx0,
                                                                model->layers[il].wvb,
                                                                cur)),
                                                        n_embd, N)));

                // kv_self.k shape [n_embd * n_ctx * n_layer, 1]
                // kv_self.v shape [n_embd * n_ctx * n_layer, 1]
                // k         shape [n_embd * N, 1]   == kv_self.k[:,n_past:n_past+N,il,0]
                // v         shape [N, n_embd, 1, 1] == kv_self.v[:,n_past:n_past+N,il,0]

                /* {
                    struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past));
                    struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd,
                            (   n_ctx)*ggml_element_size(kv_self.v),
                            (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v));

                    // important: storing RoPE-ed version of K in the KV cache!
                    ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
                    ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
                } //*/

                kc = ggml_set_1d(ctx0, kc, ggml_reshape_1d(ctx0, Kcur, n_embd*N), (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past));
                vc = ggml_set_2d(ctx0, vc, Vcur, (   n_ctx)*ggml_element_size(kv_self.v),
                        (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v));
            }

            // Qcur shape [n_embd/n_head, n_head, N, 1]
            // Q shape    [n_embd/n_head, N, n_head, 1]
            struct ggml_tensor * Q =
                ggml_permute(ctx0,
                        Qcur,
                        0, 2, 1, 3);

            // kv_self.k shape [n_embd * n_ctx * n_layer, 1]
            // K shape [n_embd/n_head, n_past + N, n_head, 1]
            struct ggml_tensor * K =
                ggml_permute(ctx0,
                        ggml_reshape_3d(ctx0,
                            ggml_view_1d(ctx0, kc, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(kc)*n_embd),
                            n_embd/n_head, n_head, n_past + N),
                        0, 2, 1, 3);

            // K * Q
            // KQ shape [n_past + N, N, n_head, 1]
            struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);

            // KQ_scaled = KQ / sqrt(n_embd/n_head)
            // KQ_scaled shape [n_past + N, N, n_head, 1]
            struct ggml_tensor * KQ_scaled =
                ggml_scale(ctx0,
                        KQ,
                        ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head)));

            // KQ_masked = mask_past(KQ_scaled)
            // KQ_masked shape [n_past + N, N, n_head, 1]
            struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past);

            // KQ = soft_max(KQ_masked)
            // KQ_soft_max shape [n_past + N, N, n_head, 1]
            struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);

            // split cached V into n_head heads
            //// V shape [n_past + N, n_embd/n_head, n_head, 1]
            // V shape [n_past + N, n_embd/n_head, n_head, 1] == kv_self.v[:,:(n_past+N),il,1]
            struct ggml_tensor * V =
                ggml_view_3d(ctx0, vc,
                        n_past + N, n_embd/n_head, n_head,
                        n_ctx*ggml_element_size(vc),
                        n_ctx*ggml_element_size(vc)*n_embd/n_head,
                        il*n_ctx*ggml_element_size(vc)*n_embd);

            // KQV shape [n_embd/n_head, N, n_head, 1]
            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);

            // KQV_merged = KQV.permute(0, 2, 1, 3)
            // KQV_merged shape [n_embd/n_head, n_head, N, 1]
            struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
            // KQV_merged shape

            // cur = KQV_merged.contiguous().view(n_embd, N)
            // cur shape [n_embd,N,1,1]
            cur = ggml_reshape_2d(ctx0, ggml_cont(ctx0, KQV_merged), n_embd, N);
            // cur = ggml_cpy(ctx0,
            //         KQV_merged,
            //         ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));

            // projection (no bias)
            // cur shape [n_embd,N,1,1]
            cur = ggml_mul_mat(ctx0,
                    model->layers[il].woa,
                    ggml_mul_mat(ctx0,
                        model->layers[il].wob,
                        cur));
        }

        // inpFF shape [n_embd,N,1,1]
        struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA);

        // feed-forward network
        {
            // norm
            {
                // cur shape [n_embd,N,1,1]
                cur = ggml_rms_norm(ctx0, inpFF);

                // cur = ffn_norm*cur
                // cur shape [n_embd,N,1,1]
                cur = ggml_mul(ctx0,
                        ggml_repeat(ctx0, model->layers[il].ffn_norm, cur),
                        cur);
            }

            // tmp shape [n_ff,N,1,1]
            struct ggml_tensor * tmp = ggml_mul_mat(ctx0,
                    model->layers[il].w3,
                    cur);

            // cur shape [n_ff,N,1,1]
            cur = ggml_mul_mat(ctx0,
                    model->layers[il].w1,
                    cur);

            // SILU activation
            // cur shape [n_ff,N,1,1]
            cur = ggml_silu(ctx0, cur);

            // cur shape [n_ff,N,1,1]
            cur = ggml_mul(ctx0, cur, tmp);

            // cur shape [n_embd,N,1,1]
            cur = ggml_mul_mat(ctx0,
                    model->layers[il].w2,
                    cur);
        }

        // cur shape [n_embd,N,1,1]
        cur = ggml_add(ctx0, cur, inpFF);

        // input for next layer
        // inpL shape [n_embd,N,1,1]
        inpL = cur;
    }

    // norm
    {
        // inpL shape [n_embd,N,1,1]
        inpL = ggml_rms_norm(ctx0, inpL);

        // inpL = norm*inpL
        // inpL shape [n_embd,N,1,1]
        inpL = ggml_mul(ctx0,
                    ggml_repeat(ctx0, model->norm, inpL),
                    inpL);

        //embeddings = inpL;
    }

    // lm_head
    // inpL shape [n_vocab,N,1,1]
    inpL = ggml_mul_mat(ctx0,
                model->outputa,
                    ggml_mul_mat(ctx0,
                        model->outputb,
                        inpL));

    // ggml_set_scratch(ctx0, { 0, 0, nullptr, });
    // run the computation
    ggml_build_forward_expand(gf, inpL);

    return inpL;
}
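
// sample_softmax() does greedy decoding: for each position it records the
// argmax logit in best_samples and writes the numerically stabilized softmax of
// the logits into probs (subtracting max_logit before expf, then normalizing).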
void sample_softmax(struct ggml_tensor * logits, struct ggml_tensor * probs, struct ggml_tensor * best_samples) {
    assert(logits->n_dims == 2);
    assert(probs->n_dims == 2);
    assert(best_samples->n_dims == 1);
    assert(logits->ne[1] == best_samples->ne[0]);
    assert(logits->ne[0] == probs->ne[0]);
    assert(logits->ne[1] == probs->ne[1]);
    for (int i = 0; i < logits->ne[1]; ++i) {
        float max_logit = ggml_get_f32_1d(logits, i * logits->ne[0]);
        ggml_set_i32_1d(best_samples, i, 0);
        for (int k = 0; k < logits->ne[0]; ++k) {
            float logit = ggml_get_f32_1d(logits, i * logits->ne[0] + k);
            if (logit > max_logit) {
                max_logit = logit;
                ggml_set_i32_1d(best_samples, i, k);
            }
        }
        float psum = 0;
        for (int k = 0; k < logits->ne[0]; ++k) {
            float logit = ggml_get_f32_1d(logits, i * logits->ne[0] + k);
            float p = (logit == -INFINITY) ? 0 : expf(logit - max_logit);
            psum += p;
            ggml_set_f32_1d(probs, i * probs->ne[0] + k, p);
        }
        for (int k = 0; k < logits->ne[0]; ++k) {
            float p = ggml_get_f32_1d(probs, i*probs->ne[0] + k);
            ggml_set_f32_1d(probs, i * probs->ne[0] + k, p / psum);
        }
    }
}
void sample_softmax_batch(struct ggml_context * ctx, struct ggml_tensor * logits, struct ggml_tensor * probs, struct ggml_tensor * best_samples) {
    GGML_ASSERT(best_samples->n_dims == 2);
    GGML_ASSERT(logits->n_dims == 3);
    GGML_ASSERT(probs->n_dims == 3);
    int n_tokens = best_samples->ne[0];
    int n_batch  = best_samples->ne[1];
    int n_vocab  = logits->ne[0];
    GGML_ASSERT(n_tokens == logits->ne[1]);
    GGML_ASSERT(n_batch  == logits->ne[2]);
    GGML_ASSERT(n_vocab  == probs->ne[0]);
    GGML_ASSERT(n_tokens == probs->ne[1]);
    GGML_ASSERT(n_batch  == probs->ne[2]);

    for (int k = 0; k < n_batch; ++k) {
        struct ggml_tensor * best_samples_k = ggml_view_1d(ctx,
                                                best_samples,
                                                best_samples->ne[0],
                                                k*best_samples->nb[1]);
        struct ggml_tensor * logits_k       = ggml_view_2d(ctx,
                                                logits,
                                                logits->ne[0],
                                                logits->ne[1],
                                                logits->nb[1],
                                                k*logits->nb[2]);
        struct ggml_tensor * probs_k        = ggml_view_2d(ctx,
                                                probs,
                                                probs->ne[0],
                                                probs->ne[1],
                                                probs->nb[1],
                                                k*probs->nb[2]);
        sample_softmax(logits_k, probs_k, best_samples_k);
    }
}
void print_row(struct ggml_tensor * probs, int i) {
    for (int k = 0; k < probs->ne[0]; ++k) {
        float p = ggml_get_f32_1d(probs, i*probs->ne[0] + k);
        printf(" %.2f", p);
    }
    printf("\n");
}

void print_matrix(struct ggml_tensor * probs) {
    assert(probs->n_dims == 2);
    for (int i = 0; i < probs->ne[1]; ++i) {
        for (int k = 0; k < probs->ne[0]; ++k) {
            float p = ggml_get_f32_1d(probs, i*probs->ne[0] + k);
            printf(" %.2f", p);
        }
        printf("\n");
    }
}

void print_token(int token, int n_vocab) {
    for (int k = 0; k < token; ++k) {
        printf(" ");
    }
    printf("X");
    for (int k = token+1; k < n_vocab; ++k) {
        printf(" ");
    }
    printf("\n");
}

void print_tokens(struct ggml_tensor * tokens, int n_vocab) {
    for (int i=0; i<tokens->ne[0]; ++i) {
        int token = ggml_get_i32_1d(tokens, i);
        print_token(token, n_vocab);
    }
}
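
// The training data is synthetic: get_example_targets() maps each position to a
// point on a sine wave, quantizes it into the token range [1, n_vocab-1], marks
// it in the targets tensor (+1 at the target index, -1 elsewhere) and, except
// for the last position, also feeds it back as the next input token.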
void get_example_targets(int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * targets) {
    int n_tokens = tokens_input->ne[0];
    int n_vocab  = targets->ne[0];
    float randomness = 0.0f;
    // ggml_set_zero(targets);
    ggml_set_f32(targets, -1.0f);
    ggml_set_i32_1d(tokens_input, 0, 0);
    for (int i=1; i<n_tokens+1; ++i) {
        float x = example_id + i * 3.14159f * 2.0f * 1.0f * 0.5f / n_tokens;
        float y = sinf(x);//*cosf(x*1.1f+1.0f);
        float z = (y+1.0f)*0.5f; // scale to [0..1]
        z += (frand()-0.5f)*(randomness/n_vocab);
        z = (z < 0.0f) ? 0.0f : (z > 1.0f) ? 1.0f : z; // clamp to [0..1]
        int token = std::max(1,std::min(1+(int)(z*(float)(n_vocab-1)), n_vocab-1));
        ggml_set_f32_1d(targets, (i-1)*n_vocab + token, +1.0f);
        if (i<n_tokens) {
            ggml_set_i32_1d(tokens_input, i, token);
        }
    }
}

void get_example_targets_batch(struct ggml_context * ctx, int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * targets) {
    GGML_ASSERT(tokens_input->n_dims == 2);
    GGML_ASSERT(     targets->n_dims == 3);
    int n_tokens = tokens_input->ne[0];
    int n_batch  = tokens_input->ne[1];
    GGML_ASSERT(n_tokens == targets->ne[1]);
    GGML_ASSERT(n_batch  == targets->ne[2]);

    for (int k=0; k<n_batch; ++k) {
        struct ggml_tensor * tokens_input_k = ggml_view_1d(ctx,
                                                tokens_input,
                                                tokens_input->ne[0],
                                                k*tokens_input->nb[1]);
        struct ggml_tensor * targets_k      = ggml_view_2d(ctx,
                                                targets,
                                                targets->ne[0],
                                                targets->ne[1],
                                                targets->nb[1],
                                                k*targets->nb[2]);
        get_example_targets(example_id*n_batch + k, tokens_input_k, targets_k);
    }
}
void lshift_examples(struct ggml_tensor * tokens_input, struct ggml_tensor * targets, int n_shift) {
    int n_tokens = tokens_input->ne[0];
    int n_vocab  = targets->ne[0];
    for (int i=0; i<n_tokens-n_shift; ++i) {
        ggml_set_i32_1d(tokens_input, i, ggml_get_i32_1d(tokens_input, i + n_shift));
        for (int k=0; k<n_vocab; ++k) {
            ggml_set_f32_1d(targets, i*n_vocab + k, ggml_get_f32_1d(targets, (i + n_shift)*n_vocab + k));
        }
    }
}
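
// sum of squared differences between targets and logits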
struct ggml_tensor * square_error_loss(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b) {
    // todo: instead of a-b: a[1:]-b[:-1]
    return ggml_sum(ctx, ggml_sqr(ctx, ggml_sub(ctx, a, b)));
}
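
// cross entropy: -sum(softmax(a) * log(softmax(b) + eps)); eps keeps the log finite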
struct ggml_tensor * cross_entropy_loss(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b) {
    const float eps = 1e-3;
    return
        ggml_sum(ctx,
            ggml_neg(ctx,
                ggml_sum_rows(ctx,
                    ggml_mul(ctx,
                        ggml_soft_max(ctx, a),
                        ggml_log(ctx,
                            ggml_add1(ctx,
                                ggml_soft_max(ctx, b),
                                ggml_new_f32(ctx, eps)))))));
}
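
// train a tiny randomly initialized llama model on the synthetic sine-wave data,
// then sample tokens from it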
int main(int argc, char ** argv) {
    if (argc < 1) {
        fprintf(stderr, "usage: %s\n", argv[0]);
        return 1;
    }
    struct ggml_init_params lcparams;
    lcparams.mem_size   = 1024ll*1024ll*1024ll;
    lcparams.mem_buffer = NULL;
    lcparams.no_alloc   = false;

    struct llama_model model;
    model.hparams.n_vocab = 8;
    model.hparams.n_ctx   = 8;
    model.hparams.n_embd  = 32;
    model.hparams.n_mult  = 2;
    model.hparams.n_head  = 8;
    model.hparams.n_layer = 1;
    model.hparams.n_rot   = std::min(16u, model.hparams.n_embd / model.hparams.n_head);

    // model.hparams.n_embd  = 32;
    // model.hparams.n_mult  = 2;
    // model.hparams.n_head  = 4;
    // model.hparams.n_layer = 8;
    // model.hparams.n_rot   = 8;

    model.ctx = ggml_init(lcparams);
    printf("init model\n");
    init_model(&model);
    set_param_model(&model);

    randomize_model(&model, 1337, 0.0f, 1.0f, -1.0f, +1.0f);
/*
    struct llama_model_lora model_lora;
    // model.hparams.n_vocab = 6;
    // model.hparams.n_ctx   = 64;
    // model.hparams.n_embd  = 128;
    // model.hparams.n_mult  = 2;
    // model.hparams.n_head  = 8;
    // model.hparams.n_layer = 6;
    // model.hparams.n_rot   = model.hparams.n_embd / model.hparams.n_head;

    model_lora.hparams.n_vocab = 16;
    model_lora.hparams.n_ctx   = 32;
    model_lora.hparams.n_embd  = 256;
    model_lora.hparams.n_mult  = 2;
    model_lora.hparams.n_head  = 16;
    model_lora.hparams.n_layer = 1;
    model_lora.hparams.n_lora  = 64;
    model_lora.hparams.n_rot   = MIN(16, model_lora.hparams.n_embd / model_lora.hparams.n_head);
    // model.hparams.n_rot = (model.hparams.n_embd / model.hparams.n_head) / 2;

    // model.hparams.n_embd  = 32;
    // model.hparams.n_mult  = 2;
    // model.hparams.n_head  = 4;
    // model.hparams.n_layer = 8;
    // model.hparams.n_rot   = 8;

    model_lora.ctx = ggml_init(lcparams);
    printf("init model_lora\n");

    init_model_lora(&model_lora);
    set_param_model_lora(&model_lora);

    randomize_model_lora(&model_lora, 1337, 0.0f, 1.0f, -1.0f, +1.0f);
*/
    int n_batch = 8;

    // key + value cache for the self attention
    struct llama_kv_cache kv_self;
    printf("init_kv_cache\n");
    kv_self.ctx = model.ctx;
    init_kv_cache(&kv_self, &model, n_batch);
    //init_kv_cache_lora(&kv_self, &model_lora);

    size_t    compute_size = 1024ll*1024ll*1024ll;
    uint8_t * compute_addr = new uint8_t[compute_size];

    int n_examples = 256;
    int n_tokens   = model.hparams.n_ctx;
    int n_vocab    = model.hparams.n_vocab;
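
    // training loop: build the forward graph for a batch, measure the loss,
    // optimize the model parameters with LBFGS, and report the loss before/after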
    for (int ex=0; ex<n_examples; ++ex) {
        struct ggml_init_params params = {
            /*.mem_size   =*/ compute_size,
            /*.mem_buffer =*/ compute_addr,
            /*.no_alloc   =*/ false,
        };

        struct ggml_context * ctx0 = ggml_init(params);

        struct ggml_tensor * after_opt_best_samples = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, n_tokens, n_batch);
        struct ggml_tensor * after_opt_probs        = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_vocab, n_tokens, n_batch);
        struct ggml_tensor * tokens_input           = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, n_tokens, n_batch);
        struct ggml_tensor * targets                = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_vocab, n_tokens, n_batch);

        int n_past = 0;

        ggml_cgraph gf = {};
        gf.n_threads = 1;

        get_example_targets_batch(ctx0, 64*ex+0, tokens_input, targets);

        struct ggml_tensor * logits = forward_batch(&model, &kv_self, ctx0, &gf, tokens_input, n_tokens, n_past, n_batch);
        // struct ggml_tensor * e = cross_entropy_loss(ctx0, targets, logits);
        struct ggml_tensor * e = square_error_loss(ctx0, targets, logits);

        ggml_build_forward_expand(&gf, e);
        ggml_graph_compute(ctx0, &gf);

        float error_before_opt = ggml_get_f32_1d(e, 0);

        struct ggml_opt_params opt_params_adam  = ggml_opt_default_params(GGML_OPT_ADAM);
        struct ggml_opt_params opt_params_lbfgs = ggml_opt_default_params(GGML_OPT_LBFGS);
        opt_params_adam.print_forward_graph   = false;
        opt_params_adam.print_backward_graph  = false;
        opt_params_lbfgs.print_forward_graph  = false;
        opt_params_lbfgs.print_backward_graph = false;
        opt_params_adam.adam.n_iter   = 16;
        opt_params_lbfgs.lbfgs.n_iter = 16;
        // ggml_opt(ctx0, opt_params_adam, e);
        ggml_opt(ctx0, opt_params_lbfgs, e);

        ggml_build_forward_expand(&gf, e);
        ggml_graph_compute(ctx0, &gf);

        float error_after_opt = ggml_get_f32_1d(e, 0);

        if (ex % 8 == 0) {
            printf("Example %d\n", (ex+1));
            printf("error_before_opt: %.2f\n", error_before_opt);
            printf("error_after_opt:  %.2f\n", error_after_opt);
        }

        if (ex % 64 == 0) {
            sample_softmax_batch(ctx0, logits, after_opt_probs, after_opt_best_samples);
            // printf("probabilities after optimization:\n");
            // print_matrix(after_opt_probs);
            printf("best samples after optimization:\n");
            print_tokens(after_opt_best_samples, n_vocab);
        }

        ggml_free(ctx0);
    }
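
    // generation: greedily sample n_gen tokens, shifting the context window left by one
    // token each step and appending the newly sampled token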
    {
        int n_gen = 128;
        int sample_ctx = n_tokens-n_tokens/8;

        printf("Generating %d tokens.\n", n_gen);

        struct ggml_tensor * tokens_input = ggml_new_tensor_1d(model.ctx, GGML_TYPE_I32, n_tokens);
        struct ggml_tensor * targets      = ggml_new_tensor_2d(model.ctx, GGML_TYPE_F32, n_vocab, n_tokens);

        get_example_targets(137, tokens_input, targets);
        for (int i=sample_ctx; i<n_tokens; ++i) {
            ggml_set_i32_1d(tokens_input, i, n_vocab/2);
        }

        for (int i=0; i<sample_ctx-1; ++i) {
            print_token(ggml_get_i32_1d(tokens_input, i), n_vocab);
        }
        printf("---\n");

        for (int i=0; i<n_gen; ++i) {
            struct ggml_init_params params = {
                /*.mem_size   =*/ compute_size,
                /*.mem_buffer =*/ compute_addr,
                /*.no_alloc   =*/ false,
            };
            struct ggml_context * ctx0 = ggml_init(params);

            ggml_cgraph gf = {};
            gf.n_threads = 1;

            int n_past = 0;
            struct ggml_tensor * logits = forward(&model, &kv_self, ctx0, &gf, tokens_input, sample_ctx, n_past);

            ggml_build_forward_expand(&gf, logits);
            ggml_graph_compute(ctx0, &gf);

            struct ggml_tensor * best_samples = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, sample_ctx);
            struct ggml_tensor * probs        = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_vocab, sample_ctx);

            sample_softmax(logits, probs, best_samples);

            // int sample_at = n_tokens-1;
            int token = ggml_get_i32_1d(best_samples, sample_ctx-1);

            // print_row(probs, sample_at);
            print_token(token, n_vocab);

            lshift_examples(tokens_input, targets, 1);
            ggml_set_i32_1d(tokens_input, 0, 0);
            ggml_set_i32_1d(tokens_input, sample_ctx-1, token);

            ggml_free(ctx0);
        }
    }
    print_matrix(model.tok_embeddings);

    printf("done\n");
    // ggml_free(kv_self.ctx);
    // ggml_free(model_lora.ctx);
    ggml_free(model.ctx);
    return 0;
}