// convert-llama2c-to-ggml.cpp

#include "ggml.h"
#include "llama.h"
#include <unordered_map>
#include <vector>
#include <cassert>
#include <cerrno>
#include <climits>
#include <cmath>
#include <cstdio>
#include <cstring>
#include <cstdarg>
#include <ctime>
#include <random>
#include <stdexcept>
#include <algorithm>
#include <string>

#if defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data
#endif
//////////////////////////////////////// llama2.c model structs and functions to load models, alloc memory etc.

typedef struct {
    int dim;        // transformer dimension
    int hidden_dim; // for ffn layers
    int n_layers;   // number of layers
    int n_heads;    // number of query heads
    int n_kv_heads; // number of key/value heads (can be < query heads because of multiquery)
    int vocab_size; // vocabulary size, usually 256 (byte-level)
    int seq_len;    // max sequence length
} Config;
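
// NOTE: main() freads this struct verbatim as the checkpoint header, so the
// field order and the int widths above must match what llama2.c writes out.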
typedef struct {
    // token embedding table
    float* token_embedding_table; // (vocab_size, dim)
    // weights for rmsnorms
    float* rms_att_weight; // (layer, dim) rmsnorm weights
    float* rms_ffn_weight; // (layer, dim)
    // weights for matmuls
    float* wq; // (layer, dim, dim)
    float* wk; // (layer, dim, dim)
    float* wv; // (layer, dim, dim)
    float* wo; // (layer, dim, dim)
    // weights for ffn
    float* w1; // (layer, hidden_dim, dim)
    float* w2; // (layer, dim, hidden_dim)
    float* w3; // (layer, hidden_dim, dim)
    // final rmsnorm
    float* rms_final_weight; // (dim,)
    // freq_cis for RoPE (rotary position embeddings)
    // float* freq_cis_real; // (seq_len, dim/2)
    // float* freq_cis_imag; // (seq_len, dim/2)
    // (optional) classifier weights for the logits, on the last layer
    //float* wcls;
} TransformerWeights;
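
// In the llama2.c checkpoint the weights follow the Config header as flat,
// contiguous float32 arrays, in the order read by checkpoint_init_weights()
// below (note: rms_ffn_weight is stored after wo, not next to rms_att_weight
// as in this struct).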
void malloc_weights(TransformerWeights* w, Config* p) {
    // we calloc instead of malloc to keep valgrind happy
    w->token_embedding_table = new float[p->vocab_size * p->dim]();
    printf("[%s:AK] Allocating [%d] x [%d] = [%d] float space for w->token_embedding_table\n",__func__, p->vocab_size, p->dim, p->vocab_size * p->dim);

    w->rms_att_weight = new float[p->n_layers * p->dim]();
    printf("[%s:AK] Allocating [%d] x [%d] = [%d] float space for w->rms_att_weight\n",__func__, p->n_layers, p->dim, p->n_layers * p->dim);

    w->rms_ffn_weight = new float[p->n_layers * p->dim]();
    printf("[%s:AK] Allocating [%d] x [%d] = [%d] float space for w->rms_ffn_weight\n",__func__, p->n_layers, p->dim, p->n_layers * p->dim);

    w->wq = new float[p->n_layers * p->dim * p->dim]();
    printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->wq\n",__func__, p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim);

    w->wk = new float[p->n_layers * p->dim * p->dim]();
    printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->wk\n",__func__, p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim);

    w->wv = new float[p->n_layers * p->dim * p->dim]();
    printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->wv\n",__func__, p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim);

    w->wo = new float[p->n_layers * p->dim * p->dim]();
    printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->wo\n",__func__, p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim);

    w->w1 = new float[p->n_layers * p->hidden_dim * p->dim]();
    printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->w1\n",__func__, p->n_layers, p->hidden_dim, p->dim, p->n_layers * p->hidden_dim * p->dim);

    w->w2 = new float[p->n_layers * p->hidden_dim * p->dim]();
    printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->w2\n",__func__, p->n_layers, p->dim, p->hidden_dim, p->n_layers * p->hidden_dim * p->dim);

    w->w3 = new float[p->n_layers * p->hidden_dim * p->dim]();
    printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->w3\n",__func__, p->n_layers, p->hidden_dim, p->dim, p->n_layers * p->hidden_dim * p->dim);

    w->rms_final_weight = new float[p->dim]();
    printf("[%s:AK] Allocating [%d] float space for w->rms_final_weight\n",__func__, p->dim);
}
int checkpoint_init_weights(TransformerWeights *w, Config* p, FILE* f) {
    if (fread(w->token_embedding_table, sizeof(float), p->vocab_size * p->dim, f) != static_cast<size_t>(p->vocab_size * p->dim)) return 1;
    if (fread(w->rms_att_weight, sizeof(float), p->n_layers * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim)) return 1;
    if (fread(w->wq, sizeof(float), p->n_layers * p->dim * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->dim)) return 1;
    if (fread(w->wk, sizeof(float), p->n_layers * p->dim * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->dim)) return 1;
    if (fread(w->wv, sizeof(float), p->n_layers * p->dim * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->dim)) return 1;
    if (fread(w->wo, sizeof(float), p->n_layers * p->dim * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->dim)) return 1;
    if (fread(w->rms_ffn_weight, sizeof(float), p->n_layers * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim)) return 1;
    if (fread(w->w1, sizeof(float), p->n_layers * p->dim * p->hidden_dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->hidden_dim)) return 1;
    if (fread(w->w2, sizeof(float), p->n_layers * p->hidden_dim * p->dim, f) != static_cast<size_t>(p->n_layers * p->hidden_dim * p->dim)) return 1;
    if (fread(w->w3, sizeof(float), p->n_layers * p->dim * p->hidden_dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->hidden_dim)) return 1;
    if (fread(w->rms_final_weight, sizeof(float), p->dim, f) != static_cast<size_t>(p->dim)) return 1;
    return 0;
}
void free_weights(TransformerWeights* w) {
    // these were allocated with new[], so they must be released with delete[]
    delete[] w->token_embedding_table;
    delete[] w->rms_att_weight;
    delete[] w->rms_ffn_weight;
    delete[] w->wq;
    delete[] w->wk;
    delete[] w->wv;
    delete[] w->wo;
    delete[] w->w1;
    delete[] w->w2;
    delete[] w->w3;
    delete[] w->rms_final_weight;
}
void print_sample_weights(TransformerWeights *w) {
    printf("----- Quick print of the first of the weight values of all the variables\n");
    printf("%f\n", w->token_embedding_table[0]);
    printf("%f\n", w->rms_att_weight[0]);
    printf("%f\n", w->rms_ffn_weight[0]);
    printf("%f\n", w->wq[0]);
    printf("%f\n", w->wk[0]);
    printf("%f\n", w->wv[0]);
    printf("%f\n", w->wo[0]);
    printf("%f\n", w->w1[0]);
    printf("%f\n", w->w2[0]);
    printf("%f\n", w->w3[0]);
    printf("%f\n", w->rms_final_weight[0]);
}
////////////////////////////////////////////////////////////////////////////////////////////////////////////

//////////////////////////////////////// ggml structs and functions required to load models, configs and save the model.

struct llama_vocab {
    using id    = int32_t;
    using token = std::string;

    struct token_score {
        token tok;
        float score;
    };

    std::unordered_map<token, id> token_to_id;
    std::vector<token_score> id_to_token;
};
struct my_llama_hparams {
    uint32_t n_vocab = 32000;
    uint32_t n_ctx   = 512;   // this is provided as user input?
    uint32_t n_embd  = 4096;
    uint32_t n_mult  = 4;
    uint32_t n_head  = 32;
    uint32_t n_layer = 32;
    uint32_t n_rot   = 64;

    bool operator!=(const my_llama_hparams& other) const {
        return memcmp(this, &other, sizeof(my_llama_hparams)) != 0;
    }
};
struct my_llama_layer {
    // normalization
    struct ggml_tensor * attention_norm;

    // attention
    struct ggml_tensor * wq;
    struct ggml_tensor * wk;
    struct ggml_tensor * wv;
    struct ggml_tensor * wo;

    // normalization
    struct ggml_tensor * ffn_norm;

    // ff
    struct ggml_tensor * w1;
    struct ggml_tensor * w2;
    struct ggml_tensor * w3;
};

struct my_llama_model {
    struct ggml_context * ctx = NULL;

    my_llama_hparams hparams;

    struct ggml_tensor * tok_embeddings;
    struct ggml_tensor * norm;
    struct ggml_tensor * output;

    std::vector<my_llama_layer> layers;

    uint32_t train_its = 0;
    uint32_t train_samples = 0;
    uint32_t train_tokens = 0;
};
struct train_params {
    const char * fn_vocab_model;
    const char * fn_llama2c_model;
    const char * fn_llama2c_output_model;
    const char * fn_train_data;
    const char * fn_checkpoint_in;
    const char * fn_checkpoint_out;
    const char * fn_model_out;

    uint32_t seed;

    int n_ctx;
    int n_embd;
    int n_mult;
    int n_head;
    int n_layer;
    int n_rotmax;

    int n_threads;
    int n_batch;
    int n_examples;
    int n_predict;

    int print_info_interval;
    int print_details_interval;

    bool samples_start_after_nl;
    bool use_adam;
    bool use_flash;
    bool use_scratch;

    // only adam
    int   warmup;
    int   cos_decay_steps;
    float cos_decay_restart;
    float cos_decay_alpha;

    int   lbfgs_n_iter;
    int   adam_n_iter;
    float adam_alpha;
    float adam_decay;

    int mem_model_gb;
    int mem_compute_gb;
    int mem_compute0_gb;
    int mem_compute1_gb;
};
uint32_t get_n_ff(const struct my_llama_hparams* hparams) {
    const uint32_t n_ff = ((2*(4*hparams->n_embd)/3 + hparams->n_mult - 1)/hparams->n_mult)*hparams->n_mult;
    return n_ff;
}
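
// get_n_ff rounds the FFN hidden size 2/3 * (4 * n_embd) up to the next
// multiple of n_mult. Worked example with LLaMA-7B-style hparams
// (n_embd = 4096, n_mult = 256): 2*(4*4096)/3 = 10922 in integer math,
// and (10922 + 255)/256 * 256 = 11008, which is LLaMA-7B's FFN size.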
void print_params(struct my_llama_hparams * params) {
    printf("%s: n_vocab: %d\n", __func__, params->n_vocab);
    printf("%s: n_ctx:   %d\n", __func__, params->n_ctx);
    printf("%s: n_embd:  %d\n", __func__, params->n_embd);
    printf("%s: n_mult:  %d\n", __func__, params->n_mult);
    printf("%s: n_head:  %d\n", __func__, params->n_head);
    printf("%s: n_ff:    %d\n", __func__, get_n_ff(params));
    printf("%s: n_layer: %d\n", __func__, params->n_layer);
    printf("%s: n_rot:   %d\n", __func__, params->n_rot);
}
void init_model(struct my_llama_model * model) {
    const auto & hparams = model->hparams;

    const uint32_t n_embd  = hparams.n_embd;
    const uint32_t n_layer = hparams.n_layer;
    const uint32_t n_vocab = hparams.n_vocab;

    const uint32_t n_ff = get_n_ff(&hparams);
    struct ggml_context * ctx = model->ctx;

    model->train_its = 0;
    model->train_samples = 0;
    model->train_tokens = 0;

    model->tok_embeddings = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab);
    printf("[%s:GG] Allocating [%d] x [%d] = [%d] float space for model->tok_embeddings\n",__func__, n_embd, n_vocab, n_embd * n_vocab);

    model->norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
    printf("[%s:GG] Allocating [%d] float space for model->norm\n",__func__, n_embd);

    model->output = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab);
    printf("[%s:GG] Allocating [%d] x [%d] = [%d] float space for model->output\n",__func__, n_embd, n_vocab, n_embd * n_vocab);

    // printing the per-layer allocations here so we don't print in the for loop.
    printf("[%s:GG] Allocating [%d] x [%d] = [%d] float space for layer.wq for [%d] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer);
    printf("[%s:GG] Allocating [%d] x [%d] = [%d] float space for layer.wk for [%d] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer);
    printf("[%s:GG] Allocating [%d] x [%d] = [%d] float space for layer.wv for [%d] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer);
    printf("[%s:GG] Allocating [%d] x [%d] = [%d] float space for layer.wo for [%d] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer);
    printf("[%s:GG] Allocating [%d] float space for layer.ffn_norm for [%d] layers\n",__func__, n_embd, n_layer);
    printf("[%s:GG] Allocating [%d] x [%d] = [%d] float space for layer.w1 for [%d] layers\n",__func__, n_ff, n_embd, n_embd * n_ff, n_layer);
    printf("[%s:GG] Allocating [%d] x [%d] = [%d] float space for layer.w2 for [%d] layers\n",__func__, n_embd, n_ff, n_ff * n_embd, n_layer);
    printf("[%s:GG] Allocating [%d] x [%d] = [%d] float space for layer.w3 for [%d] layers\n",__func__, n_ff, n_embd, n_embd * n_ff, n_layer);

    ggml_set_name(model->tok_embeddings, "tok_embeddings.weight");
    ggml_set_name(model->norm,           "norm.weight");
    ggml_set_name(model->output,         "output.weight");

    model->layers.resize(n_layer);
    for (uint32_t i = 0; i < n_layer; ++i) {
        auto & layer = model->layers[i];

        std::string layers_i = "layers." + std::to_string(i);

        layer.attention_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);

        layer.wq = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd);
        layer.wk = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd);
        layer.wv = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd);
        layer.wo = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd);

        layer.ffn_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);

        layer.w1 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ff);
        layer.w2 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_ff, n_embd);
        layer.w3 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ff);

        ggml_set_name(layer.attention_norm, (layers_i + ".attention_norm.weight").c_str());

        ggml_set_name(layer.wq, (layers_i + ".attention.wq.weight").c_str());
        ggml_set_name(layer.wk, (layers_i + ".attention.wk.weight").c_str());
        ggml_set_name(layer.wv, (layers_i + ".attention.wv.weight").c_str());
        ggml_set_name(layer.wo, (layers_i + ".attention.wo.weight").c_str());

        ggml_set_name(layer.ffn_norm, (layers_i + ".ffn_norm.weight").c_str());

        ggml_format_name(layer.w1, "%s.feed_forward.w1.weight", layers_i.c_str());
        ggml_format_name(layer.w2, "%s.feed_forward.w2.weight", layers_i.c_str());
        ggml_format_name(layer.w3, "%s.feed_forward.w3.weight", layers_i.c_str());
    }
}
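
// ggml tensor accessors: ne[] holds the number of elements per dimension and
// nb[] the stride in bytes per dimension. For a contiguous F32 2-D tensor,
// nb[0] == sizeof(float) and nb[1] == sizeof(float)*ne[0], so these helpers
// index element (i0, i1) via byte offsets without assuming contiguity.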
float get_f32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1) {
    float * ptr = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]);
    return *ptr;
}

int32_t get_i32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1) {
    int32_t * ptr = (int32_t *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]);
    return *ptr;
}

void print_row(struct ggml_tensor * probs, int i) {
    for (int k = 0; k < probs->ne[0]; ++k) {
        float p = get_f32_2d(probs, k, i);
        printf(" %f", p);
    }
    printf("\n");
}

void print_matrix(struct ggml_tensor * probs) {
    assert(probs->n_dims == 2);
    for (int i = 0; i < probs->ne[1]; ++i) {
        for (int k = 0; k < probs->ne[0]; ++k) {
            float p = get_f32_2d(probs, k, i);
            printf(" %.2f", p);
        }
        printf("\n");
    }
}
#ifdef __GNUC__
#ifdef __MINGW32__
__attribute__((format(gnu_printf, 1, 2)))
#else
__attribute__((format(printf, 1, 2)))
#endif
#endif
static std::string format(const char * fmt, ...) {
    va_list ap, ap2;
    va_start(ap, fmt);
    va_copy(ap2, ap);
    int size = vsnprintf(NULL, 0, fmt, ap);
    GGML_ASSERT(size >= 0 && size < INT_MAX);
    std::vector<char> buf(size + 1);
    int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2);
    GGML_ASSERT(size2 == size);
    va_end(ap2);
    va_end(ap);
    return std::string(buf.data(), size);
}
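
// format() uses the standard two-pass vsnprintf pattern: the first call with a
// NULL buffer only computes the required length, and the second call (on the
// va_copy, since the first call consumed ap) writes into a buffer of exactly
// that size.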
struct llama_file {
    // use FILE * so we don't have to re-open the file to mmap
    FILE * fp;
    size_t size;

    llama_file(const char * fname, const char * mode) {
        fp = std::fopen(fname, mode);
        if (fp == NULL) {
            size = 0;
        } else {
            seek(0, SEEK_END);
            size = tell();
            seek(0, SEEK_SET);
        }
    }

    size_t tell() const {
#ifdef _WIN32
        __int64 ret = _ftelli64(fp);
#else
        long ret = std::ftell(fp);
#endif
        GGML_ASSERT(ret != -1); // this really shouldn't fail
        return (size_t) ret;
    }

    void seek(size_t offset, int whence) {
#ifdef _WIN32
        int ret = _fseeki64(fp, (__int64) offset, whence);
#else
        int ret = std::fseek(fp, (long) offset, whence);
#endif
        GGML_ASSERT(ret == 0); // same
    }

    void read_raw(void * ptr, size_t size) {
        if (size == 0) {
            return;
        }
        errno = 0;
        std::size_t ret = std::fread(ptr, size, 1, fp);
        if (ferror(fp)) {
            throw std::runtime_error(format("read error: %s", strerror(errno)));
        }
        if (ret != 1) {
            throw std::runtime_error(std::string("unexpectedly reached end of file"));
        }
    }

    std::uint32_t read_u32() {
        std::uint32_t ret;
        read_raw(&ret, sizeof(ret));
        return ret;
    }

    std::float_t read_f32() {
        std::float_t ret;
        read_raw(&ret, sizeof(ret));
        return ret;
    }

    std::string read_string(std::uint32_t len) {
        std::vector<char> chars(len);
        read_raw(chars.data(), len);
        return std::string(chars.data(), len);
    }

    void write_raw(const void * ptr, size_t size) {
        if (size == 0) {
            return;
        }
        errno = 0;
        size_t ret = std::fwrite(ptr, size, 1, fp);
        if (ret != 1) {
            throw std::runtime_error(format("write error: %s", strerror(errno)));
        }
    }

    void write_u32(std::uint32_t val) {
        write_raw(&val, sizeof(val));
    }

    ~llama_file() {
        if (fp) {
            std::fclose(fp);
        }
    }
};
void write_tensor(struct llama_file * file, struct ggml_tensor * tensor) {
    if (tensor == NULL) {
        // write a null tensor marker: zero dims, zero name length, F32 type
        file->write_u32(0);
        file->write_u32(0);
        file->write_u32(GGML_TYPE_F32);
        file->seek((0-file->tell()) & 31, SEEK_CUR);
        return;
    }

    const char * name = ggml_get_name(tensor);
    uint32_t name_len = strlen(name);
    uint32_t nd = tensor->n_dims;
    uint32_t ne[4] = { (uint32_t)tensor->ne[0],
                       (uint32_t)tensor->ne[1],
                       (uint32_t)tensor->ne[2],
                       (uint32_t)tensor->ne[3] };
    file->write_u32(nd);
    file->write_u32(name_len);
    file->write_u32(tensor->type);
    file->write_raw(ne, sizeof(ne[0]) * nd);
    file->write_raw(name, name_len);
    file->seek((0-file->tell()) & 31, SEEK_CUR);
    file->write_raw(tensor->data, ggml_nbytes(tensor));
}
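
// The seek((0 - file->tell()) & 31, SEEK_CUR) above pads the file to the next
// 32-byte boundary before the tensor data, the alignment the legacy ggml
// loader relies on to read tensors in place. Example: at offset 100,
// (0 - 100) & 31 == 28, and 100 + 28 == 128, a multiple of 32.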
bool is_ggml_file(const char * filename) {
    llama_file file(filename, "rb");
    if (file.size < 4) {
        return false;
    }
    uint32_t magic = file.read_u32();
    return magic == LLAMA_FILE_MAGIC;
}
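
// load_vocab() accepts two on-disk formats: a ggml model file (detected by its
// magic via is_ggml_file(), from which only the vocab is loaded), or a
// llama2.c tokenizer file, which — as read below — is a u32 max_token_length
// followed by one (f32 score, u32 length, raw bytes) record per token.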
void load_vocab(const char *filename, Config *config, struct llama_vocab *vocab) {
    // heuristic to infer whether vocab is from ggml or from llama2.c vocabulary
    if (is_ggml_file(filename)) {
        struct llama_context_params llama_params = llama_context_default_params();
        llama_params.vocab_only = true;

        struct llama_model * lmodel = llama_load_model_from_file(filename, llama_params);
        struct llama_context * lctx = llama_new_context_with_model(lmodel, llama_params);

        std::vector<const char *> strings;
        std::vector<float> scores;
        int n_vocab = llama_n_vocab(lctx);
        strings.resize(n_vocab, NULL);
        scores.resize(n_vocab, 0);
        n_vocab = llama_get_vocab(lctx, strings.data(), scores.data(), n_vocab);
        GGML_ASSERT(n_vocab == llama_n_vocab(lctx));
        vocab->id_to_token.resize(n_vocab);
        for (int i=0; i<n_vocab; ++i) {
            std::string tok   = std::string(strings[i]);
            float       score = scores[i];
            vocab->id_to_token[i].tok   = tok;
            vocab->id_to_token[i].score = score;
            vocab->token_to_id.emplace(tok, i);
        }
        llama_free(lctx);
        llama_free_model(lmodel);
    } else { // assume llama2.c vocabulary
        printf("Assuming llama2.c vocabulary since %s is not a ggml file\n", filename);
        llama_file file(filename, "rb");
        uint32_t n_vocab = config->vocab_size;
        /* uint32_t max_token_length = */ file.read_u32(); // unused
        vocab->id_to_token.resize(n_vocab);
        for (uint32_t i=0; i<n_vocab; ++i) {
            float_t  score = file.read_f32();
            uint32_t len   = file.read_u32();
            std::string tok = file.read_string(len);
            vocab->id_to_token[i].tok   = tok;
            vocab->id_to_token[i].score = score;
            vocab->token_to_id.emplace(tok, i);
        }
    }
}
void stuff_karpathy_weights_into_gg(struct ggml_tensor * gg_weights, float * karpathy_weights) {
    int ct;
    switch (gg_weights->n_dims) {
        case 1:
            ct = 0;
            for (int i0 = 0; i0 < gg_weights->ne[0]; i0++) {
                float * ptr = (float *) ((char *) gg_weights->data + i0*gg_weights->nb[0]);
                *ptr = karpathy_weights[ct];
                ct++;
            }
            break;
        case 2:
            ct = 0;
            for (int i1 = 0; i1 < gg_weights->ne[1]; i1++) {
                for (int i0 = 0; i0 < gg_weights->ne[0]; i0++) {
                    float * ptr = (float *) ((char *) gg_weights->data + i0*gg_weights->nb[0] + i1*gg_weights->nb[1]);
                    *ptr = karpathy_weights[ct];
                    ct++;
                }
            }
            break;
        case 3:
            ct = 0;
            for (int i2 = 0; i2 < gg_weights->ne[2]; i2++) {
                for (int i1 = 0; i1 < gg_weights->ne[1]; i1++) {
                    for (int i0 = 0; i0 < gg_weights->ne[0]; i0++) {
                        float * ptr = (float *) ((char *) gg_weights->data + i0*gg_weights->nb[0] + i1*gg_weights->nb[1] + i2*gg_weights->nb[2]);
                        *ptr = karpathy_weights[ct];
                        ct++;
                    }
                }
            }
            break;
    }
}
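
// This is a straight element-order copy: ct walks the karpathy array linearly
// while the loops walk the ggml tensor with ne[0] as the fastest-varying
// dimension. The result is correct because both sides hold the same row-major
// float32 layout; only the byte strides (nb[]) differ in how they are expressed.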
void save_as_llama_model(struct llama_vocab * vocab, struct my_llama_model * model, TransformerWeights* w, const char * filename) {
    struct llama_file file(filename, "wb");
    if (file.fp == NULL) {
        return;
    }

    // write_magic
    file.write_u32(LLAMA_FILE_MAGIC);   // magic
    file.write_u32(LLAMA_FILE_VERSION); // version

    // write_hparams
    file.write_u32(model->hparams.n_vocab);
    file.write_u32(model->hparams.n_embd);
    file.write_u32(model->hparams.n_mult);
    file.write_u32(model->hparams.n_head);
    file.write_u32(model->hparams.n_layer);
    file.write_u32(model->hparams.n_rot);
    file.write_u32(LLAMA_FTYPE_ALL_F32);

    // write_vocab - for now we are just writing the existing BPE vocab, assuming karpathy's vocabulary is compatible
    uint32_t n_vocab = model->hparams.n_vocab;
    for (uint32_t i = 0; i < n_vocab; i++) {
        const auto & token_score = vocab->id_to_token.at(i);
        file.write_u32((uint32_t) token_score.tok.size());
        file.write_raw(token_score.tok.data(), token_score.tok.size());
        file.write_raw(&token_score.score, sizeof(token_score.score));
    }

    // stuff AK weights into GG weights one by one.
    // w->token_embedding_table -> model->tok_embeddings
    // float* -> struct ggml_tensor
    stuff_karpathy_weights_into_gg(model->tok_embeddings, w->token_embedding_table);
    stuff_karpathy_weights_into_gg(model->output, w->token_embedding_table);

    stuff_karpathy_weights_into_gg(model->norm, w->rms_final_weight);
    //print_row(model->norm, 0);

    // for rms-att-weight
    int row_length = model->hparams.n_embd;
    const auto & hparams = model->hparams;
    //int n_ff = model->hparams.n_embd;
    int n_ff = get_n_ff(&hparams);

    for (uint32_t i = 0; i < model->hparams.n_layer; ++i) {
        auto & layer = model->layers[i];
        // 1d
        stuff_karpathy_weights_into_gg(layer.attention_norm, &w->rms_att_weight[i*row_length]);
        stuff_karpathy_weights_into_gg(layer.ffn_norm,       &w->rms_ffn_weight[i*row_length]);

        // from 3d matrix layer x dim x dim to 2d matrix dim x dim
        stuff_karpathy_weights_into_gg(layer.wq, &w->wq[i*row_length*row_length]);
        stuff_karpathy_weights_into_gg(layer.wk, &w->wk[i*row_length*row_length]);
        stuff_karpathy_weights_into_gg(layer.wv, &w->wv[i*row_length*row_length]);
        stuff_karpathy_weights_into_gg(layer.wo, &w->wo[i*row_length*row_length]);

        stuff_karpathy_weights_into_gg(layer.w1, &w->w1[i*row_length*n_ff]);
        stuff_karpathy_weights_into_gg(layer.w2, &w->w2[i*n_ff*row_length]);
        stuff_karpathy_weights_into_gg(layer.w3, &w->w3[i*row_length*n_ff]);
    }

    // write tensors
    write_tensor(&file, model->tok_embeddings);
    write_tensor(&file, model->norm);
    write_tensor(&file, model->output); // ?
    for (uint32_t i = 0; i < model->hparams.n_layer; ++i) {
        auto & layer = model->layers[i];

        write_tensor(&file, layer.attention_norm);
        write_tensor(&file, layer.wq);
        write_tensor(&file, layer.wk);
        write_tensor(&file, layer.wv);
        write_tensor(&file, layer.wo);
        write_tensor(&file, layer.ffn_norm);
        write_tensor(&file, layer.w1);
        write_tensor(&file, layer.w2);
        write_tensor(&file, layer.w3);
    }
}
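
// The resulting file follows llama.cpp's legacy (pre-GGUF) ggml layout as
// written above: magic and version, hparams, the vocab, then each tensor as
// emitted by write_tensor() with its data 32-byte aligned. Note that since
// llama2.c checkpoints carry no separate classifier weights here, the token
// embedding table is reused for model->output (tied weights).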
struct train_params get_default_train_params() {
    struct train_params params;
    params.fn_vocab_model          = "models/ggml-vocab.bin";
    params.fn_llama2c_model        = NULL; // required; must be provided via --llama2c-model
    params.fn_llama2c_output_model = "ak_llama_model.bin";
    params.fn_train_data           = "shakespeare.txt";
    params.fn_checkpoint_in        = "checkpoint.bin";
    params.fn_checkpoint_out       = "checkpoint.bin";
    params.fn_model_out            = "ggml-checkpoint-f32.bin";

    params.seed = -1;

    params.n_ctx    = 128;
    params.n_embd   = 256;
    params.n_mult   = 256;
    params.n_head   = 8;
    params.n_layer  = 16;
    params.n_rotmax = 64;

    params.n_threads  = 6;
    params.n_batch    = 8;
    params.n_examples = 8;
    params.n_predict  = 1024;

    params.print_info_interval    = 1;
    params.print_details_interval = 2;

    params.samples_start_after_nl = false;
    params.use_adam               = true;
    params.use_flash              = true;
    params.use_scratch            = true;

    // only adam
    params.warmup            = 100;
    params.cos_decay_steps   = 1000;
    params.cos_decay_restart = 1.1f;
    params.cos_decay_alpha   = 0.0f;

    params.lbfgs_n_iter = 16;
    params.adam_n_iter  = 16;
    params.adam_alpha   = 1e-3f;
    params.adam_decay   = 1e-3f;

    params.mem_model_gb    = 2;
    params.mem_compute_gb  = 24;
    params.mem_compute0_gb = 8;
    params.mem_compute1_gb = 2;

    return params;
}
void print_usage(int /*argc*/, char ** argv, const struct train_params * params) {
    fprintf(stderr, "usage: %s [options]\n", argv[0]);
    fprintf(stderr, "\n");
    fprintf(stderr, "options:\n");
    fprintf(stderr, "  -h, --help                     show this help message and exit\n");
    fprintf(stderr, "  --copy-vocab-from-model FNAME  llama2.c vocabulary or ggml model path from which to copy vocab (default '%s')\n", params->fn_vocab_model);
    fprintf(stderr, "  --llama2c-model FNAME          [REQUIRED] model path from which to load Karpathy's llama2.c model\n");
    fprintf(stderr, "  --llama2c-output-model FNAME   model path to save the converted llama2.c model (default '%s')\n", params->fn_llama2c_output_model);
    fprintf(stderr, "\n");
}
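
// Example invocation (binary and file names are illustrative):
//   ./convert-llama2c-to-ggml \
//       --copy-vocab-from-model models/ggml-vocab.bin \
//       --llama2c-model stories42M.bin \
//       --llama2c-output-model stories42M.ggml.bin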
bool params_parse(int argc, char ** argv, struct train_params * params) {
    bool invalid_param = false;
    bool reqd_param_found = false;
    std::string arg;
    struct train_params default_params = get_default_train_params();
    const std::string arg_prefix = "--";

    for (int i = 1; i < argc; i++) {
        arg = argv[i];
        if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) {
            std::replace(arg.begin(), arg.end(), '_', '-');
        }

        if (arg == "--copy-vocab-from-model") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            params->fn_vocab_model = argv[i];
        } else if (arg == "--llama2c-model") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            reqd_param_found = true;
            params->fn_llama2c_model = argv[i];
        } else if (arg == "--llama2c-output-model") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            params->fn_llama2c_output_model = argv[i];
        } else if (arg == "-h" || arg == "--help") {
            print_usage(argc, argv, &default_params);
            exit(0);
        } else {
            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
            print_usage(argc, argv, &default_params);
            exit(1);
        }
    }
    if (invalid_param) {
        fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
        print_usage(argc, argv, &default_params);
        exit(1);
    }
    if (!reqd_param_found) {
        fprintf(stderr, "error: please specify a llama2.c .bin file to be converted with argument --llama2c-model\n");
        print_usage(argc, argv, &default_params);
        exit(1);
    }

    return true;
}
int main(int argc, char ** argv) {
    struct train_params params = get_default_train_params();

    if (!params_parse(argc, argv, &params)) {
        return 1;
    }

    Config config;
    TransformerWeights weights;
    {
        FILE * file = fopen(params.fn_llama2c_model, "rb");
        if (!file) { printf("Unable to open the checkpoint file %s!\n", params.fn_llama2c_model); return 1; }
        // read in the config header
        if (fread(&config, sizeof(Config), 1, file) != 1) { return 1; }
        // read in the Transformer weights
        malloc_weights(&weights, &config);
        if (checkpoint_init_weights(&weights, &config, file)) { return 1; }
        fclose(file);
    }

    struct llama_vocab vocab;
    load_vocab(params.fn_vocab_model, &config, &vocab);

    struct my_llama_model model;
    model.hparams.n_vocab = config.vocab_size; //llama_n_vocab(lctx);
    model.hparams.n_ctx   = params.n_ctx;
    model.hparams.n_embd  = config.dim; //params.n_embd;
    model.hparams.n_mult  = 32;//params.n_mult;
    model.hparams.n_head  = config.n_heads; //params.n_head;
    model.hparams.n_layer = config.n_layers; //params.n_layer;
    model.hparams.n_rot   = std::min((uint32_t)params.n_rotmax, model.hparams.n_embd / model.hparams.n_head);
    print_params(&model.hparams);

    struct ggml_init_params lcparams;
    lcparams.mem_size   = 1024ll*1024ll*1024ll*((size_t) params.mem_model_gb);
    lcparams.mem_buffer = NULL;
    lcparams.no_alloc   = false;

    model.ctx = ggml_init(lcparams);

    init_model(&model);
    save_as_llama_model(&vocab, &model, &weights, params.fn_llama2c_output_model);
    printf("Saving llama2.c model file %s in ggml format at %s\n", params.fn_llama2c_model, params.fn_llama2c_output_model);

    ggml_free(model.ctx);
    free_weights(&weights);
    return 0;
}