// test-grad0.cpp
#define _CRT_SECURE_NO_DEPRECATE // Disables ridiculous "unsafe" warnings on Windows

#include "ggml.h"

#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <cassert>

#if defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data
#endif

#if defined(__GNUC__)
#pragma GCC diagnostic ignored "-Wdouble-promotion"
#endif

#define MAX_NARGS 3

#undef MIN
#undef MAX
#define MIN(a, b) ((a) < (b) ? (a) : (b))
#define MAX(a, b) ((a) > (b) ? (a) : (b))

#define GGML_SILU_FP16

//
// logging
//
#if (GGML_DEBUG >= 1)
#define GGML_PRINT_DEBUG(...) printf(__VA_ARGS__)
#else
#define GGML_PRINT_DEBUG(...)
#endif

#if (GGML_DEBUG >= 5)
#define GGML_PRINT_DEBUG_5(...) printf(__VA_ARGS__)
#else
#define GGML_PRINT_DEBUG_5(...)
#endif

#if (GGML_DEBUG >= 10)
#define GGML_PRINT_DEBUG_10(...) printf(__VA_ARGS__)
#else
#define GGML_PRINT_DEBUG_10(...)
#endif

#define GGML_PRINT(...) printf(__VA_ARGS__)
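
// random helpers used throughout the tests:
// - frand()           returns a uniform float in [0, 1]
// - irand(n)          returns a uniform int in [0, n)
// - get_random_dims() fills each of the first ndims entries with a value in [1, 4]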
static float frand(void) {
    return (float)rand()/(float)RAND_MAX;
}

static int irand(int n) {
    if (n == 0) return 0;
    return rand()%n;
}

static void get_random_dims(int64_t * dims, int ndims) {
    dims[0] = dims[1] = dims[2] = dims[3] = 1;
    for (int i = 0; i < ndims; i++) {
        dims[i] = 1 + irand(4);
    }
}

static struct ggml_tensor * get_random_tensor_f32(
        struct ggml_context * ctx0,
        int ndims,
        int64_t ne[],
        float fmin,
        float fmax) {
    struct ggml_tensor * result = ggml_new_tensor(ctx0, GGML_TYPE_F32, ndims, ne);

    switch (ndims) {
        case 1:
            for (int i0 = 0; i0 < ne[0]; i0++) {
                ((float *)result->data)[i0] = frand()*(fmax - fmin) + fmin;
            }
            break;
        case 2:
            for (int i1 = 0; i1 < ne[1]; i1++) {
                for (int i0 = 0; i0 < ne[0]; i0++) {
                    ((float *)result->data)[i1*ne[0] + i0] = frand()*(fmax - fmin) + fmin;
                }
            }
            break;
        case 3:
            for (int i2 = 0; i2 < ne[2]; i2++) {
                for (int i1 = 0; i1 < ne[1]; i1++) {
                    for (int i0 = 0; i0 < ne[0]; i0++) {
                        ((float *)result->data)[i2*ne[1]*ne[0] + i1*ne[0] + i0] = frand()*(fmax - fmin) + fmin;
                    }
                }
            }
            break;
        case 4:
            for (int i3 = 0; i3 < ne[3]; i3++) {
                for (int i2 = 0; i2 < ne[2]; i2++) {
                    for (int i1 = 0; i1 < ne[1]; i1++) {
                        for (int i0 = 0; i0 < ne[0]; i0++) {
                            ((float *)result->data)[i3*ne[2]*ne[1]*ne[0] + i2*ne[1]*ne[0] + i1*ne[0] + i0] = frand()*(fmax - fmin) + fmin;
                        }
                    }
                }
            }
            break;
        default:
            assert(false);
    }

    return result;
}
static struct ggml_tensor * get_random_tensor_f16(
        struct ggml_context * ctx0,
        int ndims,
        int64_t ne[],
        float fmin,
        float fmax) {
    struct ggml_tensor * result = ggml_new_tensor(ctx0, GGML_TYPE_F16, ndims, ne);

    switch (ndims) {
        case 1:
            for (int i0 = 0; i0 < ne[0]; i0++) {
                ((ggml_fp16_t *)result->data)[i0] = ggml_fp32_to_fp16(frand()*(fmax - fmin) + fmin);
            }
            break;
        case 2:
            for (int i1 = 0; i1 < ne[1]; i1++) {
                for (int i0 = 0; i0 < ne[0]; i0++) {
                    ((ggml_fp16_t *)result->data)[i1*ne[0] + i0] = ggml_fp32_to_fp16(frand()*(fmax - fmin) + fmin);
                }
            }
            break;
        case 3:
            for (int i2 = 0; i2 < ne[2]; i2++) {
                for (int i1 = 0; i1 < ne[1]; i1++) {
                    for (int i0 = 0; i0 < ne[0]; i0++) {
                        ((ggml_fp16_t *)result->data)[i2*ne[1]*ne[0] + i1*ne[0] + i0] = ggml_fp32_to_fp16(frand()*(fmax - fmin) + fmin);
                    }
                }
            }
            break;
        case 4:
            for (int i3 = 0; i3 < ne[3]; i3++) {
                for (int i2 = 0; i2 < ne[2]; i2++) {
                    for (int i1 = 0; i1 < ne[1]; i1++) {
                        for (int i0 = 0; i0 < ne[0]; i0++) {
                            ((ggml_fp16_t *)result->data)[i3*ne[2]*ne[1]*ne[0] + i2*ne[1]*ne[0] + i1*ne[0] + i0] = ggml_fp32_to_fp16(frand()*(fmax - fmin) + fmin);
                        }
                    }
                }
            }
            break;
        default:
            assert(false);
    }

    return result;
}
static struct ggml_tensor * get_random_tensor_i32(
        struct ggml_context * ctx0,
        int ndims,
        int64_t ne[],
        int32_t imin,
        int32_t imax) {
    struct ggml_tensor * result = ggml_new_tensor(ctx0, GGML_TYPE_I32, ndims, ne);

    switch (ndims) {
        case 1:
            for (int i0 = 0; i0 < ne[0]; i0++) {
                ((int32_t *)result->data)[i0] = irand(imax - imin) + imin;
            }
            break;
        case 2:
            for (int i1 = 0; i1 < ne[1]; i1++) {
                for (int i0 = 0; i0 < ne[0]; i0++) {
                    ((int32_t *)result->data)[i1*ne[0] + i0] = irand(imax - imin) + imin;
                }
            }
            break;
        case 3:
            for (int i2 = 0; i2 < ne[2]; i2++) {
                for (int i1 = 0; i1 < ne[1]; i1++) {
                    for (int i0 = 0; i0 < ne[0]; i0++) {
                        ((int32_t *)result->data)[i2*ne[1]*ne[0] + i1*ne[0] + i0] = irand(imax - imin) + imin;
                    }
                }
            }
            break;
        case 4:
            for (int i3 = 0; i3 < ne[3]; i3++) {
                for (int i2 = 0; i2 < ne[2]; i2++) {
                    for (int i1 = 0; i1 < ne[1]; i1++) {
                        for (int i0 = 0; i0 < ne[0]; i0++) {
                            ((int32_t *)result->data)[i3*ne[2]*ne[1]*ne[0] + i2*ne[1]*ne[0] + i1*ne[0] + i0] = irand(imax - imin) + imin;
                        }
                    }
                }
            }
            break;
        default:
            assert(false);
    }

    return result;
}
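
// check_gradient: numerically verifies the analytic gradients computed by ggml.
// it builds the forward graph for f, expands it into a backward graph, and then, for every
// element of every parameter tensor, compares the backward-pass gradient against a central
// finite difference (f(x+eps) - f(x-eps)) / (2*eps).
// a mismatch larger than max_error_abs (absolute) or max_error_rel (relative) fails the check.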
static bool check_gradient(
        const char * op_name,
        struct ggml_context * ctx0,
        struct ggml_tensor * x[],
        struct ggml_tensor * f,
        int ndims,
        int nargs,
        float eps,
        float max_error_abs,
        float max_error_rel) {
    static int n_threads = -1;
    if (n_threads < 0) {
        n_threads = GGML_DEFAULT_N_THREADS;

        const char *env = getenv("GGML_N_THREADS");
        if (env) {
            n_threads = atoi(env);
        }

        printf("GGML_N_THREADS = %d\n", n_threads);
    }

    struct ggml_cgraph * gf = ggml_build_forward_ctx(ctx0, f);
    struct ggml_cgraph * gb = ggml_new_graph(ctx0);
    *gb = *gf;
    ggml_build_backward_expand(ctx0, gf, gb, false);

    ggml_graph_compute_with_ctx(ctx0, gf, n_threads);

    ggml_graph_reset (gf);
    ggml_set_f32     (f->grad, 1.0f);

    ggml_graph_compute_with_ctx(ctx0, gb, n_threads);

    // ggml_graph_dump_dot(gf, NULL, "test-grad0-forward.dot");
    // ggml_graph_dump_dot(gb, gf,  "test-grad0-backward.dot");

    for (int i = 0; i < nargs; ++i) {
        const int nelements = ggml_nelements(x[i]);
        for (int k = 0; k < nelements; ++k) {
            // compute gradient using finite differences
            const float x0 = ggml_get_f32_1d(x[i], k);
            const float xm = x0 - eps;
            const float xp = x0 + eps;
            ggml_set_f32_1d(x[i], k, xp);

            ggml_graph_compute_with_ctx(ctx0, gf, n_threads);

            const double f0 = ggml_get_f32_1d(f, 0);

            ggml_set_f32_1d(x[i], k, xm);

            ggml_graph_compute_with_ctx(ctx0, gf, n_threads);

            const double f1 = ggml_get_f32_1d(f, 0);
            const double g0 = (f0 - f1)/(2.0*(double) eps);

            ggml_set_f32_1d(x[i], k, x0);

            // compute gradient using backward graph
            ggml_graph_reset (gf);
            ggml_set_f32     (f->grad, 1.0f);

            ggml_graph_compute_with_ctx(ctx0, gb, n_threads);

            const double g1 = ggml_get_f32_1d(x[i]->grad, k);

            const double error_abs = fabs(g0 - g1);
            const double error_rel = g0 != 0 ? fabs(g0 - g1)/fabs(g0) : 0;

            if (error_abs > max_error_abs || error_rel > max_error_rel) {
                printf("%s: ndims=%d, i=%d, k=%d, x0=%f, xm=%f, xp=%f, f0=%f, f1=%f, g0=%f, g1=%f, eps=%f, error_abs=%f, error_rel=%f\n",
                        op_name, ndims, i, k, x0, xm, xp, f0, f1, g0, g1, eps, error_abs, error_rel);
                //assert(false);
                return false;
            }
        }
    }

    return true;
}
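
// check_mat_mul: reference check for ggml_mul_mat results on 2-D tensors.
// element y[i][j] must equal the dot product of row j of x0 with row i of x1
// (both inputs are stored with the shared inner dimension nk innermost),
// within an absolute tolerance of 1e-5.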
// TODO: clean this up ..
static bool check_mat_mul(
        const struct ggml_tensor * y,
        const struct ggml_tensor * x0,
        const struct ggml_tensor * x1) {
    float * dst  = (float *) y->data;
    float * src0 = (float *) x0->data;
    float * src1 = (float *) x1->data;

    const int nc = x0->ne[1];
    const int nr = x1->ne[1];
    const int nk = x0->ne[0];

    GGML_PRINT_DEBUG("check_mat_mul: nc=%d, nr=%d, nk=%d\n", nc, nr, nk);

    GGML_PRINT_DEBUG("x0:\n");
    for (int j = 0; j < x0->ne[1]; ++j) {
        for (int i = 0; i < x0->ne[0]; ++i) {
            GGML_PRINT_DEBUG("%6.3f ", src0[j*nk + i]);
        }
        GGML_PRINT_DEBUG("\n");
    }
    GGML_PRINT_DEBUG("\n");

    GGML_PRINT_DEBUG("x1:\n");
    for (int j = 0; j < x1->ne[1]; ++j) {
        for (int i = 0; i < x1->ne[0]; ++i) {
            GGML_PRINT_DEBUG("%6.3f ", src1[j*nk + i]);
        }
        GGML_PRINT_DEBUG("\n");
    }
    GGML_PRINT_DEBUG("\n");

    GGML_PRINT_DEBUG("y: n_dims = %d, (%lld, %lld)\n", y->n_dims, y->ne[0], y->ne[1]);
    for (int j = 0; j < y->ne[1]; ++j) {
        for (int i = 0; i < y->ne[0]; ++i) {
            GGML_PRINT_DEBUG("%6.3f ", dst[j*nr + i]);
        }
        GGML_PRINT_DEBUG("\n");
    }

    for (int i = 0; i < nr; ++i) {
        for (int j = 0; j < nc; ++j) {
            float sum = 0.0f;
            for (int k = 0; k < nk; ++k) {
                sum += src0[j*nk + k]*src1[i*nk + k];
            }
            if (fabsf(dst[i*nc + j] - sum) > 1e-5f) {
                fprintf(stderr, "check_mat_mul: dst[%d] = %f, sum = %f\n", i*nc + j, dst[i*nc + j], sum);
                assert(false);
                return false;
            }
        }
    }

    return true;
}
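
// main: runs niter test iterations (overridable via GGML_NLOOP or argv[1], default 4).
// each iteration draws a fresh random shape ne[] and runs every op test below;
// all_permutations enumerates the 4! = 24 axis orderings used by the permute test.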
#define NUM_PERMUTATIONS (4*3*2*1)

int main(int argc, const char ** argv) {
    struct ggml_init_params params = {
        /* .mem_size   = */ 256*1024*1024,
        /* .mem_buffer = */ NULL,
        /* .no_alloc   = */ false,
    };

    int64_t ne[4];

    int all_permutations[4 * NUM_PERMUTATIONS];
    {
        int count = 0;
        for (int ax0=0; ax0<4; ++ax0) {
            for (int ax1=0; ax1<4; ++ax1) {
                if (ax1 == ax0) continue;
                for (int ax2=0; ax2<4; ++ax2) {
                    if (ax2 == ax0) continue;
                    if (ax2 == ax1) continue;
                    for (int ax3=0; ax3<4; ++ax3) {
                        if (ax3 == ax0) continue;
                        if (ax3 == ax1) continue;
                        if (ax3 == ax2) continue;
                        assert(count < NUM_PERMUTATIONS);
                        all_permutations[count*4+0] = ax0;
                        all_permutations[count*4+1] = ax1;
                        all_permutations[count*4+2] = ax2;
                        all_permutations[count*4+3] = ax3;
                        ++count;
                    }
                }
            }
        }
    }
    unsigned seed_iter = 1;

    // original loop: 1000
    int niter = 4;
    const char *env = getenv("GGML_NLOOP");
    if (env != NULL) {
        niter = atoi(env);
    }
    if (argc > 1) {
        niter = atoi(argv[1]);
    }

    for (int iter = 0; iter < niter; ++iter) {
        srand(seed_iter);
        seed_iter = rand();
        unsigned seed = rand();

        printf("test-grad0: iter:%d/%d\n", iter, niter);
        struct ggml_context * ctx0 = ggml_init(params);

        get_random_dims(ne, 4);

        struct ggml_tensor * x[MAX_NARGS];

        // add f32
        {
            srand(seed);
            const int nargs = 2;
            for (int ndims = 1; ndims <= 4; ++ndims) {
                for (int i = 0; i < nargs; ++i) {
                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                    ggml_set_param(ctx0, x[i]);
                }
                struct ggml_tensor * f = ggml_sum(ctx0, ggml_add(ctx0, x[0], x[1]));
                check_gradient("add f32", ctx0, x, f, ndims, nargs, 1e-3f, 2e-3f, 2e-3f);
            }
        }
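        // note: the f16 variants below use much looser eps and error bounds than their f32
        // counterparts, since the operands are stored in half precision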
        // add f16
        {
            srand(seed);
            const int nargs = 2;
            for (int ndims = 1; ndims <= 4; ++ndims) {
                for (int i = 0; i < nargs; ++i) {
                    x[i] = get_random_tensor_f16(ctx0, ndims, ne, -1.0f, 1.0f);
                    ggml_set_param(ctx0, x[i]);
                }
                struct ggml_tensor * f = ggml_sum(ctx0, ggml_add(ctx0, x[0], x[1]));
                check_gradient("add f16", ctx0, x, f, ndims, nargs, 1e-1f, 2e-1f, 2e-1f);
            }
        }

        // sub
        {
            srand(seed);
            const int nargs = 2;
            for (int ndims = 1; ndims <= 4; ++ndims) {
                for (int i = 0; i < nargs; ++i) {
                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                    ggml_set_param(ctx0, x[i]);
                }
                struct ggml_tensor * f = ggml_sum(ctx0, ggml_sub(ctx0, x[0], x[1]));
                check_gradient("sub", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f);
            }
        }

        // mul
        {
            srand(seed);
            const int nargs = 2;
            for (int ndims = 1; ndims <= 4; ++ndims) {
                for (int i = 0; i < nargs; ++i) {
                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                    ggml_set_param(ctx0, x[i]);
                }
                struct ggml_tensor * f = ggml_sum(ctx0, ggml_mul(ctx0, x[0], x[1]));
                check_gradient("mul", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
            }
        }
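        // for div, operands are drawn from [0.5, 1.0] so the denominator stays well away from zero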
        // div
        {
            srand(seed);
            const int nargs = 2;
            for (int ndims = 1; ndims <= 4; ++ndims) {
                for (int i = 0; i < nargs; ++i) {
                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, 0.5f, 1.0f);
                    ggml_set_param(ctx0, x[i]);
                }
                struct ggml_tensor * f = ggml_sum(ctx0, ggml_div(ctx0, x[0], x[1]));
                check_gradient("div", ctx0, x, f, ndims, nargs, 1e-3f, 1e-1f, 1e-1f);
            }
        }

        // sqr
        {
            srand(seed);
            const int nargs = 1;
            for (int ndims = 1; ndims <= 2; ++ndims) {
                for (int i = 0; i < nargs; ++i) {
                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                    ggml_set_param(ctx0, x[i]);
                }
                struct ggml_tensor * f = ggml_sum(ctx0, ggml_sqr(ctx0, x[0]));
                check_gradient("sqr", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
            }
        }
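        // for sqrt and log, inputs are restricted to [2e-3, 1.0] because both derivatives diverge as x -> 0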
        // sqrt
        {
            srand(seed);
            const int nargs = 1;
            for (int ndims = 1; ndims <= 2; ++ndims) {
                for (int i = 0; i < nargs; ++i) {
                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, 2.0f*1e-3f, 1.0f);
                    ggml_set_param(ctx0, x[i]);
                }
                struct ggml_tensor * f = ggml_sum(ctx0, ggml_sqrt(ctx0, x[0]));
                check_gradient("sqrt", ctx0, x, f, ndims, nargs, 1e-3f, 2e-2f, 1e-1f);
            }
        }

        // log
        {
            srand(seed);
            const int nargs = 1;
            for (int ndims = 1; ndims <= 2; ++ndims) {
                for (int i = 0; i < nargs; ++i) {
                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, 2.0f*1e-3f, 1.0f);
                    ggml_set_param(ctx0, x[i]);
                }
                struct ggml_tensor * f = ggml_sum(ctx0, ggml_log(ctx0, x[0]));
                check_gradient("log", ctx0, x, f, ndims, nargs, 1e-3f, INFINITY, 1e-1f);
            }
        }

        // sum
        {
            srand(seed);
            const int nargs = 1;
            for (int ndims = 1; ndims <= 2; ++ndims) {
                for (int i = 0; i < nargs; ++i) {
                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                    ggml_set_param(ctx0, x[i]);
                }
                struct ggml_tensor * f = ggml_sum(ctx0, x[0]);
                check_gradient("sum", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f);
            }
        }

        // sum_rows
        {
            srand(seed);
            const int nargs = 1;
            for (int ndims = 1; ndims <= 4; ++ndims) {
                for (int i = 0; i < nargs; ++i) {
                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                    ggml_set_param(ctx0, x[i]);
                }
                struct ggml_tensor * f = ggml_sum(ctx0, ggml_sqr(ctx0, ggml_sum_rows(ctx0, x[0])));
                check_gradient("sum_rows", ctx0, x, f, ndims, nargs, 1e-3f, 1e-2f, INFINITY);
            }
        }

        // mean, not yet fully implemented
        if(0)
        {
            srand(seed);
            const int nargs = 1;
            for (int ndims = 1; ndims <= 4; ++ndims) {
                for (int i = 0; i < nargs; ++i) {
                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                    ggml_set_param(ctx0, x[i]);
                }
                struct ggml_tensor * f = ggml_sum(ctx0, ggml_mean(ctx0, x[0]));
                check_gradient("mean", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f);
            }
        }

        // argmax
        if (0)
        {
            srand(seed);
            const int nargs = 1;
            for (int ndims = 1; ndims <= 4; ++ndims) {
                for (int i = 0; i < nargs; ++i) {
                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                    ggml_set_param(ctx0, x[i]);
                }
                struct ggml_tensor * f = ggml_sum(ctx0, ggml_argmax(ctx0, x[0]));
                check_gradient("argmax", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f);
            }
        }
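        // in the repeat test, f = sum((x1 - repeat(x0, x1))^2) with only x0 marked as a parameter,
        // so the check exercises gradient accumulation back through the broadcast;
        // the repeat back test mirrors the same construction using ggml_repeat_back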
        // repeat
        {
            srand(seed);
            int64_t ne2[4];
            get_random_dims(ne2, 4);
            ne2[0] = ne[0] * ne2[0];
            ne2[1] = ne[1] * ne2[1];
            ne2[2] = 1;
            ne2[3] = 1;
            const int nargs = 1;
            for (int ndims = 1; ndims <= 2; ++ndims) {
                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                x[1] = get_random_tensor_f32(ctx0, ndims, ne2, -1.0f, 1.0f);
                ggml_set_param(ctx0, x[0]);
                struct ggml_tensor * f = ggml_sum(ctx0, ggml_sqr(ctx0, ggml_sub(ctx0, x[1], ggml_repeat(ctx0, x[0], x[1]))));
                check_gradient("repeat", ctx0, x, f, ndims, nargs, 1e-3f, 1e-2f, INFINITY);
            }
        }

        // repeat back
        {
            srand(seed);
            int64_t ne2[4];
            get_random_dims(ne2, 4);
            ne2[0] = ne[0] * ne2[0];
            ne2[1] = ne[1] * ne2[1];
            ne2[2] = 1;
            ne2[3] = 1;
            const int nargs = 1;
            for (int ndims = 1; ndims <= 2; ++ndims) {
                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                x[1] = get_random_tensor_f32(ctx0, ndims, ne2, -1.0f, 1.0f);
                ggml_set_param(ctx0, x[0]);
                struct ggml_tensor * f = ggml_sum(ctx0, ggml_sqr(ctx0, ggml_sub(ctx0, x[0], ggml_repeat_back(ctx0, x[1], x[0]))));
                check_gradient("repeat back", ctx0, x, f, ndims, nargs, 1e-3f, 1e-2f, INFINITY);
            }
        }

        // abs (finite differences do not work)
        //{
        //    const int nargs = 1;
        //    for (int ndims = 1; ndims <= 2; ++ndims) {
        //        for (int i = 0; i < nargs; ++i) {
        //            x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
        //            ggml_set_param(ctx0, x[i]);
        //        }
        //        struct ggml_tensor * f = ggml_sum(ctx0, ggml_abs(ctx0, x[0]));
        //        check_gradient("abs", ctx0, x, f, ndims, nargs, 1e-3f, INFINITY, 1e-3f);
        //    }
        //}

        // sgn
        {
            srand(seed);
            const int nargs = 1;
            for (int ndims = 1; ndims <= 4; ++ndims) {
                for (int i = 0; i < nargs; ++i) {
                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                    ggml_set_param(ctx0, x[i]);
                }
                struct ggml_tensor* f = ggml_sum(ctx0, ggml_sgn(ctx0, x[0]));
                check_gradient("sgn", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f);
            }
        }

        // neg
        {
            srand(seed);
            const int nargs = 1;
            for (int ndims = 1; ndims <= 4; ++ndims) {
                for (int i = 0; i < nargs; ++i) {
                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                    ggml_set_param(ctx0, x[i]);
                }
                struct ggml_tensor* f = ggml_sum(ctx0, ggml_neg(ctx0, x[0]));
                check_gradient("neg", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f);
            }
        }

        // step
        {
            srand(seed);
            const int nargs = 1;
            for (int ndims = 1; ndims <= 4; ++ndims) {
                for (int i = 0; i < nargs; ++i) {
                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                    ggml_set_param(ctx0, x[i]);
                }
                struct ggml_tensor* f = ggml_sum(ctx0, ggml_step(ctx0, x[0]));
                check_gradient("step", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f);
            }
        }

        // tanh, not yet fully implemented
        if(0)
        {
            srand(seed);
            const int nargs = 1;
            for (int ndims = 1; ndims <= 4; ++ndims) {
                for (int i = 0; i < nargs; ++i) {
                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                    ggml_set_param(ctx0, x[i]);
                }
                struct ggml_tensor* f = ggml_sum(ctx0, ggml_tanh(ctx0, x[0]));
                check_gradient("tanh", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f);
            }
        }
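        // in the mul_mat test, x[1] shares only the inner (dot-product) dimension ne[0] with x[0];
        // its row count ne2[1] is redrawn at random for every combination, and the 2-D case is
        // additionally cross-checked against the naive check_mat_mul above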
        // mul_mat
        {
            srand(seed);
            const int nargs = 2;
            for (int ndims = 2; ndims <= 4; ++ndims) {
                int max_nrep = (ndims >= 3) ? 2 : 1;
                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                for (int nrep2 = 1; nrep2 < max_nrep; ++nrep2) {
                    for (int nrep3 = 1; nrep3 < max_nrep; ++nrep3) {
                        {
                            int64_t ne2[4];
                            get_random_dims(ne2, 4);
                            ne2[0] = ne[0];
                            ne2[2] = nrep2 * ne[2];
                            ne2[3] = nrep3 * ne[3];
                            x[1] = get_random_tensor_f32(ctx0, ndims, ne2, -1.0f, 1.0f);
                        }

                        ggml_set_param(ctx0, x[0]);
                        ggml_set_param(ctx0, x[1]);

                        struct ggml_tensor * m = ggml_mul_mat(ctx0, x[1], x[0]);
                        struct ggml_tensor * f = ggml_sum(ctx0, m);

                        GGML_PRINT_DEBUG("testing: mul_mat, [%lld, %lld] (%d) * [%lld, %lld] (%d)\n", x[1]->ne[0], x[1]->ne[1], x[1]->n_dims, x[0]->ne[0], x[0]->ne[1], x[0]->n_dims);

                        check_gradient("mul_mat", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
                        if (ndims == 2) {
                            // check_mat_mul does not support ndims > 2
                            check_mat_mul(m, x[1], x[0]);
                        }
                    }
                }
            }
        }
        // elu, not yet fully implemented
        if(0)
        {
            srand(seed);
            const int nargs = 1;
            for (int ndims = 1; ndims <= 4; ++ndims) {
                for (int i = 0; i < nargs; ++i) {
                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                    ggml_set_param(ctx0, x[i]);
                }
                struct ggml_tensor* f = ggml_sum(ctx0, ggml_elu(ctx0, x[0]));
                check_gradient("elu", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f);
            }
        }

        // relu
        {
            srand(seed);
            const int nargs = 1;
            for (int ndims = 1; ndims <= 4; ++ndims) {
                for (int i = 0; i < nargs; ++i) {
                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                    ggml_set_param(ctx0, x[i]);
                }
                struct ggml_tensor* f = ggml_sum(ctx0, ggml_relu(ctx0, x[0]));
                check_gradient("relu", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
            }
        }

        // gelu, not yet fully implemented
        if(0)
        {
            srand(seed);
            const int nargs = 1;
            for (int ndims = 1; ndims <= 4; ++ndims) {
                for (int i = 0; i < nargs; ++i) {
                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                    ggml_set_param(ctx0, x[i]);
                }
                struct ggml_tensor* f = ggml_sum(ctx0, ggml_gelu(ctx0, x[0]));
                check_gradient("gelu", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f);
            }
        }

        // silu
        {
            srand(seed);
            const int nargs = 1;
            for (int ndims = 1; ndims <= 2; ++ndims) {
                for (int i = 0; i < nargs; ++i) {
                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                    ggml_set_param(ctx0, x[i]);
                }
                struct ggml_tensor * f = ggml_sum(ctx0, ggml_silu(ctx0, x[0]));
#ifdef GGML_SILU_FP16
                // due to GGML_SILU_FP16 the finite difference method will be slightly wrong -> increase error bounds.
                check_gradient("silu", ctx0, x, f, ndims, nargs, 1e-3f, 0.5, INFINITY);
#else
                check_gradient("silu", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
#endif
            }
        }

        // rms_norm
        {
            srand(seed);
            const int nargs = 1;
            for (int ndims = 1; ndims <= 2; ++ndims) {
                for (int i = 0; i < nargs; ++i) {
                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                    ggml_set_param(ctx0, x[i]);
                }
                struct ggml_tensor * f = ggml_sum(ctx0, ggml_rms_norm(ctx0, x[0], 1e-6f));
                check_gradient("rms_norm", ctx0, x, f, ndims, nargs, 1e-4f, 1.0f, INFINITY);
            }
        }

        // scale
        {
            srand(seed);
            const int nargs = 2;
            int64_t ne2[4];
            ne2[0] = 1;
            for (int ndims = 1; ndims <= 2; ++ndims) {
                x[1] = get_random_tensor_f32(ctx0, 1, ne2, -1.0f, 1.0f);
                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                ggml_set_param(ctx0, x[0]);
                ggml_set_param(ctx0, x[1]);
                struct ggml_tensor * f = ggml_sum(ctx0, ggml_scale(ctx0, x[0], x[1]));
                check_gradient("scale", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
            }
        }

        // cpy f32
        {
            srand(seed);
            const int nargs = 2;
            for (int ndims = 1; ndims <= 2; ++ndims) {
                for (int i = 0; i < nargs; ++i) {
                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                    ggml_set_param(ctx0, x[i]);
                }
                // x[1] is overwritten by x[0], so the gradients don't propagate to x[1]
                struct ggml_tensor * f = ggml_sum(ctx0, ggml_cpy(ctx0, x[0], x[1]));
                check_gradient("cpy f32", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
            }
        }

        // cpy f16
        {
            srand(seed);
            const int nargs = 2;
            for (int ndims = 1; ndims <= 2; ++ndims) {
                for (int i = 0; i < nargs; ++i) {
                    x[i] = get_random_tensor_f16(ctx0, ndims, ne, -1.0f, 1.0f);
                    ggml_set_param(ctx0, x[i]);
                }
                // x[1] is overwritten by x[0], so the gradients don't propagate to x[1]
                struct ggml_tensor * f = ggml_sum(ctx0, ggml_cpy(ctx0, x[0], x[1]));
                check_gradient("cpy f16", ctx0, x, f, ndims, nargs, 1e-1f, 1e-1f, INFINITY);
            }
        }

        // reshape (1d->nd)
        {
            srand(seed);
            const int nargs = 1;
            for (int ndims = 1; ndims <= 2; ++ndims) {
                int64_t ne2[4];
                ne2[0] = 1;
                ne2[1] = 1;
                ne2[2] = 1;
                ne2[3] = 1;
                for (int i = 0; i < ndims; ++i) {
                    ne2[0] *= ne[i];
                }
                x[0] = get_random_tensor_f32(ctx0, 1, ne2, -1.0f, 1.0f);
                x[1] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                ggml_set_param(ctx0, x[0]);
                struct ggml_tensor * f = ggml_sum(ctx0, ggml_reshape(ctx0, x[0], x[1]));
                check_gradient("reshape", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
            }
        }

        // reshape (nd->1d)
        {
            srand(seed);
            const int nargs = 1;
            for (int ndims = 1; ndims <= 2; ++ndims) {
                int64_t ne2[4];
                ne2[0] = 1;
                ne2[1] = 1;
                ne2[2] = 1;
                ne2[3] = 1;
                for (int i = 0; i < ndims; ++i) {
                    ne2[0] *= ne[i];
                }
                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                x[1] = get_random_tensor_f32(ctx0, 1, ne2, -1.0f, 1.0f);
                ggml_set_param(ctx0, x[0]);
                struct ggml_tensor * f = ggml_sum(ctx0, ggml_reshape(ctx0, x[0], x[1]));
                check_gradient("reshape", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
            }
        }
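        // the acc tests below add x[1] into a randomly chosen, in-bounds offset of x[0];
        // the while loops re-roll ne2 until x[1] is guaranteed to fit inside x[0]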
        // acc 1d
        {
            srand(seed);
            int64_t ne2[4] = { 1, 1, 1, 1 };
            const int nargs = 2;
            for (int ndims = 1; ndims <= 4; ++ndims) {
                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                ggml_set_param(ctx0, x[0]);
                get_random_dims(ne2, 1);
                while ((ne2[0] > ne[0]) || (ne2[0] > ggml_nelements(x[0]))) {
                    get_random_dims(ne2, 1);
                }
                x[1] = get_random_tensor_f32(ctx0, 1, ne2, -1.0f, 1.0f);
                ggml_set_param(ctx0, x[1]);
                const int max_offset = MAX(0, ggml_nelements(x[0]) - ggml_nelements(x[1]));
                const int offset = irand(max_offset) * ggml_element_size(x[0]);
                struct ggml_tensor * f = ggml_sum(ctx0, ggml_acc(ctx0, x[0], x[1], x[0]->nb[1], x[0]->nb[2], x[0]->nb[3], offset));
                check_gradient("acc 1d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
            }
        }

        // acc 2d
        {
            srand(seed);
            int64_t ne2[4] = { 1, 1, 1, 1 };
            int64_t max_offsets[4] = { 0, 0, 0, 0 };
            int64_t offsets[4] = { 0, 0, 0, 0 };
            const int nargs = 2;
            for (int ndims = 2; ndims <= 4; ++ndims) {
                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                ggml_set_param(ctx0, x[0]);
                get_random_dims(ne2, 2);
                while ((ne2[0] > ne[0]) || (ne2[1] > ne[1]) || (ne2[0]*ne2[1] > ggml_nelements(x[0]))) {
                    get_random_dims(ne2, 2);
                }
                x[1] = get_random_tensor_f32(ctx0, 2, ne2, -1.0f, 1.0f);
                ggml_set_param(ctx0, x[1]);
                max_offsets[0] = MAX(0, x[0]->ne[0] - x[1]->ne[0]);
                max_offsets[1] = MAX(0, x[0]->ne[1] - x[1]->ne[1]);
                offsets[0] = irand(max_offsets[0]) * x[0]->nb[0];
                offsets[1] = irand(max_offsets[1]) * x[0]->nb[1];
                const int offset = offsets[0] + offsets[1];
                struct ggml_tensor * f = ggml_sum(ctx0, ggml_acc(ctx0, x[0], x[1], x[0]->nb[1], x[0]->nb[2], x[0]->nb[3], offset));
                check_gradient("acc 2d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
            }
        }

        // acc 3d
        {
            srand(seed);
            int64_t ne2[4] = { 1, 1, 1, 1 };
            int64_t max_offsets[4] = { 0, 0, 0, 0 };
            int64_t offsets[4] = { 0, 0, 0, 0 };
            const int nargs = 2;
            for (int ndims = 3; ndims <= 4; ++ndims) {
                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                ggml_set_param(ctx0, x[0]);
                get_random_dims(ne2, 3);
                while ((ne2[0] > ne[0]) || (ne2[1] > ne[1]) || (ne2[2] > ne[2]) || (ne2[0]*ne2[1]*ne2[2] > ggml_nelements(x[0]))) {
                    get_random_dims(ne2, 3);
                }
                x[1] = get_random_tensor_f32(ctx0, 3, ne2, -1.0f, 1.0f);
                ggml_set_param(ctx0, x[1]);
                max_offsets[0] = MAX(0, x[0]->ne[0] - x[1]->ne[0]);
                max_offsets[1] = MAX(0, x[0]->ne[1] - x[1]->ne[1]);
                max_offsets[2] = MAX(0, x[0]->ne[2] - x[1]->ne[2]);
                offsets[0] = irand(max_offsets[0]) * x[0]->nb[0];
                offsets[1] = irand(max_offsets[1]) * x[0]->nb[1];
                offsets[2] = irand(max_offsets[2]) * x[0]->nb[2];
                const int offset = offsets[0] + offsets[1] + offsets[2];
                struct ggml_tensor * f = ggml_sum(ctx0, ggml_acc(ctx0, x[0], x[1], x[0]->nb[1], x[0]->nb[2], x[0]->nb[3], offset));
                check_gradient("acc 3d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
            }
        }

        // acc 4d
        {
            srand(seed);
            int64_t ne2[4] = { 1, 1, 1, 1 };
            int64_t max_offsets[4] = { 0, 0, 0, 0 };
            int64_t offsets[4] = { 0, 0, 0, 0 };
            const int nargs = 2;
            for (int ndims = 4; ndims <= 4; ++ndims) {
                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                ggml_set_param(ctx0, x[0]);
                get_random_dims(ne2, 4);
                while ((ne2[0] > ne[0]) || (ne2[1] > ne[1]) || (ne2[2] > ne[2]) || (ne2[3] > ne[3]) || (ne2[0]*ne2[1]*ne2[2]*ne2[3] > ggml_nelements(x[0]))) {
                    get_random_dims(ne2, 4);
                }
                x[1] = get_random_tensor_f32(ctx0, 4, ne2, -1.0f, 1.0f);
                ggml_set_param(ctx0, x[1]);
                max_offsets[0] = MAX(0, x[0]->ne[0] - x[1]->ne[0]);
                max_offsets[1] = MAX(0, x[0]->ne[1] - x[1]->ne[1]);
                max_offsets[2] = MAX(0, x[0]->ne[2] - x[1]->ne[2]);
                max_offsets[3] = MAX(0, x[0]->ne[3] - x[1]->ne[3]);
                offsets[0] = irand(max_offsets[0]) * x[0]->nb[0];
                offsets[1] = irand(max_offsets[1]) * x[0]->nb[1];
                offsets[2] = irand(max_offsets[2]) * x[0]->nb[2];
                offsets[3] = irand(max_offsets[3]) * x[0]->nb[3];
                const int offset = offsets[0] + offsets[1] + offsets[2] + offsets[3];
                struct ggml_tensor * f = ggml_sum(ctx0, ggml_acc(ctx0, x[0], x[1], x[0]->nb[1], x[0]->nb[2], x[0]->nb[3], offset));
                check_gradient("acc 4d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
            }
        }

        // set_1d
        {
            srand(seed);
            int64_t ne2[4];
            const int nargs = 2;
            for (int ndims = 1; ndims <= 4; ++ndims) {
                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                ggml_set_param(ctx0, x[0]);
                get_random_dims(ne2, 1);
                while ((ne2[0] > ne[0]) || (ne2[0] > ggml_nelements(x[0]))) {
                    get_random_dims(ne2, 1);
                }
                x[1] = get_random_tensor_f32(ctx0, 1, ne2, -1.0f, 1.0f);
                ggml_set_param(ctx0, x[1]);
                const int max_offset = MAX(0, ggml_nelements(x[0]) - ggml_nelements(x[1]));
                const int offset = irand(max_offset) * ggml_element_size(x[0]);
                struct ggml_tensor * f = ggml_sum(ctx0, ggml_set_1d(ctx0, x[0], x[1], offset));
                check_gradient("set_1d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
            }
        }

        // set_2d
        {
            srand(seed);
            int64_t ne2[4];
            int64_t max_offsets[4] = { 0, 0, 0, 0 };
            int64_t offsets[4] = { 0, 0, 0, 0 };
            const int nargs = 1;
            for (int ndims = 2; ndims <= 4; ++ndims) {
                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                ggml_set_param(ctx0, x[0]);
                get_random_dims(ne2, 2);
                while ((ne2[0] > ne[0]) || (ne2[1] > ne[1]) || (ne2[0]*ne2[1] > ggml_nelements(x[0]))) {
                    get_random_dims(ne2, 2);
                }
                x[1] = get_random_tensor_f32(ctx0, 2, ne2, -1.0f, 1.0f);
                ggml_set_param(ctx0, x[1]);
                max_offsets[0] = MAX(0, x[0]->ne[0] - x[1]->ne[0]);
                max_offsets[1] = MAX(0, x[0]->ne[1] - x[1]->ne[1]);
                offsets[0] = irand(max_offsets[0]) * x[0]->nb[0];
                offsets[1] = irand(max_offsets[1]) * x[0]->nb[1];
                const int offset = offsets[0] + offsets[1];
                struct ggml_tensor * f = ggml_sum(ctx0, ggml_set_2d(ctx0, x[0], x[1], x[1]->nb[1], offset));
                check_gradient("set_2d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
            }
        }
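        // the view tests below sum a random contiguous sub-view of x[0] (1d/2d/3d);
        // offsets and strides are expressed in bytes (sizeof(float) per element)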
        // view_1d
        {
            srand(seed);
            const int nargs = 1;
            for (int ndims = 1; ndims <= 4; ++ndims) {
                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                ggml_set_param(ctx0, x[0]);
                const int k0 = irand(ggml_nelements(x[0]));
                const int k1 = irand(ggml_nelements(x[0]));
                const int i0 = MIN(k0, k1);
                const int i1 = MAX(k0, k1);
                const int offset = i0 * sizeof(float);
                const int nelem = i1 - i0;
                struct ggml_tensor * f = ggml_sum(ctx0, ggml_view_1d(ctx0, x[0], nelem, offset));
                check_gradient("view_1d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
            }
        }

        // view_2d
        {
            srand(seed);
            int64_t ne2[4];
            int64_t nb2[4];
            const int nargs = 1;
            for (int ndims = 1; ndims <= 4; ++ndims) {
                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                get_random_dims(ne2, 2);
                while (ne2[0]*ne2[1] > ggml_nelements(x[0])) {
                    get_random_dims(ne2, 2);
                }
                const int count = ne2[0]*ne2[1];
                nb2[0] = sizeof(float);
                nb2[1] = nb2[0]*ne2[0];
                ggml_set_param(ctx0, x[0]);
                const int max_offset = ggml_nelements(x[0]) - count;
                const int offset = irand(max_offset+1) * sizeof(float);
                struct ggml_tensor * f = ggml_sum(ctx0, ggml_view_2d(ctx0, x[0], ne2[0], ne2[1], nb2[1], offset));
                check_gradient("view_2d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
            }
        }

        // view_3d
        {
            srand(seed);
            int64_t ne2[4] = {1,1,1,1};
            int64_t nb2[4] = {0,0,0,0};
            const int nargs = 1;
            for (int ndims = 1; ndims <= 4; ++ndims) {
                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                get_random_dims(ne2, 3);
                while (ne2[0]*ne2[1]*ne2[2] > ggml_nelements(x[0])) {
                    get_random_dims(ne2, 3);
                }
                const int count = ne2[0]*ne2[1]*ne2[2];
                nb2[0] = sizeof(float);
                nb2[1] = nb2[0]*ne2[0];
                nb2[2] = nb2[1]*ne2[1];
                ggml_set_param(ctx0, x[0]);
                const int max_offset = ggml_nelements(x[0]) - count;
                const int offset = irand(max_offset+1) * sizeof(float);
                struct ggml_tensor * f = ggml_sum(ctx0, ggml_view_3d(ctx0, x[0], ne2[0], ne2[1], ne2[2], nb2[1], nb2[2], offset));
                check_gradient("view_3d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
            }
        }

        // permute
        {
            srand(seed);
            int64_t ne2[4];
            const int nargs = 1;
            for (int ndims = 1; ndims <= 4; ++ndims)
            {
                // ggml_permute will set axes of dimensions below n_dims to 1.
                // to make ggml_permute work correctly on all axes,
                // the input tensor is created with the maximal n_dims of 4.
                for (int i=0; i<ndims; ++i) {
                    ne2[i] = ne[i];
                }
                for (int i=ndims; i<4; ++i) {
                    ne2[i] = 1;
                }
                x[0] = get_random_tensor_f32(ctx0, 4, ne2, -1.0f, 1.0f);
                ggml_set_param(ctx0, x[0]);
                const int p = irand(NUM_PERMUTATIONS);
                const int ax0 = all_permutations[p*4+0];
                const int ax1 = all_permutations[p*4+1];
                const int ax2 = all_permutations[p*4+2];
                const int ax3 = all_permutations[p*4+3];
                // sum requires contiguous tensor rows
                struct ggml_tensor * f = ggml_sum(ctx0, ggml_cont(ctx0, ggml_permute(ctx0, x[0], ax0, ax1, ax2, ax3)));
                check_gradient("permute", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
            }
        }

        // transpose
        {
            srand(seed);
            int64_t ne2[4];
            const int nargs = 1;
            for (int ndims = 1; ndims <= 4; ++ndims)
            {
                // ggml_transpose will set axes of dimensions below n_dims to 1.
                // to make ggml_transpose work correctly on all axes,
                // the input tensor is created with the maximal n_dims of 4.
                for (int i=0; i<ndims; ++i) {
                    ne2[i] = ne[i];
                }
                for (int i=ndims; i<4; ++i) {
                    ne2[i] = 1;
                }
                x[0] = get_random_tensor_f32(ctx0, 4, ne2, -1.0f, 1.0f);
                ggml_set_param(ctx0, x[0]);
                // sum requires contiguous tensor rows
                struct ggml_tensor * f = ggml_sum(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, x[0])));
                check_gradient("transpose", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
            }
        }

        // get_rows
        {
            srand(seed);
            int64_t ne2[4] = {ne[0], ne[1], 1, 1};
            int64_t ne3[4] = {1+irand(ne[1]), 1, 1, 1};
            const int nargs = 1;
            const int ndims = 2;
            x[0] = get_random_tensor_f32(ctx0, ndims, ne2, -1.0f, 1.0f);
            x[1] = get_random_tensor_i32(ctx0, 1, ne3, 0, ne2[1]);
            ggml_set_param(ctx0, x[0]);
            struct ggml_tensor * f = ggml_sum(ctx0, ggml_get_rows(ctx0, x[0], x[1]));
            check_gradient("get_rows", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
        }

        // diag_mask_inf
        {
            srand(seed);
            const int nargs = 1;
            const int ndims = 2;
            x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
            ggml_set_param(ctx0, x[0]);
            int n_past = irand(ne[0]);
            struct ggml_tensor * f = ggml_sum(ctx0, ggml_diag_mask_inf(ctx0, x[0], n_past));
            check_gradient("diag_mask_inf", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
        }

        // diag_mask_zero
        {
            srand(seed);
            const int nargs = 1;
            const int ndims = 2;
            x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
            ggml_set_param(ctx0, x[0]);
            int n_past = irand(ne[0]);
            struct ggml_tensor * f = ggml_sum(ctx0, ggml_diag_mask_zero(ctx0, x[0], n_past));
            check_gradient("diag_mask_zero", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
        }
        // softmax
        {
            srand(seed);
            const int nargs = 1;
            int64_t ne2[4];
            get_random_dims(ne2, 4);
            for (int ndims = 1; ndims <= 3; ++ndims) {
                x[0] = get_random_tensor_f32(ctx0, ndims, ne2, -1.0f, 1.0f);
                ggml_set_param(ctx0, x[0]);
                float eps = 1e-6f;
                // don't use only sum as aggregation, because the sum of a softmax is always 1 -> finite differences would not work.
                // instead use sum(log(soft_max()*(1-eps)+eps)); eps avoids log(0)
                struct ggml_tensor * f = ggml_sum(ctx0,
                                            ggml_log(ctx0,
                                                ggml_add1(ctx0,
                                                    ggml_scale(ctx0,
                                                        ggml_soft_max(ctx0, x[0]),
                                                        ggml_new_f32(ctx0, 1.0f - eps)),
                                                    ggml_new_f32(ctx0, eps))));
                check_gradient("softmax", ctx0, x, f, ndims, nargs, 1e-3f, 2e-1f, INFINITY);
                // NOTE: softmax forward is computed using f16 table lookup instead of using actual expf, but backward assumes actual expf.
                // this may result in gradients that differ from the finite differences.
                // when this test reports errors, first try to replace the table lookup with actual expf and test again to see if just that was the cause.
                // if only the table lookup causes gradients to differ, this is acceptable.
            }
        }

        // cross_entropy_loss
        {
            srand(seed);
            const int nargs = 1;
            int64_t ne2[4];
            get_random_dims(ne2, 4);
            for (int ndims = 1; ndims <= 4; ++ndims) {
                x[0] = get_random_tensor_f32(ctx0, ndims, ne2, -0.1f, 0.1f);
                x[1] = get_random_tensor_f32(ctx0, ndims, ne2, 0.0f, 1.0f);
                // the second argument to cross_entropy_loss must sum up to 1 for each row
                int nr = ggml_nrows(x[1]);
                int nc = ggml_nelements(x[1]) / nr;
                for (int ir = 0; ir < nr; ++ir) {
                    float sum = 0;
                    for (int ic = 0; ic < nc; ++ic) {
                        sum += ((float *) x[1]->data)[ic + ir*nc];
                    }
                    for (int ic = 0; ic < nc; ++ic) {
                        ((float *) x[1]->data)[ic + ir*nc] /= sum;
                    }
                }
                ggml_set_param(ctx0, x[0]);
                struct ggml_tensor * f = ggml_cross_entropy_loss(ctx0, x[0], x[1]);
                check_gradient("cross_entropy_loss", ctx0, x, f, ndims, nargs, 1e-4f, 1e-3f, INFINITY);
            }
        }
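        // for the rope tests, ne2[0] is rounded up to an even value because the rotation pairs up
        // adjacent dimensions; the position tensor p holds n_past .. n_past + ne2[2] - 1 and all
        // modes in 0..3 are iterated (odd modes are skipped, see below)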
        // rope f32
        {
            srand(seed);
            const int nargs = 1;
            int64_t ne2[4];
            get_random_dims(ne2, 4);
            ne2[0] += ne2[0] % 2;
            int n_rot = ne2[0];
            for (int ndims = 3; ndims <= 4; ++ndims) {
                for (int mode = 0; mode < 4; ++mode) {
                    for (int n_past = 1; n_past < ne2[2]; ++n_past) {
                        x[0] = get_random_tensor_f32(ctx0, ndims, ne2, -1.0f, 1.0f);

                        struct ggml_tensor * p = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ne2[2]);
                        for (int i = 0; i < ne2[2]; ++i) {
                            ((int32_t *) p->data)[i] = n_past + i;
                        }

                        ggml_set_param(ctx0, x[0]);

                        const bool skip_past = (mode & 1);
                        if (skip_past) {
                            // we have no past, so this would have to work on uninitialized memory.
                            // we only test the gradients here;
                            // skip_past should have no influence on gradient computation.
                            // so when other modes work, we assume that this does as well.
                            continue;
                        }

                        struct ggml_tensor * f = ggml_sum(ctx0, ggml_rope(ctx0, x[0], p, n_rot, mode, 0));

                        GGML_PRINT_DEBUG("rope f32: n_past: %d n_rot: %d mode: %d\n", n_past, n_rot, mode);
                        check_gradient("rope f32", ctx0, x, f, ndims, nargs, 1e-2f, 1e-3f, INFINITY);
                    }
                }
            }
        }

        // rope f16
        {
            srand(seed);
            const int nargs = 1;
            int64_t ne2[4];
            get_random_dims(ne2, 4);
            ne2[0] += ne2[0] % 2;
            int n_rot = ne2[0];
            for (int ndims = 3; ndims <= 4; ++ndims) {
                for (int mode = 0; mode < 4; ++mode) {
                    for (int n_past = 1; n_past < ne2[2]; ++n_past) {
                        x[0] = get_random_tensor_f16(ctx0, ndims, ne2, -1.0f, 1.0f);

                        struct ggml_tensor * p = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ne2[2]);
                        for (int i = 0; i < ne2[2]; ++i) {
                            ((int32_t *) p->data)[i] = n_past + i;
                        }

                        ggml_set_param(ctx0, x[0]);

                        const bool skip_past = (mode & 1);
                        if (skip_past) {
                            // we have no past, so this would have to work on uninitialized memory.
                            // we only test the gradients here;
                            // skip_past should have no influence on gradient computation.
                            // so when other modes work, we assume that this does as well.
                            continue;
                        }

                        struct ggml_tensor * f = ggml_sum(ctx0, ggml_rope(ctx0, x[0], p, n_rot, mode, 0));

                        GGML_PRINT_DEBUG("rope f16: n_past: %d n_rot: %d mode: %d\n", n_past, n_rot, mode);
                        check_gradient("rope f16", ctx0, x, f, ndims, nargs, 1e-1f, 1e-1f, INFINITY);
                    }
                }
            }
        }
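        // in the flash_attn tests, q has ne = {D, N, ...}, k has ne = {D, M, ...} and v has
        // ne = {M, D, ...}, with M = ne2[2] + N so the key/value length always covers the
        // query length; both the masked and unmasked variants are checked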
        // flash_attn f32
        {
            srand(seed);
            const int nargs = 3;
            int64_t ne2[4];
            get_random_dims(ne2, 4);
            int64_t D = ne2[0];
            int64_t N = ne2[1];
            int64_t M = ne2[2] + N;
            int64_t B = ne2[3];
            for (int masked = 0; masked <= 1; ++masked) {
                for (int ndims = 2; ndims <= 4; ++ndims) {
                    int max_nrep = (ndims >= 3) ? 2 : 1;
                    for (int nrep = 1; nrep < max_nrep; ++nrep) {
                        int64_t neq[4] = { D, N, B*nrep, ne[3] };
                        int64_t nek[4] = { D, M, B, ne[3] };
                        int64_t nev[4] = { M, D, B, ne[3] };
                        if (ndims == 2) {
                            neq[2] = 1; neq[3] = 1;
                            nek[2] = 1; nek[3] = 1;
                            nev[2] = 1; nev[3] = 1;
                        } else if (ndims == 3) {
                            neq[3] = 1;
                            nek[3] = 1;
                            nev[3] = 1;
                        }
                        x[0] = get_random_tensor_f32(ctx0, ndims, neq, -0.1250f, 0.1250f);
                        x[1] = get_random_tensor_f32(ctx0, ndims, nek, -0.1250f, 0.1250f);
                        x[2] = get_random_tensor_f32(ctx0, ndims, nev, -0.1250f, 0.1250f);
                        ggml_set_param(ctx0, x[0]);
                        ggml_set_param(ctx0, x[1]);
                        ggml_set_param(ctx0, x[2]);
                        struct ggml_tensor * f = ggml_sum(ctx0, ggml_flash_attn(ctx0, x[0], x[1], x[2], (masked == 0)));
                        check_gradient("flash_attn f32", ctx0, x, f, ndims, nargs, 1.5e-4f, 1e-3f, INFINITY);
                    }
                }
            }
        }

        // flash_attn f16, not yet fully implemented
        if(0)
        {
            srand(seed);
            const int nargs = 3;
            int64_t ne2[4];
            get_random_dims(ne2, 4);
            int64_t D = ne2[0];
            int64_t N = ne2[1];
            int64_t M = ne2[2] + N;
            int64_t B = ne2[3];
            for (int masked = 0; masked <= 1; ++masked) {
                for (int ndims = 2; ndims <= 4; ++ndims) {
                    int64_t neq[4] = { D, N, B, ne[3] };
                    int64_t nek[4] = { D, M, B, ne[3] };
                    int64_t nev[4] = { M, D, B, ne[3] };
                    if (ndims == 2) {
                        neq[2] = 1; neq[3] = 1;
                        nek[2] = 1; nek[3] = 1;
                        nev[2] = 1; nev[3] = 1;
                    } else if (ndims == 3) {
                        neq[3] = 1;
                        nek[3] = 1;
                        nev[3] = 1;
                    }
                    x[0] = get_random_tensor_f16(ctx0, ndims, neq, -0.1250f, 0.1250f);
                    x[1] = get_random_tensor_f16(ctx0, ndims, nek, -0.1250f, 0.1250f);
                    x[2] = get_random_tensor_f16(ctx0, ndims, nev, -0.1250f, 0.1250f);
                    ggml_set_param(ctx0, x[0]);
                    ggml_set_param(ctx0, x[1]);
                    ggml_set_param(ctx0, x[2]);
                    struct ggml_tensor * f = ggml_sum(ctx0, ggml_flash_attn(ctx0, x[0], x[1], x[2], (masked == 0)));
                    check_gradient("flash_attn f16", ctx0, x, f, ndims, nargs, 1.5e-4f, 1e-3f, INFINITY);
                }
            }
        }

        ggml_free(ctx0);
    }

    return 0;
}