test-grad0.cpp

#define _CRT_SECURE_NO_DEPRECATE // Disables ridiculous "unsafe" warnings on Windows
#include "ggml.h"

#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <cassert>

#if defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data
#endif

#if defined(__GNUC__)
#pragma GCC diagnostic ignored "-Wdouble-promotion"
#endif

#define MAX_NARGS 3

#undef MIN
#undef MAX
#define MIN(a, b) ((a) < (b) ? (a) : (b))
#define MAX(a, b) ((a) > (b) ? (a) : (b))

#define GGML_SILU_FP16

//
// logging
//
#if (GGML_DEBUG >= 1)
#define GGML_PRINT_DEBUG(...) printf(__VA_ARGS__)
#else
#define GGML_PRINT_DEBUG(...)
#endif

#if (GGML_DEBUG >= 5)
#define GGML_PRINT_DEBUG_5(...) printf(__VA_ARGS__)
#else
#define GGML_PRINT_DEBUG_5(...)
#endif

#if (GGML_DEBUG >= 10)
#define GGML_PRINT_DEBUG_10(...) printf(__VA_ARGS__)
#else
#define GGML_PRINT_DEBUG_10(...)
#endif

#define GGML_PRINT(...) printf(__VA_ARGS__)
static float frand(void) {
    return (float)rand()/(float)RAND_MAX;
}

static int irand(int n) {
    if (n == 0) return 0;
    return rand()%n;
}

static void get_random_dims(int64_t * dims, int ndims) {
    dims[0] = dims[1] = dims[2] = dims[3] = 1;

    for (int i = 0; i < ndims; i++) {
        dims[i] = 1 + irand(4);
    }
}
static struct ggml_tensor * get_random_tensor_f32(
        struct ggml_context * ctx0,
        int ndims,
        int64_t ne[],
        float fmin,
        float fmax) {
    struct ggml_tensor * result = ggml_new_tensor(ctx0, GGML_TYPE_F32, ndims, ne);

    switch (ndims) {
        case 1:
            for (int i0 = 0; i0 < ne[0]; i0++) {
                ((float *)result->data)[i0] = frand()*(fmax - fmin) + fmin;
            }
            break;
        case 2:
            for (int i1 = 0; i1 < ne[1]; i1++) {
                for (int i0 = 0; i0 < ne[0]; i0++) {
                    ((float *)result->data)[i1*ne[0] + i0] = frand()*(fmax - fmin) + fmin;
                }
            }
            break;
        case 3:
            for (int i2 = 0; i2 < ne[2]; i2++) {
                for (int i1 = 0; i1 < ne[1]; i1++) {
                    for (int i0 = 0; i0 < ne[0]; i0++) {
                        ((float *)result->data)[i2*ne[1]*ne[0] + i1*ne[0] + i0] = frand()*(fmax - fmin) + fmin;
                    }
                }
            }
            break;
        case 4:
            for (int i3 = 0; i3 < ne[3]; i3++) {
                for (int i2 = 0; i2 < ne[2]; i2++) {
                    for (int i1 = 0; i1 < ne[1]; i1++) {
                        for (int i0 = 0; i0 < ne[0]; i0++) {
                            ((float *)result->data)[i3*ne[2]*ne[1]*ne[0] + i2*ne[1]*ne[0] + i1*ne[0] + i0] = frand()*(fmax - fmin) + fmin;
                        }
                    }
                }
            }
            break;
        default:
            assert(false);
    }

    return result;
}
static struct ggml_tensor * get_random_tensor_f16(
        struct ggml_context * ctx0,
        int ndims,
        int64_t ne[],
        float fmin,
        float fmax) {
    struct ggml_tensor * result = ggml_new_tensor(ctx0, GGML_TYPE_F16, ndims, ne);

    switch (ndims) {
        case 1:
            for (int i0 = 0; i0 < ne[0]; i0++) {
                ((ggml_fp16_t *)result->data)[i0] = ggml_fp32_to_fp16(frand()*(fmax - fmin) + fmin);
            }
            break;
        case 2:
            for (int i1 = 0; i1 < ne[1]; i1++) {
                for (int i0 = 0; i0 < ne[0]; i0++) {
                    ((ggml_fp16_t *)result->data)[i1*ne[0] + i0] = ggml_fp32_to_fp16(frand()*(fmax - fmin) + fmin);
                }
            }
            break;
        case 3:
            for (int i2 = 0; i2 < ne[2]; i2++) {
                for (int i1 = 0; i1 < ne[1]; i1++) {
                    for (int i0 = 0; i0 < ne[0]; i0++) {
                        ((ggml_fp16_t *)result->data)[i2*ne[1]*ne[0] + i1*ne[0] + i0] = ggml_fp32_to_fp16(frand()*(fmax - fmin) + fmin);
                    }
                }
            }
            break;
        case 4:
            for (int i3 = 0; i3 < ne[3]; i3++) {
                for (int i2 = 0; i2 < ne[2]; i2++) {
                    for (int i1 = 0; i1 < ne[1]; i1++) {
                        for (int i0 = 0; i0 < ne[0]; i0++) {
                            ((ggml_fp16_t *)result->data)[i3*ne[2]*ne[1]*ne[0] + i2*ne[1]*ne[0] + i1*ne[0] + i0] = ggml_fp32_to_fp16(frand()*(fmax - fmin) + fmin);
                        }
                    }
                }
            }
            break;
        default:
            assert(false);
    }

    return result;
}
static struct ggml_tensor * get_random_tensor_i32(
        struct ggml_context * ctx0,
        int ndims,
        int64_t ne[],
        int32_t imin,
        int32_t imax) {
    struct ggml_tensor * result = ggml_new_tensor(ctx0, GGML_TYPE_I32, ndims, ne);

    switch (ndims) {
        case 1:
            for (int i0 = 0; i0 < ne[0]; i0++) {
                ((int32_t *)result->data)[i0] = irand(imax - imin) + imin;
            }
            break;
        case 2:
            for (int i1 = 0; i1 < ne[1]; i1++) {
                for (int i0 = 0; i0 < ne[0]; i0++) {
                    ((int32_t *)result->data)[i1*ne[0] + i0] = irand(imax - imin) + imin;
                }
            }
            break;
        case 3:
            for (int i2 = 0; i2 < ne[2]; i2++) {
                for (int i1 = 0; i1 < ne[1]; i1++) {
                    for (int i0 = 0; i0 < ne[0]; i0++) {
                        ((int32_t *)result->data)[i2*ne[1]*ne[0] + i1*ne[0] + i0] = irand(imax - imin) + imin;
                    }
                }
            }
            break;
        case 4:
            for (int i3 = 0; i3 < ne[3]; i3++) {
                for (int i2 = 0; i2 < ne[2]; i2++) {
                    for (int i1 = 0; i1 < ne[1]; i1++) {
                        for (int i0 = 0; i0 < ne[0]; i0++) {
                            ((int32_t *)result->data)[i3*ne[2]*ne[1]*ne[0] + i2*ne[1]*ne[0] + i1*ne[0] + i0] = irand(imax - imin) + imin;
                        }
                    }
                }
            }
            break;
        default:
            assert(false);
    }

    return result;
}
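
// check_gradient (below) compares two estimates of df/dx for every element of every
// input tensor: g1 from ggml's backward graph, and g0 from central finite differences,
//     g0 = (f(x + eps) - f(x - eps)) / (2*eps).
// The check fails when either the absolute or the relative error exceeds its bound.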
static bool check_gradient(
        const char * op_name,
        struct ggml_context * ctx0,
        struct ggml_tensor * x[],
        struct ggml_tensor * f,
        int ndims,
        int nargs,
        float eps,
        float max_error_abs,
        float max_error_rel) {
    static int n_threads = -1;
    if (n_threads < 0) {
        n_threads = GGML_DEFAULT_N_THREADS;

        const char *env = getenv("GGML_N_THREADS");
        if (env) {
            n_threads = atoi(env);
        }

        printf("GGML_N_THREADS = %d\n", n_threads);
    }

    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, GGML_DEFAULT_GRAPH_SIZE, true);
    struct ggml_cgraph * gb = ggml_new_graph_custom(ctx0, GGML_DEFAULT_GRAPH_SIZE, true);
    ggml_build_forward_expand(gf, f);
    ggml_graph_cpy(gf, gb);
    ggml_build_backward_expand(ctx0, gf, gb, false);

    ggml_graph_compute_with_ctx(ctx0, gf, n_threads);

    ggml_graph_reset (gf);
    ggml_set_f32     (f->grad, 1.0f);

    ggml_graph_compute_with_ctx(ctx0, gb, n_threads);

    // ggml_graph_dump_dot(gf, NULL, "test-grad0-forward.dot");
    // ggml_graph_dump_dot(gb, gf,  "test-grad0-backward.dot");

    for (int i = 0; i < nargs; ++i) {
        const int nelements = ggml_nelements(x[i]);
        for (int k = 0; k < nelements; ++k) {
            // compute gradient using finite differences
            const float x0 = ggml_get_f32_1d(x[i], k);
            const float xm = x0 - eps;
            const float xp = x0 + eps;
            ggml_set_f32_1d(x[i], k, xp);

            ggml_graph_compute_with_ctx(ctx0, gf, n_threads);

            const double f0 = ggml_get_f32_1d(f, 0);

            ggml_set_f32_1d(x[i], k, xm);

            ggml_graph_compute_with_ctx(ctx0, gf, n_threads);

            const double f1 = ggml_get_f32_1d(f, 0);
            const double g0 = (f0 - f1)/(2.0*(double) eps);

            ggml_set_f32_1d(x[i], k, x0);

            // compute gradient using backward graph
            ggml_graph_reset (gf);
            ggml_set_f32     (f->grad, 1.0f);

            ggml_graph_compute_with_ctx(ctx0, gb, n_threads);

            const double g1 = ggml_get_f32_1d(x[i]->grad, k);

            const double error_abs = fabs(g0 - g1);
            const double error_rel = g0 != 0 ? fabs(g0 - g1)/fabs(g0) : 0;

            if (error_abs > max_error_abs || error_rel > max_error_rel) {
                printf("%s: ndims=%d, i=%d, k=%d, x0=%f, xm=%f, xp=%f, f0=%f, f1=%f, g0=%f, g1=%f, eps=%f, error_abs=%f, error_rel=%f\n",
                        op_name, ndims, i, k, x0, xm, xp, f0, f1, g0, g1, eps, error_abs, error_rel);
                //assert(false);
                return false;
            }
        }
    }

    return true;
}
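
// check_mat_mul (below) is a naive reference check for ggml_mul_mat on 2-D tensors:
// each output element dst[i*nc + j] must equal the dot product of row j of x0
// with row i of x1, where both rows have length nk = x0->ne[0].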
// TODO: clean-up this ..
static bool check_mat_mul(
        const struct ggml_tensor * y,
        const struct ggml_tensor * x0,
        const struct ggml_tensor * x1) {
    float * dst  = (float *) y->data;
    float * src0 = (float *) x0->data;
    float * src1 = (float *) x1->data;

    const int nc = x0->ne[1];
    const int nr = x1->ne[1];
    const int nk = x0->ne[0];

    GGML_PRINT_DEBUG("check_mat_mul: nc=%d, nr=%d, nk=%d\n", nc, nr, nk);

    GGML_PRINT_DEBUG("x0:\n");
    for (int j = 0; j < x0->ne[1]; ++j) {
        for (int i = 0; i < x0->ne[0]; ++i) {
            GGML_PRINT_DEBUG("%6.3f ", src0[j*nk + i]);
        }
        GGML_PRINT_DEBUG("\n");
    }
    GGML_PRINT_DEBUG("\n");

    GGML_PRINT_DEBUG("x1:\n");
    for (int j = 0; j < x1->ne[1]; ++j) {
        for (int i = 0; i < x1->ne[0]; ++i) {
            GGML_PRINT_DEBUG("%6.3f ", src1[j*nk + i]);
        }
        GGML_PRINT_DEBUG("\n");
    }
    GGML_PRINT_DEBUG("\n");

    GGML_PRINT_DEBUG("y: n_dims = %d, (%lld, %lld)\n", y->n_dims, y->ne[0], y->ne[1]);
    for (int j = 0; j < y->ne[1]; ++j) {
        for (int i = 0; i < y->ne[0]; ++i) {
            GGML_PRINT_DEBUG("%6.3f ", dst[j*nr + i]);
        }
        GGML_PRINT_DEBUG("\n");
    }

    for (int i = 0; i < nr; ++i) {
        for (int j = 0; j < nc; ++j) {
            float sum = 0.0f;
            for (int k = 0; k < nk; ++k) {
                sum += src0[j*nk + k]*src1[i*nk + k];
            }

            if (fabsf(dst[i*nc + j] - sum) > 1e-5f) {
                fprintf(stderr, "check_mat_mul: dst[%d] = %f, sum = %f\n", i*nc + j, dst[i*nc + j], sum);
                assert(false);
                return false;
            }
        }
    }

    return true;
}

#define NUM_PERMUTATIONS (4*3*2*1)
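
// main runs niter iterations (default 4, overridable via GGML_NLOOP or argv[1]).
// Each iteration seeds the RNG, draws fresh random dimensions ne[4], and runs one
// gradient-check block per operator on a fresh ggml context.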
int main(int argc, const char ** argv) {
    struct ggml_init_params params = {
        /* .mem_size   = */ 256*1024*1024,
        /* .mem_buffer = */ NULL,
        /* .no_alloc   = */ false,
    };

    int64_t ne[4];

    int all_permutations[4 * NUM_PERMUTATIONS];
    {
        int count = 0;
        for (int ax0=0; ax0<4; ++ax0) {
            for (int ax1=0; ax1<4; ++ax1) {
                if (ax1 == ax0) continue;
                for (int ax2=0; ax2<4; ++ax2) {
                    if (ax2 == ax0) continue;
                    if (ax2 == ax1) continue;
                    for (int ax3=0; ax3<4; ++ax3) {
                        if (ax3 == ax0) continue;
                        if (ax3 == ax1) continue;
                        if (ax3 == ax2) continue;
                        assert(count < NUM_PERMUTATIONS);
                        all_permutations[count*4+0] = ax0;
                        all_permutations[count*4+1] = ax1;
                        all_permutations[count*4+2] = ax2;
                        all_permutations[count*4+3] = ax3;
                        ++count;
                    }
                }
            }
        }
    }

    unsigned seed_iter = 1;

    // original loop: 1000
    int niter = 4;
    const char *env = getenv("GGML_NLOOP");
    if (env != NULL) {
        niter = atoi(env);
    }
    if (argc > 1) {
        niter = atoi(argv[1]);
    }
    for (int iter = 0; iter < niter; ++iter) {
        srand(seed_iter);
        seed_iter = rand();
        unsigned seed = rand();

        printf("test-grad0: iter:%d/%d\n", iter, niter);
        struct ggml_context * ctx0 = ggml_init(params);

        get_random_dims(ne, 4);

        struct ggml_tensor * x[MAX_NARGS];

        // add f32
        {
            srand(seed);
            const int nargs = 2;

            for (int ndims = 1; ndims <= 4; ++ndims) {
                for (int i = 0; i < nargs; ++i) {
                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                    ggml_set_param(ctx0, x[i]);
                }

                struct ggml_tensor * f = ggml_sum(ctx0, ggml_add(ctx0, x[0], x[1]));

                check_gradient("add f32", ctx0, x, f, ndims, nargs, 1e-3f, 2e-3f, 2e-3f);
            }
        }
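
        // Every block below follows the same pattern as "add f32" above: re-seed the
        // RNG so each op sees the same random data, build a scalar f = ggml_sum(op(...)),
        // mark the differentiable inputs with ggml_set_param, and hand f to
        // check_gradient with op-specific error bounds.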
        // add f16
        {
            srand(seed);
            const int nargs = 2;

            for (int ndims = 1; ndims <= 4; ++ndims) {
                for (int i = 0; i < nargs; ++i) {
                    x[i] = get_random_tensor_f16(ctx0, ndims, ne, -1.0f, 1.0f);
                    ggml_set_param(ctx0, x[i]);
                }

                struct ggml_tensor * f = ggml_sum(ctx0, ggml_add(ctx0, x[0], x[1]));

                check_gradient("add f16", ctx0, x, f, ndims, nargs, 1e-1f, 2e-1f, 2e-1f);
            }
        }

        // sub
        {
            srand(seed);
            const int nargs = 2;

            for (int ndims = 1; ndims <= 4; ++ndims) {
                for (int i = 0; i < nargs; ++i) {
                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                    ggml_set_param(ctx0, x[i]);
                }

                struct ggml_tensor * f = ggml_sum(ctx0, ggml_sub(ctx0, x[0], x[1]));

                check_gradient("sub", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f);
            }
        }

        // mul
        {
            srand(seed);
            const int nargs = 2;

            for (int ndims = 1; ndims <= 4; ++ndims) {
                for (int i = 0; i < nargs; ++i) {
                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                    ggml_set_param(ctx0, x[i]);
                }

                struct ggml_tensor * f = ggml_sum(ctx0, ggml_mul(ctx0, x[0], x[1]));

                check_gradient("mul", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
            }
        }

        // div
        {
            srand(seed);
            const int nargs = 2;

            for (int ndims = 1; ndims <= 4; ++ndims) {
                for (int i = 0; i < nargs; ++i) {
                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, 0.5f, 1.0f);
                    ggml_set_param(ctx0, x[i]);
                }

                struct ggml_tensor * f = ggml_sum(ctx0, ggml_div(ctx0, x[0], x[1]));

                check_gradient("div", ctx0, x, f, ndims, nargs, 1e-3f, 1e-1f, 1e-1f);
            }
        }

        // sqr
        {
            srand(seed);
            const int nargs = 1;

            for (int ndims = 1; ndims <= 2; ++ndims) {
                for (int i = 0; i < nargs; ++i) {
                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                    ggml_set_param(ctx0, x[i]);
                }

                struct ggml_tensor * f = ggml_sum(ctx0, ggml_sqr(ctx0, x[0]));

                check_gradient("sqr", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
            }
        }

        // sqrt
        {
            srand(seed);
            const int nargs = 1;

            for (int ndims = 1; ndims <= 2; ++ndims) {
                for (int i = 0; i < nargs; ++i) {
                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, 2.0f*1e-3f, 1.0f);
                    ggml_set_param(ctx0, x[i]);
                }

                struct ggml_tensor * f = ggml_sum(ctx0, ggml_sqrt(ctx0, x[0]));

                check_gradient("sqrt", ctx0, x, f, ndims, nargs, 1e-3f, 2e-2f, 1e-1f);
            }
        }

        // log
        {
            srand(seed);
            const int nargs = 1;

            for (int ndims = 1; ndims <= 2; ++ndims) {
                for (int i = 0; i < nargs; ++i) {
                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, 2.0f*1e-3f, 1.0f);
                    ggml_set_param(ctx0, x[i]);
                }

                struct ggml_tensor * f = ggml_sum(ctx0, ggml_log(ctx0, x[0]));

                check_gradient("log", ctx0, x, f, ndims, nargs, 1e-3f, INFINITY, 1e-1f);
            }
        }

        // sum
        {
            srand(seed);
            const int nargs = 1;

            for (int ndims = 1; ndims <= 2; ++ndims) {
                for (int i = 0; i < nargs; ++i) {
                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                    ggml_set_param(ctx0, x[i]);
                }

                struct ggml_tensor * f = ggml_sum(ctx0, x[0]);

                check_gradient("sum", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f);
            }
        }

        // sum_rows
        {
            srand(seed);
            const int nargs = 1;

            for (int ndims = 1; ndims <= 4; ++ndims) {
                for (int i = 0; i < nargs; ++i) {
                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                    ggml_set_param(ctx0, x[i]);
                }

                struct ggml_tensor * f = ggml_sum(ctx0, ggml_sqr(ctx0, ggml_sum_rows(ctx0, x[0])));

                check_gradient("sum_rows", ctx0, x, f, ndims, nargs, 1e-3f, 1e-2f, INFINITY);
            }
        }

        // mean, not yet fully implemented
        if(0)
        {
            srand(seed);
            const int nargs = 1;

            for (int ndims = 1; ndims <= 4; ++ndims) {
                for (int i = 0; i < nargs; ++i) {
                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                    ggml_set_param(ctx0, x[i]);
                }

                struct ggml_tensor * f = ggml_sum(ctx0, ggml_mean(ctx0, x[0]));

                check_gradient("mean", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f);
            }
        }

        // argmax
        if (0)
        {
            srand(seed);
            const int nargs = 1;

            for (int ndims = 1; ndims <= 4; ++ndims) {
                for (int i = 0; i < nargs; ++i) {
                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                    ggml_set_param(ctx0, x[i]);
                }

                struct ggml_tensor * f = ggml_sum(ctx0, ggml_argmax(ctx0, x[0]));

                check_gradient("argmax", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f);
            }
        }
        // repeat
        {
            srand(seed);
            int64_t ne2[4];
            get_random_dims(ne2, 4);

            ne2[0] = ne[0] * ne2[0];
            ne2[1] = ne[1] * ne2[1];
            ne2[2] = 1;
            ne2[3] = 1;

            const int nargs = 1;
            for (int ndims = 1; ndims <= 2; ++ndims) {
                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                x[1] = get_random_tensor_f32(ctx0, ndims, ne2, -1.0f, 1.0f);
                ggml_set_param(ctx0, x[0]);

                struct ggml_tensor * f = ggml_sum(ctx0, ggml_sqr(ctx0, ggml_sub(ctx0, x[1], ggml_repeat(ctx0, x[0], x[1]))));

                check_gradient("repeat", ctx0, x, f, ndims, nargs, 1e-3f, 1e-2f, INFINITY);
            }
        }

        // repeat back
        {
            srand(seed);
            int64_t ne2[4];
            get_random_dims(ne2, 4);

            ne2[0] = ne[0] * ne2[0];
            ne2[1] = ne[1] * ne2[1];
            ne2[2] = 1;
            ne2[3] = 1;

            const int nargs = 1;
            for (int ndims = 1; ndims <= 2; ++ndims) {
                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                x[1] = get_random_tensor_f32(ctx0, ndims, ne2, -1.0f, 1.0f);
                ggml_set_param(ctx0, x[0]);

                struct ggml_tensor * f = ggml_sum(ctx0, ggml_sqr(ctx0, ggml_sub(ctx0, x[0], ggml_repeat_back(ctx0, x[1], x[0]))));

                check_gradient("repeat back", ctx0, x, f, ndims, nargs, 1e-3f, 1e-2f, INFINITY);
            }
        }

        // abs (finite differences do not work)
        //{
        //    const int nargs = 1;
        //    for (int ndims = 1; ndims <= 2; ++ndims) {
        //        for (int i = 0; i < nargs; ++i) {
        //            x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
        //            ggml_set_param(ctx0, x[i]);
        //        }
        //        struct ggml_tensor * f = ggml_sum(ctx0, ggml_abs(ctx0, x[0]));
        //        check_gradient("abs", ctx0, x, f, ndims, nargs, 1e-3f, INFINITY, 1e-3f);
        //    }
        //}

        // sgn
        {
            srand(seed);
            const int nargs = 1;

            for (int ndims = 1; ndims <= 4; ++ndims) {
                for (int i = 0; i < nargs; ++i) {
                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                    ggml_set_param(ctx0, x[i]);
                }

                struct ggml_tensor* f = ggml_sum(ctx0, ggml_sgn(ctx0, x[0]));

                check_gradient("sgn", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f);
            }
        }

        // neg
        {
            srand(seed);
            const int nargs = 1;

            for (int ndims = 1; ndims <= 4; ++ndims) {
                for (int i = 0; i < nargs; ++i) {
                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                    ggml_set_param(ctx0, x[i]);
                }

                struct ggml_tensor* f = ggml_sum(ctx0, ggml_neg(ctx0, x[0]));

                check_gradient("neg", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f);
            }
        }

        // step
        {
            srand(seed);
            const int nargs = 1;

            for (int ndims = 1; ndims <= 4; ++ndims) {
                for (int i = 0; i < nargs; ++i) {
                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                    ggml_set_param(ctx0, x[i]);
                }

                struct ggml_tensor* f = ggml_sum(ctx0, ggml_step(ctx0, x[0]));

                check_gradient("step", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f);
            }
        }
        // tanh, not yet fully implemented
        if(0)
        {
            srand(seed);
            const int nargs = 1;

            for (int ndims = 1; ndims <= 4; ++ndims) {
                for (int i = 0; i < nargs; ++i) {
                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                    ggml_set_param(ctx0, x[i]);
                }

                struct ggml_tensor* f = ggml_sum(ctx0, ggml_tanh(ctx0, x[0]));

                check_gradient("tanh", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f);
            }
        }

        // mul_mat
        {
            srand(seed);
            const int nargs = 2;

            for (int ndims = 2; ndims <= 4; ++ndims) {
                int max_nrep = (ndims >= 3) ? 2 : 1;
                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                for (int nrep2 = 1; nrep2 < max_nrep; ++nrep2) {
                    for (int nrep3 = 1; nrep3 < max_nrep; ++nrep3) {
                        {
                            int64_t ne2[4];
                            get_random_dims(ne2, 4);
                            ne2[0] = ne[0];
                            ne2[2] = nrep2 * ne[2];
                            ne2[3] = nrep3 * ne[3];
                            x[1] = get_random_tensor_f32(ctx0, ndims, ne2, -1.0f, 1.0f);
                        }

                        ggml_set_param(ctx0, x[0]);
                        ggml_set_param(ctx0, x[1]);

                        struct ggml_tensor * m = ggml_mul_mat(ctx0, x[1], x[0]);
                        struct ggml_tensor * f = ggml_sum(ctx0, m);

                        GGML_PRINT_DEBUG("testing: mul_mat, [%lld, %lld] (%d) * [%lld, %lld] (%d)\n", x[1]->ne[0], x[1]->ne[1], x[1]->n_dims, x[0]->ne[0], x[0]->ne[1], x[0]->n_dims);

                        check_gradient("mul_mat", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
                        if (ndims == 2) {
                            // check_mat_mul does not support ndims > 2
                            check_mat_mul(m, x[1], x[0]);
                        }
                    }
                }
            }
        }

        // elu, not yet fully implemented
        if(0)
        {
            srand(seed);
            const int nargs = 1;

            for (int ndims = 1; ndims <= 4; ++ndims) {
                for (int i = 0; i < nargs; ++i) {
                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                    ggml_set_param(ctx0, x[i]);
                }

                struct ggml_tensor* f = ggml_sum(ctx0, ggml_elu(ctx0, x[0]));

                check_gradient("elu", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f);
            }
        }

        // relu
        {
            srand(seed);
            const int nargs = 1;

            for (int ndims = 1; ndims <= 4; ++ndims) {
                for (int i = 0; i < nargs; ++i) {
                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                    ggml_set_param(ctx0, x[i]);
                }

                struct ggml_tensor* f = ggml_sum(ctx0, ggml_relu(ctx0, x[0]));

                check_gradient("relu", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
            }
        }

        // gelu, not yet fully implemented
        if(0)
        {
            srand(seed);
            const int nargs = 1;

            for (int ndims = 1; ndims <= 4; ++ndims) {
                for (int i = 0; i < nargs; ++i) {
                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                    ggml_set_param(ctx0, x[i]);
                }

                struct ggml_tensor* f = ggml_sum(ctx0, ggml_gelu(ctx0, x[0]));

                check_gradient("gelu", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f);
            }
        }

        // silu
        {
            srand(seed);
            const int nargs = 1;

            for (int ndims = 1; ndims <= 2; ++ndims) {
                for (int i = 0; i < nargs; ++i) {
                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                    ggml_set_param(ctx0, x[i]);
                }

                struct ggml_tensor * f = ggml_sum(ctx0, ggml_silu(ctx0, x[0]));

#ifdef GGML_SILU_FP16
                // due to GGML_SILU_FP16 the finite difference method will be slightly wrong -> increase error bounds.
                check_gradient("silu", ctx0, x, f, ndims, nargs, 1e-3f, 0.5, INFINITY);
#else
                check_gradient("silu", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
#endif
            }
        }
        // rms_norm
        {
            srand(seed);
            const int nargs = 1;

            for (int ndims = 1; ndims <= 2; ++ndims) {
                for (int i = 0; i < nargs; ++i) {
                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                    ggml_set_param(ctx0, x[i]);
                }

                struct ggml_tensor * f = ggml_sum(ctx0, ggml_rms_norm(ctx0, x[0], 1e-6f));

                check_gradient("rms_norm", ctx0, x, f, ndims, nargs, 1e-4f, 1.0f, INFINITY);
            }
        }

        // scale
        {
            srand(seed);
            const int nargs = 2;

            int64_t ne2[4];
            ne2[0] = 1;

            for (int ndims = 1; ndims <= 2; ++ndims) {
                x[1] = get_random_tensor_f32(ctx0, 1, ne2, -1.0f, 1.0f);
                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);

                ggml_set_param(ctx0, x[0]);
                ggml_set_param(ctx0, x[1]);

                struct ggml_tensor * f = ggml_sum(ctx0, ggml_scale(ctx0, x[0], x[1]));

                check_gradient("scale", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
            }
        }

        // cpy f32
        {
            srand(seed);
            const int nargs = 2;

            for (int ndims = 1; ndims <= 2; ++ndims) {
                for (int i = 0; i < nargs; ++i) {
                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                    ggml_set_param(ctx0, x[i]);
                }
                // x[1] is overwritten by x[0], so the gradients don't propagate to x[1]

                struct ggml_tensor * f = ggml_sum(ctx0, ggml_cpy(ctx0, x[0], x[1]));

                check_gradient("cpy f32", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
            }
        }

        // cpy f16
        {
            srand(seed);
            const int nargs = 2;

            for (int ndims = 1; ndims <= 2; ++ndims) {
                for (int i = 0; i < nargs; ++i) {
                    x[i] = get_random_tensor_f16(ctx0, ndims, ne, -1.0f, 1.0f);
                    ggml_set_param(ctx0, x[i]);
                }
                // x[1] is overwritten by x[0], so the gradients don't propagate to x[1]

                struct ggml_tensor * f = ggml_sum(ctx0, ggml_cpy(ctx0, x[0], x[1]));

                check_gradient("cpy f16", ctx0, x, f, ndims, nargs, 1e-1f, 1e-1f, INFINITY);
            }
        }

        // reshape (1d->nd)
        {
            srand(seed);
            const int nargs = 1;

            for (int ndims = 1; ndims <= 2; ++ndims) {
                int64_t ne2[4];
                ne2[0] = 1;
                ne2[1] = 1;
                ne2[2] = 1;
                ne2[3] = 1;
                for (int i = 0; i < ndims; ++i) {
                    ne2[0] *= ne[i];
                }
                x[0] = get_random_tensor_f32(ctx0, 1, ne2, -1.0f, 1.0f);
                x[1] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                ggml_set_param(ctx0, x[0]);

                struct ggml_tensor * f = ggml_sum(ctx0, ggml_reshape(ctx0, x[0], x[1]));

                check_gradient("reshape", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
            }
        }

        // reshape (nd->1d)
        {
            srand(seed);
            const int nargs = 1;

            for (int ndims = 1; ndims <= 2; ++ndims) {
                int64_t ne2[4];
                ne2[0] = 1;
                ne2[1] = 1;
                ne2[2] = 1;
                ne2[3] = 1;
                for (int i = 0; i < ndims; ++i) {
                    ne2[0] *= ne[i];
                }
                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                x[1] = get_random_tensor_f32(ctx0, 1, ne2, -1.0f, 1.0f);
                ggml_set_param(ctx0, x[0]);

                struct ggml_tensor * f = ggml_sum(ctx0, ggml_reshape(ctx0, x[0], x[1]));

                check_gradient("reshape", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
            }
        }
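
        // The acc/set/view tests below compute their offsets in bytes: an element
        // index is scaled by ggml_element_size() (or sizeof(float)) or by the nb[]
        // byte strides of x[0] before it is passed to the op.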
        // acc 1d
        {
            srand(seed);
            int64_t ne2[4] = { 1, 1, 1, 1 };

            const int nargs = 2;
            for (int ndims = 1; ndims <= 4; ++ndims) {
                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                ggml_set_param(ctx0, x[0]);

                get_random_dims(ne2, 1);
                while ((ne2[0] > ne[0]) || (ne2[0] > ggml_nelements(x[0]))) {
                    get_random_dims(ne2, 1);
                }

                x[1] = get_random_tensor_f32(ctx0, 1, ne2, -1.0f, 1.0f);
                ggml_set_param(ctx0, x[1]);

                const int max_offset = MAX(0, ggml_nelements(x[0]) - ggml_nelements(x[1]));
                const int offset = irand(max_offset) * ggml_element_size(x[0]);

                struct ggml_tensor * f = ggml_sum(ctx0, ggml_acc(ctx0, x[0], x[1], x[0]->nb[1], x[0]->nb[2], x[0]->nb[3], offset));

                check_gradient("acc 1d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
            }
        }

        // acc 2d
        {
            srand(seed);
            int64_t ne2[4] = { 1, 1, 1, 1 };
            int64_t max_offsets[4] = { 0, 0, 0, 0 };
            int64_t offsets[4] = { 0, 0, 0, 0 };

            const int nargs = 2;
            for (int ndims = 2; ndims <= 4; ++ndims) {
                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                ggml_set_param(ctx0, x[0]);

                get_random_dims(ne2, 2);
                while ((ne2[0] > ne[0]) || (ne2[1] > ne[1]) || (ne2[0]*ne2[1] > ggml_nelements(x[0]))) {
                    get_random_dims(ne2, 2);
                }

                x[1] = get_random_tensor_f32(ctx0, 2, ne2, -1.0f, 1.0f);
                ggml_set_param(ctx0, x[1]);

                max_offsets[0] = MAX(0, x[0]->ne[0] - x[1]->ne[0]);
                max_offsets[1] = MAX(0, x[0]->ne[1] - x[1]->ne[1]);
                offsets[0] = irand(max_offsets[0]) * x[0]->nb[0];
                offsets[1] = irand(max_offsets[1]) * x[0]->nb[1];
                const int offset = offsets[0] + offsets[1];

                struct ggml_tensor * f = ggml_sum(ctx0, ggml_acc(ctx0, x[0], x[1], x[0]->nb[1], x[0]->nb[2], x[0]->nb[3], offset));

                check_gradient("acc 2d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
            }
        }

        // acc 3d
        {
            srand(seed);
            int64_t ne2[4] = { 1, 1, 1, 1 };
            int64_t max_offsets[4] = { 0, 0, 0, 0 };
            int64_t offsets[4] = { 0, 0, 0, 0 };

            const int nargs = 2;
            for (int ndims = 3; ndims <= 4; ++ndims) {
                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                ggml_set_param(ctx0, x[0]);

                get_random_dims(ne2, 3);
                while ((ne2[0] > ne[0]) || (ne2[1] > ne[1]) || (ne2[2] > ne[2]) || (ne2[0]*ne2[1]*ne2[2] > ggml_nelements(x[0]))) {
                    get_random_dims(ne2, 3);
                }

                x[1] = get_random_tensor_f32(ctx0, 3, ne2, -1.0f, 1.0f);
                ggml_set_param(ctx0, x[1]);

                max_offsets[0] = MAX(0, x[0]->ne[0] - x[1]->ne[0]);
                max_offsets[1] = MAX(0, x[0]->ne[1] - x[1]->ne[1]);
                max_offsets[2] = MAX(0, x[0]->ne[2] - x[1]->ne[2]);
                offsets[0] = irand(max_offsets[0]) * x[0]->nb[0];
                offsets[1] = irand(max_offsets[1]) * x[0]->nb[1];
                offsets[2] = irand(max_offsets[2]) * x[0]->nb[2];
                const int offset = offsets[0] + offsets[1] + offsets[2];

                struct ggml_tensor * f = ggml_sum(ctx0, ggml_acc(ctx0, x[0], x[1], x[0]->nb[1], x[0]->nb[2], x[0]->nb[3], offset));

                check_gradient("acc 3d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
            }
        }

        // acc 4d
        {
            srand(seed);
            int64_t ne2[4] = { 1, 1, 1, 1 };
            int64_t max_offsets[4] = { 0, 0, 0, 0 };
            int64_t offsets[4] = { 0, 0, 0, 0 };

            const int nargs = 2;
            for (int ndims = 4; ndims <= 4; ++ndims) {
                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                ggml_set_param(ctx0, x[0]);

                get_random_dims(ne2, 4);
                while ((ne2[0] > ne[0]) || (ne2[1] > ne[1]) || (ne2[2] > ne[2]) || (ne2[3] > ne[3]) || (ne2[0]*ne2[1]*ne2[2]*ne2[3] > ggml_nelements(x[0]))) {
                    get_random_dims(ne2, 4);
                }

                x[1] = get_random_tensor_f32(ctx0, 4, ne2, -1.0f, 1.0f);
                ggml_set_param(ctx0, x[1]);

                max_offsets[0] = MAX(0, x[0]->ne[0] - x[1]->ne[0]);
                max_offsets[1] = MAX(0, x[0]->ne[1] - x[1]->ne[1]);
                max_offsets[2] = MAX(0, x[0]->ne[2] - x[1]->ne[2]);
                max_offsets[3] = MAX(0, x[0]->ne[3] - x[1]->ne[3]);
                offsets[0] = irand(max_offsets[0]) * x[0]->nb[0];
                offsets[1] = irand(max_offsets[1]) * x[0]->nb[1];
                offsets[2] = irand(max_offsets[2]) * x[0]->nb[2];
                offsets[3] = irand(max_offsets[3]) * x[0]->nb[3];
                const int offset = offsets[0] + offsets[1] + offsets[2] + offsets[3];

                struct ggml_tensor * f = ggml_sum(ctx0, ggml_acc(ctx0, x[0], x[1], x[0]->nb[1], x[0]->nb[2], x[0]->nb[3], offset));

                check_gradient("acc 4d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
            }
        }
        // set_1d
        {
            srand(seed);
            int64_t ne2[4];

            const int nargs = 2;
            for (int ndims = 1; ndims <= 4; ++ndims) {
                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                ggml_set_param(ctx0, x[0]);

                get_random_dims(ne2, 1);
                while ((ne2[0] > ne[0]) || (ne2[0] > ggml_nelements(x[0]))) {
                    get_random_dims(ne2, 1);
                }

                x[1] = get_random_tensor_f32(ctx0, 1, ne2, -1.0f, 1.0f);
                ggml_set_param(ctx0, x[1]);

                const int max_offset = MAX(0, ggml_nelements(x[0]) - ggml_nelements(x[1]));
                const int offset = irand(max_offset) * ggml_element_size(x[0]);

                struct ggml_tensor * f = ggml_sum(ctx0, ggml_set_1d(ctx0, x[0], x[1], offset));

                check_gradient("set_1d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
            }
        }

        // set_2d
        {
            srand(seed);
            int64_t ne2[4];
            int64_t max_offsets[4] = { 0, 0, 0, 0 };
            int64_t offsets[4] = { 0, 0, 0, 0 };

            const int nargs = 1;
            for (int ndims = 2; ndims <= 4; ++ndims) {
                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                ggml_set_param(ctx0, x[0]);

                get_random_dims(ne2, 2);
                while ((ne2[0] > ne[0]) || (ne2[1] > ne[1]) || (ne2[0]*ne2[1] > ggml_nelements(x[0]))) {
                    get_random_dims(ne2, 2);
                }

                x[1] = get_random_tensor_f32(ctx0, 2, ne2, -1.0f, 1.0f);
                ggml_set_param(ctx0, x[1]);

                max_offsets[0] = MAX(0, x[0]->ne[0] - x[1]->ne[0]);
                max_offsets[1] = MAX(0, x[0]->ne[1] - x[1]->ne[1]);
                offsets[0] = irand(max_offsets[0]) * x[0]->nb[0];
                offsets[1] = irand(max_offsets[1]) * x[0]->nb[1];
                const int offset = offsets[0] + offsets[1];

                struct ggml_tensor * f = ggml_sum(ctx0, ggml_set_2d(ctx0, x[0], x[1], x[1]->nb[1], offset));

                check_gradient("set_2d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
            }
        }

        // view_1d
        {
            srand(seed);
            const int nargs = 1;
            for (int ndims = 1; ndims <= 4; ++ndims) {
                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);

                ggml_set_param(ctx0, x[0]);

                const int k0 = irand(ggml_nelements(x[0]));
                const int k1 = irand(ggml_nelements(x[0]));
                const int i0 = MIN(k0, k1);
                const int i1 = MAX(k0, k1);

                const int offset = i0 * sizeof(float);
                const int nelem = i1 - i0;

                struct ggml_tensor * f = ggml_sum(ctx0, ggml_view_1d(ctx0, x[0], nelem, offset));

                check_gradient("view_1d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
            }
        }

        // view_2d
        {
            srand(seed);
            int64_t ne2[4];
            int64_t nb2[4];

            const int nargs = 1;
            for (int ndims = 1; ndims <= 4; ++ndims) {
                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);

                get_random_dims(ne2, 2);
                while (ne2[0]*ne2[1] > ggml_nelements(x[0])) {
                    get_random_dims(ne2, 2);
                }
                const int count = ne2[0]*ne2[1];

                nb2[0] = sizeof(float);
                nb2[1] = nb2[0]*ne2[0];

                ggml_set_param(ctx0, x[0]);

                const int max_offset = ggml_nelements(x[0]) - count;
                const int offset = irand(max_offset+1) * sizeof(float);

                struct ggml_tensor * f = ggml_sum(ctx0, ggml_view_2d(ctx0, x[0], ne2[0], ne2[1], nb2[1], offset));

                check_gradient("view_2d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
            }
        }

        // view_3d
        {
            srand(seed);
            int64_t ne2[4] = {1,1,1,1};
            int64_t nb2[4] = {0,0,0,0};

            const int nargs = 1;
            for (int ndims = 1; ndims <= 4; ++ndims) {
                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);

                get_random_dims(ne2, 3);
                while (ne2[0]*ne2[1]*ne2[2] > ggml_nelements(x[0])) {
                    get_random_dims(ne2, 3);
                }
                const int count = ne2[0]*ne2[1]*ne2[2];

                nb2[0] = sizeof(float);
                nb2[1] = nb2[0]*ne2[0];
                nb2[2] = nb2[1]*ne2[1];

                ggml_set_param(ctx0, x[0]);

                const int max_offset = ggml_nelements(x[0]) - count;
                const int offset = irand(max_offset+1) * sizeof(float);

                struct ggml_tensor * f = ggml_sum(ctx0, ggml_view_3d(ctx0, x[0], ne2[0], ne2[1], ne2[2], nb2[1], nb2[2], offset));

                check_gradient("view_3d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
            }
        }
        // permute
        {
            srand(seed);
            int64_t ne2[4];

            const int nargs = 1;
            for (int ndims = 1; ndims <= 4; ++ndims)
            {
                // ggml_permute will set axes of dimensions below n_dims to 1.
                // to make ggml_permute work correctly on all axes,
                // the input tensor must have n_dims of 4.
                for (int i=0; i<ndims; ++i) {
                    ne2[i] = ne[i];
                }
                for (int i=ndims; i<4; ++i) {
                    ne2[i] = 1;
                }
                x[0] = get_random_tensor_f32(ctx0, 4, ne2, -1.0f, 1.0f);
                ggml_set_param(ctx0, x[0]);

                const int p = irand(NUM_PERMUTATIONS);
                const int ax0 = all_permutations[p*4+0];
                const int ax1 = all_permutations[p*4+1];
                const int ax2 = all_permutations[p*4+2];
                const int ax3 = all_permutations[p*4+3];

                // sum requires contiguous tensor rows
                struct ggml_tensor * f = ggml_sum(ctx0, ggml_cont(ctx0, ggml_permute(ctx0, x[0], ax0, ax1, ax2, ax3)));

                check_gradient("permute", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
            }
        }

        // transpose
        {
            srand(seed);
            int64_t ne2[4];

            const int nargs = 1;
            for (int ndims = 1; ndims <= 4; ++ndims)
            {
                // ggml_transpose will set axes of dimensions below n_dims to 1.
                // to make ggml_transpose work correctly on all axes,
                // the input tensor must have n_dims of 4.
                for (int i=0; i<ndims; ++i) {
                    ne2[i] = ne[i];
                }
                for (int i=ndims; i<4; ++i) {
                    ne2[i] = 1;
                }
                x[0] = get_random_tensor_f32(ctx0, 4, ne2, -1.0f, 1.0f);
                ggml_set_param(ctx0, x[0]);

                // sum requires contiguous tensor rows
                struct ggml_tensor * f = ggml_sum(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, x[0])));

                check_gradient("transpose", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
            }
        }

        // get_rows
        {
            srand(seed);
            int64_t ne2[4] = {ne[0], ne[1], 1, 1};
            int64_t ne3[4] = {1+irand(ne[1]), 1, 1, 1};
            const int nargs = 1;
            const int ndims = 2;

            x[0] = get_random_tensor_f32(ctx0, ndims, ne2, -1.0f, 1.0f);
            x[1] = get_random_tensor_i32(ctx0, 1, ne3, 0, ne2[1]);

            ggml_set_param(ctx0, x[0]);

            struct ggml_tensor * f = ggml_sum(ctx0, ggml_get_rows(ctx0, x[0], x[1]));

            check_gradient("get_rows", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
        }

        // diag_mask_inf
        {
            srand(seed);
            const int nargs = 1;
            const int ndims = 2;

            x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
            ggml_set_param(ctx0, x[0]);

            int n_past = irand(ne[0]);

            struct ggml_tensor * f = ggml_sum(ctx0, ggml_diag_mask_inf(ctx0, x[0], n_past));

            check_gradient("diag_mask_inf", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
        }

        // diag_mask_zero
        {
            srand(seed);
            const int nargs = 1;
            const int ndims = 2;

            x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
            ggml_set_param(ctx0, x[0]);

            int n_past = irand(ne[0]);

            struct ggml_tensor * f = ggml_sum(ctx0, ggml_diag_mask_zero(ctx0, x[0], n_past));

            check_gradient("diag_mask_zero", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
        }
        // softmax
        {
            srand(seed);
            const int nargs = 1;

            int64_t ne2[4];
            get_random_dims(ne2, 4);

            for (int ndims = 1; ndims <= 3; ++ndims) {
                x[0] = get_random_tensor_f32(ctx0, ndims, ne2, -1.0f, 1.0f);
                ggml_set_param(ctx0, x[0]);

                float eps = 1e-6f;
                // don't use only sum as aggregation, because the sum of softmax is always 1 -> finite differences would not work
                // instead use sum(log(soft_max()*(1-eps)+eps)); use eps to avoid log(0)
                struct ggml_tensor * f = ggml_sum(ctx0,
                                            ggml_log(ctx0,
                                                ggml_add1(ctx0,
                                                    ggml_scale(ctx0,
                                                        ggml_soft_max(ctx0, x[0]),
                                                        ggml_new_f32(ctx0, 1.0f - eps)),
                                                    ggml_new_f32(ctx0, eps))));

                check_gradient("softmax", ctx0, x, f, ndims, nargs, 1e-3f, 2e-1f, INFINITY);
                // NOTE: softmax forward is computed using f16 table lookup instead of using actual expf, but backward assumes actual expf.
                // this may result in gradients that differ from finite differences.
                // when this test reports errors, first try to replace the table lookup with actual expf and test again to see if just that was the cause.
                // if only the table lookup causes gradients to differ this is acceptable.
            }
        }

        // cross_entropy_loss
        {
            srand(seed);
            const int nargs = 1;

            int64_t ne2[4];
            get_random_dims(ne2, 4);

            for (int ndims = 1; ndims <= 4; ++ndims) {
                x[0] = get_random_tensor_f32(ctx0, ndims, ne2, -0.1f, 0.1f);
                x[1] = get_random_tensor_f32(ctx0, ndims, ne2, 0.0f, 1.0f);
                // the second argument to cross_entropy_loss must sum up to 1 for each row
                int nr = ggml_nrows(x[1]);
                int nc = ggml_nelements(x[1]) / nr;
                for (int ir = 0; ir < nr; ++ir) {
                    float sum = 0;
                    for (int ic = 0; ic < nc; ++ic) {
                        sum += ((float *) x[1]->data)[ic + ir*nc];
                    }
                    for (int ic = 0; ic < nc; ++ic) {
                        ((float *) x[1]->data)[ic + ir*nc] /= sum;
                    }
                }
                ggml_set_param(ctx0, x[0]);

                struct ggml_tensor * f = ggml_cross_entropy_loss(ctx0, x[0], x[1]);

                check_gradient("cross_entropy_loss", ctx0, x, f, ndims, nargs, 1e-4f, 1e-3f, INFINITY);
            }
        }
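
        // The two rope blocks below round ne2[0] up to an even value before using it
        // as n_rot: RoPE rotates pairs of elements, so the rotated dimension count is
        // presumably expected to be even.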
        // rope f32
        {
            srand(seed);
            const int nargs = 1;

            int64_t ne2[4];
            get_random_dims(ne2, 4);
            ne2[0] += ne2[0] % 2;
            int n_rot = ne2[0];

            for (int ndims = 3; ndims <= 4; ++ndims) {
                for (int mode = 0; mode < 4; ++mode) {
                    for (int n_past = 1; n_past < ne2[2]; ++n_past) {
                        x[0] = get_random_tensor_f32(ctx0, ndims, ne2, -1.0f, 1.0f);

                        struct ggml_tensor * p = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ne2[2]);
                        for (int i = 0; i < ne2[2]; ++i) {
                            ((int32_t *) p->data)[i] = n_past + i;
                        }

                        ggml_set_param(ctx0, x[0]);

                        const bool skip_past = (mode & 1);
                        if (skip_past) {
                            // we have no past, so this would have to work on uninitialized memory.
                            // we only test the gradients here;
                            // skip_past should have no influence on gradient computation.
                            // so when other modes work, we assume that this does as well.
                            continue;
                        }

                        struct ggml_tensor * f = ggml_sum(ctx0, ggml_rope(ctx0, x[0], p, n_rot, mode, 0));

                        GGML_PRINT_DEBUG("rope f32: n_past: %d n_rot: %d mode: %d\n", n_past, n_rot, mode);
                        check_gradient("rope f32", ctx0, x, f, ndims, nargs, 1e-2f, 1e-3f, INFINITY);
                    }
                }
            }
        }

        // rope f16
        {
            srand(seed);
            const int nargs = 1;

            int64_t ne2[4];
            get_random_dims(ne2, 4);
            ne2[0] += ne2[0] % 2;
            int n_rot = ne2[0];

            for (int ndims = 3; ndims <= 4; ++ndims) {
                for (int mode = 0; mode < 4; ++mode) {
                    for (int n_past = 1; n_past < ne2[2]; ++n_past) {
                        x[0] = get_random_tensor_f16(ctx0, ndims, ne2, -1.0f, 1.0f);

                        struct ggml_tensor * p = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ne2[2]);
                        for (int i = 0; i < ne2[2]; ++i) {
                            ((int32_t *) p->data)[i] = n_past + i;
                        }

                        ggml_set_param(ctx0, x[0]);

                        const bool skip_past = (mode & 1);
                        if (skip_past) {
                            // we have no past, so this would have to work on uninitialized memory.
                            // we only test the gradients here;
                            // skip_past should have no influence on gradient computation.
                            // so when other modes work, we assume that this does as well.
                            continue;
                        }

                        struct ggml_tensor * f = ggml_sum(ctx0, ggml_rope(ctx0, x[0], p, n_rot, mode, 0));

                        GGML_PRINT_DEBUG("rope f16: n_past: %d n_rot: %d mode: %d\n", n_past, n_rot, mode);
                        check_gradient("rope f16", ctx0, x, f, ndims, nargs, 1e-1f, 1e-1f, INFINITY);
                    }
                }
            }
        }
        // flash_attn f32
        {
            srand(seed);
            const int nargs = 3;

            int64_t ne2[4];

            get_random_dims(ne2, 4);
            int64_t D = ne2[0];
            int64_t N = ne2[1];
            int64_t M = ne2[2] + N;
            int64_t B = ne2[3];

            for (int masked = 0; masked <= 1; ++masked) {
                for (int ndims = 2; ndims <= 4; ++ndims) {
                    int max_nrep = (ndims >= 3) ? 2 : 1;
                    for (int nrep = 1; nrep < max_nrep; ++nrep) {
                        int64_t neq[4] = { D, N, B*nrep, ne[3] };
                        int64_t nek[4] = { D, M, B, ne[3] };
                        int64_t nev[4] = { M, D, B, ne[3] };
                        if (ndims == 2) {
                            neq[2] = 1; neq[3] = 1;
                            nek[2] = 1; nek[3] = 1;
                            nev[2] = 1; nev[3] = 1;
                        } else if (ndims == 3) {
                            neq[3] = 1;
                            nek[3] = 1;
                            nev[3] = 1;
                        }
                        x[0] = get_random_tensor_f32(ctx0, ndims, neq, -0.1250f, 0.1250f);
                        x[1] = get_random_tensor_f32(ctx0, ndims, nek, -0.1250f, 0.1250f);
                        x[2] = get_random_tensor_f32(ctx0, ndims, nev, -0.1250f, 0.1250f);
                        ggml_set_param(ctx0, x[0]);
                        ggml_set_param(ctx0, x[1]);
                        ggml_set_param(ctx0, x[2]);

                        struct ggml_tensor * f = ggml_sum(ctx0, ggml_flash_attn(ctx0, x[0], x[1], x[2], (masked == 0)));

                        check_gradient("flash_attn f32", ctx0, x, f, ndims, nargs, 1.5e-4f, 1e-3f, INFINITY);
                    }
                }
            }
        }

        // flash_attn f16, not yet fully implemented
        if(0)
        {
            srand(seed);
            const int nargs = 3;

            int64_t ne2[4];

            get_random_dims(ne2, 4);
            int64_t D = ne2[0];
            int64_t N = ne2[1];
            int64_t M = ne2[2] + N;
            int64_t B = ne2[3];

            for (int masked = 0; masked <= 1; ++masked) {
                for (int ndims = 2; ndims <= 4; ++ndims) {
                    int64_t neq[4] = { D, N, B, ne[3] };
                    int64_t nek[4] = { D, M, B, ne[3] };
                    int64_t nev[4] = { M, D, B, ne[3] };
                    if (ndims == 2) {
                        neq[2] = 1; neq[3] = 1;
                        nek[2] = 1; nek[3] = 1;
                        nev[2] = 1; nev[3] = 1;
                    } else if (ndims == 3) {
                        neq[3] = 1;
                        nek[3] = 1;
                        nev[3] = 1;
                    }
                    x[0] = get_random_tensor_f16(ctx0, ndims, neq, -0.1250f, 0.1250f);
                    x[1] = get_random_tensor_f16(ctx0, ndims, nek, -0.1250f, 0.1250f);
                    x[2] = get_random_tensor_f16(ctx0, ndims, nev, -0.1250f, 0.1250f);
                    ggml_set_param(ctx0, x[0]);
                    ggml_set_param(ctx0, x[1]);
                    ggml_set_param(ctx0, x[2]);

                    struct ggml_tensor * f = ggml_sum(ctx0, ggml_flash_attn(ctx0, x[0], x[1], x[2], (masked == 0)));

                    check_gradient("flash_attn f16", ctx0, x, f, ndims, nargs, 1.5e-4f, 1e-3f, INFINITY);
                }
            }
        }

        ggml_free(ctx0);
    }

    return 0;
}