// test-grad0.cpp

  1. #define _CRT_SECURE_NO_DEPRECATE // Disables ridiculous "unsafe" warnings on Windows
  2. #include "ggml.h"
  3. #include <cmath>
  4. #include <cstdio>
  5. #include <cstdlib>
  6. #include <cassert>
  7. #if defined(_MSC_VER)
  8. #pragma warning(disable: 4244 4267) // possible loss of data
  9. #endif
  10. #if defined(__GNUC__)
  11. #pragma GCC diagnostic ignored "-Wdouble-promotion"
  12. #endif
  13. #define MAX_NARGS 3
  14. #undef MIN
  15. #undef MAX
  16. #define MIN(a, b) ((a) < (b) ? (a) : (b))
  17. #define MAX(a, b) ((a) > (b) ? (a) : (b))
  18. #define GGML_SILU_FP16
  19. //
  20. // logging
  21. //
  22. #if (GGML_DEBUG >= 1)
  23. #define GGML_PRINT_DEBUG(...) printf(__VA_ARGS__)
  24. #else
  25. #define GGML_PRINT_DEBUG(...)
  26. #endif
  27. #if (GGML_DEBUG >= 5)
  28. #define GGML_PRINT_DEBUG_5(...) printf(__VA_ARGS__)
  29. #else
  30. #define GGML_PRINT_DEBUG_5(...)
  31. #endif
  32. #if (GGML_DEBUG >= 10)
  33. #define GGML_PRINT_DEBUG_10(...) printf(__VA_ARGS__)
  34. #else
  35. #define GGML_PRINT_DEBUG_10(...)
  36. #endif
  37. #define GGML_PRINT(...) printf(__VA_ARGS__)
  38. static float frand(void) {
  39. return (float)rand()/(float)RAND_MAX;
  40. }
  41. static int irand(int n) {
  42. if (n == 0) return 0;
  43. return rand()%n;
  44. }
  45. static void get_random_dims(int64_t * dims, int ndims) {
  46. dims[0] = dims[1] = dims[2] = dims[3] = 1;
  47. for (int i = 0; i < ndims; i++) {
  48. dims[i] = 1 + irand(4);
  49. }
  50. }
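// NOTE: the helpers above keep every dimension in the range [1, 4], so test tensors stay
// tiny and the per-element finite-difference loop in check_gradient() remains cheap.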
  51. static struct ggml_tensor * get_random_tensor_f32(
  52. struct ggml_context * ctx0,
  53. int ndims,
  54. int64_t ne[],
  55. float fmin,
  56. float fmax) {
  57. struct ggml_tensor * result = ggml_new_tensor(ctx0, GGML_TYPE_F32, ndims, ne);
  58. switch (ndims) {
  59. case 1:
  60. for (int i0 = 0; i0 < ne[0]; i0++) {
  61. ((float *)result->data)[i0] = frand()*(fmax - fmin) + fmin;
  62. }
  63. break;
  64. case 2:
  65. for (int i1 = 0; i1 < ne[1]; i1++) {
  66. for (int i0 = 0; i0 < ne[0]; i0++) {
  67. ((float *)result->data)[i1*ne[0] + i0] = frand()*(fmax - fmin) + fmin;
  68. }
  69. }
  70. break;
  71. case 3:
  72. for (int i2 = 0; i2 < ne[2]; i2++) {
  73. for (int i1 = 0; i1 < ne[1]; i1++) {
  74. for (int i0 = 0; i0 < ne[0]; i0++) {
  75. ((float *)result->data)[i2*ne[1]*ne[0] + i1*ne[0] + i0] = frand()*(fmax - fmin) + fmin;
  76. }
  77. }
  78. }
  79. break;
  80. case 4:
  81. for (int i3 = 0; i3 < ne[3]; i3++) {
  82. for (int i2 = 0; i2 < ne[2]; i2++) {
  83. for (int i1 = 0; i1 < ne[1]; i1++) {
  84. for (int i0 = 0; i0 < ne[0]; i0++) {
  85. ((float *)result->data)[i3*ne[2]*ne[1]*ne[0] + i2*ne[1]*ne[0] + i1*ne[0] + i0] = frand()*(fmax - fmin) + fmin;
  86. }
  87. }
  88. }
  89. }
  90. break;
  91. default:
  92. assert(false);
  93. };
  94. return result;
  95. }
  96. static struct ggml_tensor * get_random_tensor_f16(
  97. struct ggml_context * ctx0,
  98. int ndims,
  99. int64_t ne[],
  100. float fmin,
  101. float fmax) {
  102. struct ggml_tensor * result = ggml_new_tensor(ctx0, GGML_TYPE_F16, ndims, ne);
  103. switch (ndims) {
  104. case 1:
  105. for (int i0 = 0; i0 < ne[0]; i0++) {
  106. ((ggml_fp16_t *)result->data)[i0] = ggml_fp32_to_fp16(frand()*(fmax - fmin) + fmin);
  107. }
  108. break;
  109. case 2:
  110. for (int i1 = 0; i1 < ne[1]; i1++) {
  111. for (int i0 = 0; i0 < ne[0]; i0++) {
  112. ((ggml_fp16_t *)result->data)[i1*ne[0] + i0] = ggml_fp32_to_fp16(frand()*(fmax - fmin) + fmin);
  113. }
  114. }
  115. break;
  116. case 3:
  117. for (int i2 = 0; i2 < ne[2]; i2++) {
  118. for (int i1 = 0; i1 < ne[1]; i1++) {
  119. for (int i0 = 0; i0 < ne[0]; i0++) {
  120. ((ggml_fp16_t *)result->data)[i2*ne[1]*ne[0] + i1*ne[0] + i0] = ggml_fp32_to_fp16(frand()*(fmax - fmin) + fmin);
  121. }
  122. }
  123. }
  124. break;
  125. case 4:
  126. for (int i3 = 0; i3 < ne[3]; i3++) {
  127. for (int i2 = 0; i2 < ne[2]; i2++) {
  128. for (int i1 = 0; i1 < ne[1]; i1++) {
  129. for (int i0 = 0; i0 < ne[0]; i0++) {
  130. ((ggml_fp16_t *)result->data)[i3*ne[2]*ne[1]*ne[0] + i2*ne[1]*ne[0] + i1*ne[0] + i0] = ggml_fp32_to_fp16(frand()*(fmax - fmin) + fmin);
  131. }
  132. }
  133. }
  134. }
  135. break;
  136. default:
  137. assert(false);
  138. };
  139. return result;
  140. }
  141. static struct ggml_tensor * get_random_tensor_i32(
  142. struct ggml_context * ctx0,
  143. int ndims,
  144. int64_t ne[],
  145. int32_t imin,
  146. int32_t imax) {
  147. struct ggml_tensor * result = ggml_new_tensor(ctx0, GGML_TYPE_I32, ndims, ne);
  148. switch (ndims) {
  149. case 1:
  150. for (int i0 = 0; i0 < ne[0]; i0++) {
  151. ((int32_t *)result->data)[i0] = irand(imax - imin) + imin;
  152. }
  153. break;
  154. case 2:
  155. for (int i1 = 0; i1 < ne[1]; i1++) {
  156. for (int i0 = 0; i0 < ne[0]; i0++) {
  157. ((int32_t *)result->data)[i1*ne[0] + i0] = irand(imax - imin) + imin;
  158. }
  159. }
  160. break;
  161. case 3:
  162. for (int i2 = 0; i2 < ne[2]; i2++) {
  163. for (int i1 = 0; i1 < ne[1]; i1++) {
  164. for (int i0 = 0; i0 < ne[0]; i0++) {
  165. ((int32_t *)result->data)[i2*ne[1]*ne[0] + i1*ne[0] + i0] = irand(imax - imin) + imin;
  166. }
  167. }
  168. }
  169. break;
  170. case 4:
  171. for (int i3 = 0; i3 < ne[3]; i3++) {
  172. for (int i2 = 0; i2 < ne[2]; i2++) {
  173. for (int i1 = 0; i1 < ne[1]; i1++) {
  174. for (int i0 = 0; i0 < ne[0]; i0++) {
  175. ((int32_t *)result->data)[i3*ne[2]*ne[1]*ne[0] + i2*ne[1]*ne[0] + i1*ne[0] + i0] = irand(imax - imin) + imin;
  176. }
  177. }
  178. }
  179. }
  180. break;
  181. default:
  182. assert(false);
  183. };
  184. return result;
  185. }
  186. static void print_elements(const char* label, const struct ggml_tensor * t) {
  187. if (!t) {
  188. printf("%s: %s = null\n", __func__, label);
  189. return;
  190. }
  191. const int nelements = ggml_nelements(t);
  192. printf("%s: %s = [", __func__, label);
  193. for (int k = 0; k < nelements; ++k) {
  194. if (k > 0) { printf(", "); }
  195. printf("%.5f", ggml_get_f32_1d(t, k));
  196. }
  197. printf("] shape: [");
  198. for (int k = 0; k < t->n_dims; ++k) {
  199. if (k > 0) { printf(", "); }
  200. printf("%d", (int)t->ne[k]);
  201. }
  202. printf("]\n");
  203. }
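// check_gradient() compares the analytic gradients produced by the backward graph against
// a central finite-difference estimate. For every element k of every parameter x[i] it
// perturbs the value by +/- eps, evaluates the scalar f twice and forms
//
//     g0 = (f(x_k + eps) - f(x_k - eps)) / (2*eps)
//
// which is then checked against g1 = x[i]->grad[k] from the backward graph using the given
// absolute and relative error bounds. The thread count can be overridden with the
// GGML_N_THREADS environment variable.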
  204. static bool check_gradient(
  205. const char * op_name,
  206. struct ggml_context * ctx0,
  207. struct ggml_tensor * x[],
  208. struct ggml_tensor * f,
  209. int ndims,
  210. int nargs,
  211. float eps,
  212. float max_error_abs,
  213. float max_error_rel) {
  214. static int n_threads = -1;
  215. if (n_threads < 0) {
  216. n_threads = GGML_DEFAULT_N_THREADS;
  217. const char *env = getenv("GGML_N_THREADS");
  218. if (env) {
  219. n_threads = atoi(env);
  220. }
  221. printf("GGML_N_THREADS = %d\n", n_threads);
  222. }
  223. struct ggml_cgraph gf = ggml_build_forward (f);
  224. struct ggml_cgraph gb = ggml_build_backward(ctx0, &gf, false);
  225. ggml_graph_compute_with_ctx(ctx0, &gf, n_threads);
  226. ggml_graph_reset (&gf);
  227. ggml_set_f32 (f->grad, 1.0f);
  228. ggml_graph_compute_with_ctx(ctx0, &gb, n_threads);
  229. // ggml_graph_dump_dot(&gf, NULL, "test-grad0-forward.dot");
  230. // ggml_graph_dump_dot(&gb, &gf, "test-grad0-backward.dot");
  231. for (int i = 0; i < nargs; ++i) {
  232. const int nelements = ggml_nelements(x[i]);
  233. for (int k = 0; k < nelements; ++k) {
  234. // compute gradient using finite differences
  235. const float x0 = ggml_get_f32_1d(x[i], k);
  236. const float xm = x0 - eps;
  237. const float xp = x0 + eps;
  238. ggml_set_f32_1d(x[i], k, xp);
  239. ggml_graph_compute_with_ctx(ctx0, &gf, n_threads);
  240. const float f0 = ggml_get_f32_1d(f, 0);
  241. ggml_set_f32_1d(x[i], k, xm);
  242. ggml_graph_compute_with_ctx(ctx0, &gf, n_threads);
  243. const float f1 = ggml_get_f32_1d(f, 0);
  244. const float g0 = (f0 - f1)/(2.0f*eps);
  245. ggml_set_f32_1d(x[i], k, x0);
  246. // compute gradient using backward graph
  247. ggml_graph_reset (&gf);
  248. ggml_set_f32 (f->grad, 1.0f);
  249. ggml_graph_compute_with_ctx(ctx0, &gb, n_threads);
  250. const float g1 = ggml_get_f32_1d(x[i]->grad, k);
  251. const float error_abs = fabsf(g0 - g1);
  252. const float error_rel = g0 != 0 ? fabsf(g0 - g1)/fabsf(g0) : 0;
  253. if (error_abs > max_error_abs || error_rel > max_error_rel) {
  254. printf("%s: ndims=%d, i=%d, k=%d, x0=%f, xm=%f, xp=%f, f0=%f, f1=%f, g0=%f, g1=%f, eps=%f, error_abs=%f, error_rel=%f\n",
  255. op_name, ndims, i, k, x0, xm, xp, f0, f1, g0, g1, eps, error_abs, error_rel);
  256. //assert(false);
  257. return false;
  258. }
  259. }
  260. }
  261. return true;
  262. }
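// check_mat_mul() verifies a ggml_mul_mat() result with a naive triple loop. In ggml's
// layout ne[0] is the contiguous row length, so both operands share the inner dimension
// nk = x0->ne[0] and the expected value is
//
//     y[i][j] = sum_k x0[j*nk + k] * x1[i*nk + k]
//
// i.e. y = x1 * x0^T when both operands are read as [ne[1] x ne[0]] row-major matrices.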
  263. // TODO: clean-up this ..
  264. static bool check_mat_mul(
  265. const struct ggml_tensor * y,
  266. const struct ggml_tensor * x0,
  267. const struct ggml_tensor * x1) {
  268. float * dst = (float *) y->data;
  269. float * src0 = (float *) x0->data;
  270. float * src1 = (float *) x1->data;
  271. const int nc = x0->ne[1];
  272. const int nr = x1->ne[1];
  273. const int nk = x0->ne[0];
  274. GGML_PRINT_DEBUG("check_mat_mul: nc=%d, nr=%d, nk=%d\n", nc, nr, nk);
  275. GGML_PRINT_DEBUG("x0:\n");
  276. for (int j = 0; j < x0->ne[1]; ++j) {
  277. for (int i = 0; i < x0->ne[0]; ++i) {
  278. GGML_PRINT_DEBUG("%6.3f ", src0[j*nk + i]);
  279. }
  280. GGML_PRINT_DEBUG("\n");
  281. }
  282. GGML_PRINT_DEBUG("\n");
  283. GGML_PRINT_DEBUG("x1:\n");
  284. for (int j = 0; j < x1->ne[1]; ++j) {
  285. for (int i = 0; i < x1->ne[0]; ++i) {
  286. GGML_PRINT_DEBUG("%6.3f ", src1[j*nk + i]);
  287. }
  288. GGML_PRINT_DEBUG("\n");
  289. }
  290. GGML_PRINT_DEBUG("\n");
  291. GGML_PRINT_DEBUG("y: n_dims = %d, (%lld, %lld)\n", y->n_dims, y->ne[0], y->ne[1]);
  292. for (int j = 0; j < y->ne[1]; ++j) {
  293. for (int i = 0; i < y->ne[0]; ++i) {
  294. GGML_PRINT_DEBUG("%6.3f ", dst[j*nc + i]); // row stride of y is nc (= y->ne[0])
  295. }
  296. GGML_PRINT_DEBUG("\n");
  297. }
  298. for (int i = 0; i < nr; ++i) {
  299. for (int j = 0; j < nc; ++j) {
  300. float sum = 0.0f;
  301. for (int k = 0; k < nk; ++k) {
  302. sum += src0[j*nk + k]*src1[i*nk + k];
  303. }
  304. if (fabsf(dst[i*nc + j] - sum) > 1e-5f) {
  305. fprintf(stderr, "check_mat_mul: dst[%d] = %f, sum = %f\n", i*nc + j, dst[i*nc + j], sum);
  306. assert(false);
  307. return false;
  308. }
  309. }
  310. }
  311. return true;
  312. }
  313. #define NUM_PERMUTATIONS (4*3*2*1)
  314. int main(int argc, const char ** argv) {
  315. struct ggml_init_params params = {
  316. /* .mem_size = */ 128*1024*1024,
  317. /* .mem_buffer = */ NULL,
  318. /* .no_alloc = */ false,
  319. };
  320. int64_t ne[4];
  321. int all_permutations[4 * NUM_PERMUTATIONS];
  322. {
  323. int count = 0;
  324. for (int ax0=0; ax0<4; ++ax0) {
  325. for (int ax1=0; ax1<4; ++ax1) {
  326. if (ax1 == ax0) continue;
  327. for (int ax2=0; ax2<4; ++ax2) {
  328. if (ax2 == ax0) continue;
  329. if (ax2 == ax1) continue;
  330. for (int ax3=0; ax3<4; ++ax3) {
  331. if (ax3 == ax0) continue;
  332. if (ax3 == ax1) continue;
  333. if (ax3 == ax2) continue;
  334. assert(count < NUM_PERMUTATIONS);
  335. all_permutations[count*4+0] = ax0;
  336. all_permutations[count*4+1] = ax1;
  337. all_permutations[count*4+2] = ax2;
  338. all_permutations[count*4+3] = ax3;
  339. ++count;
  340. }
  341. }
  342. }
  343. }
  344. }
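// all_permutations now holds all 4! = 24 orderings of the axes 0..3;
// the permute test below picks one of them at random for ggml_permute().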
  345. // original loop: 1000
  346. int niter = 4;
  347. const char *env = getenv("GGML_NLOOP");
  348. if (env != NULL) {
  349. niter = atoi(env);
  350. }
  351. if (argc > 1) {
  352. niter = atoi(argv[1]);
  353. }
  354. for (int iter = 0; iter < niter; ++iter) {
  355. printf("test-grad0: iter:%d/%d\n", iter, niter);
  356. struct ggml_context * ctx0 = ggml_init(params);
  357. get_random_dims(ne, 4);
  358. struct ggml_tensor * x[MAX_NARGS];
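// every op test below follows the same pattern: build f = ggml_sum(op(...)) so that f is a
// scalar, mark the inputs with ggml_set_param(), and let check_gradient() compare the
// backward-graph gradients of f against finite differences.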
  359. // add f32
  360. {
  361. const int nargs = 2;
  362. for (int ndims = 1; ndims <= 4; ++ndims) {
  363. for (int i = 0; i < nargs; ++i) {
  364. x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
  365. ggml_set_param(ctx0, x[i]);
  366. }
  367. struct ggml_tensor * f = ggml_sum(ctx0, ggml_add(ctx0, x[0], x[1]));
  368. check_gradient("add f32", ctx0, x, f, ndims, nargs, 1e-3f, 2e-3f, 2e-3f);
  369. }
  370. }
  371. // add f16
  372. {
  373. const int nargs = 2;
  374. for (int ndims = 1; ndims <= 4; ++ndims) {
  375. for (int i = 0; i < nargs; ++i) {
  376. x[i] = get_random_tensor_f16(ctx0, ndims, ne, -1.0f, 1.0f);
  377. ggml_set_param(ctx0, x[i]);
  378. }
  379. struct ggml_tensor * f = ggml_sum(ctx0, ggml_add(ctx0, x[0], x[1]));
  380. check_gradient("add f16", ctx0, x, f, ndims, nargs, 1e-1f, 2e-1f, 2e-1f);
  381. }
  382. }
  383. // sub
  384. {
  385. const int nargs = 2;
  386. for (int ndims = 1; ndims <= 4; ++ndims) {
  387. for (int i = 0; i < nargs; ++i) {
  388. x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
  389. ggml_set_param(ctx0, x[i]);
  390. }
  391. struct ggml_tensor * f = ggml_sum(ctx0, ggml_sub(ctx0, x[0], x[1]));
  392. check_gradient("sub", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f);
  393. }
  394. }
  395. // mul
  396. {
  397. const int nargs = 2;
  398. for (int ndims = 1; ndims <= 4; ++ndims) {
  399. for (int i = 0; i < nargs; ++i) {
  400. x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
  401. ggml_set_param(ctx0, x[i]);
  402. }
  403. struct ggml_tensor * f = ggml_sum(ctx0, ggml_mul(ctx0, x[0], x[1]));
  404. check_gradient("mul", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
  405. }
  406. }
  407. // div
  408. {
  409. const int nargs = 2;
  410. for (int ndims = 1; ndims <= 4; ++ndims) {
  411. for (int i = 0; i < nargs; ++i) {
  412. x[i] = get_random_tensor_f32(ctx0, ndims, ne, 0.5f, 1.0f);
  413. ggml_set_param(ctx0, x[i]);
  414. }
  415. struct ggml_tensor * f = ggml_sum(ctx0, ggml_div(ctx0, x[0], x[1]));
  416. check_gradient("div", ctx0, x, f, ndims, nargs, 1e-3f, 1e-1f, 1e-1f);
  417. }
  418. }
  419. // sqr
  420. {
  421. const int nargs = 1;
  422. for (int ndims = 1; ndims <= 2; ++ndims) {
  423. for (int i = 0; i < nargs; ++i) {
  424. x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
  425. ggml_set_param(ctx0, x[i]);
  426. }
  427. struct ggml_tensor * f = ggml_sum(ctx0, ggml_sqr(ctx0, x[0]));
  428. check_gradient("sqr", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
  429. }
  430. }
  431. // sqrt
  432. {
  433. const int nargs = 1;
  434. for (int ndims = 1; ndims <= 2; ++ndims) {
  435. for (int i = 0; i < nargs; ++i) {
  436. x[i] = get_random_tensor_f32(ctx0, ndims, ne, 2.0f*1e-3f, 1.0f);
  437. ggml_set_param(ctx0, x[i]);
  438. }
  439. struct ggml_tensor * f = ggml_sum(ctx0, ggml_sqrt(ctx0, x[0]));
  440. check_gradient("sqrt", ctx0, x, f, ndims, nargs, 1e-3f, INFINITY, 1e-1f);
  441. }
  442. }
  443. // log
  444. {
  445. const int nargs = 1;
  446. for (int ndims = 1; ndims <= 2; ++ndims) {
  447. for (int i = 0; i < nargs; ++i) {
  448. x[i] = get_random_tensor_f32(ctx0, ndims, ne, 2.0f*1e-3f, 1.0f);
  449. ggml_set_param(ctx0, x[i]);
  450. }
  451. struct ggml_tensor * f = ggml_sum(ctx0, ggml_log(ctx0, x[0]));
  452. check_gradient("log", ctx0, x, f, ndims, nargs, 1e-3f, INFINITY, 1e-1f);
  453. }
  454. }
  455. // sum
  456. {
  457. const int nargs = 1;
  458. for (int ndims = 1; ndims <= 2; ++ndims) {
  459. for (int i = 0; i < nargs; ++i) {
  460. x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
  461. ggml_set_param(ctx0, x[i]);
  462. }
  463. struct ggml_tensor * f = ggml_sum(ctx0, x[0]);
  464. check_gradient("sum", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f);
  465. }
  466. }
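// for sum_rows (and the repeat tests) the loss is wrapped in ggml_sqr(): the gradient of a
// plain sum would be constant (all ones), so squaring makes the gradient depend on the
// actual values and gives the finite-difference check something non-trivial to verify.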
  467. // sum_rows
  468. {
  469. const int nargs = 1;
  470. for (int ndims = 1; ndims <= 4; ++ndims) {
  471. for (int i = 0; i < nargs; ++i) {
  472. x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
  473. ggml_set_param(ctx0, x[i]);
  474. }
  475. struct ggml_tensor * f = ggml_sum(ctx0, ggml_sqr(ctx0, ggml_sum_rows(ctx0, x[0])));
  476. check_gradient("sum_rows", ctx0, x, f, ndims, nargs, 1e-3f, 1e-2f, INFINITY);
  477. }
  478. }
  479. // mean, not yet fully implemented
  480. if(0)
  481. {
  482. const int nargs = 1;
  483. for (int ndims = 1; ndims <= 4; ++ndims) {
  484. for (int i = 0; i < nargs; ++i) {
  485. x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
  486. ggml_set_param(ctx0, x[i]);
  487. }
  488. struct ggml_tensor * f = ggml_sum(ctx0, ggml_mean(ctx0, x[0]));
  489. check_gradient("mean", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f);
  490. }
  491. }
  492. // argmax
  493. if (0)
  494. {
  495. const int nargs = 1;
  496. for (int ndims = 1; ndims <= 4; ++ndims) {
  497. for (int i = 0; i < nargs; ++i) {
  498. x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
  499. ggml_set_param(ctx0, x[i]);
  500. }
  501. struct ggml_tensor * f = ggml_sum(ctx0, ggml_argmax(ctx0, x[0]));
  502. check_gradient("argmax", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f);
  503. }
  504. }
  505. // repeat
  506. {
  507. int64_t ne2[4];
  508. get_random_dims(ne2, 4);
  509. ne2[0] = ne[0] * ne2[0];
  510. ne2[1] = ne[1] * ne2[1];
  511. ne2[2] = 1;
  512. ne2[3] = 1;
  513. const int nargs = 1;
  514. for (int ndims = 1; ndims <= 2; ++ndims) {
  515. x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
  516. x[1] = get_random_tensor_f32(ctx0, ndims, ne2, -1.0f, 1.0f);
  517. ggml_set_param(ctx0, x[0]);
  518. struct ggml_tensor * f = ggml_sum(ctx0, ggml_sqr(ctx0, ggml_sub(ctx0, x[1], ggml_repeat(ctx0, x[0], x[1]))));
  519. check_gradient("repeat", ctx0, x, f, ndims, nargs, 1e-3f, 1e-2f, INFINITY);
  520. }
  521. }
  522. // repeat back
  523. {
  524. int64_t ne2[4];
  525. get_random_dims(ne2, 4);
  526. ne2[0] = ne[0] * ne2[0];
  527. ne2[1] = ne[1] * ne2[1];
  528. ne2[2] = 1;
  529. ne2[3] = 1;
  530. const int nargs = 1;
  531. for (int ndims = 1; ndims <= 2; ++ndims) {
  532. x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
  533. x[1] = get_random_tensor_f32(ctx0, ndims, ne2, -1.0f, 1.0f);
  534. ggml_set_param(ctx0, x[0]);
  535. struct ggml_tensor * f = ggml_sum(ctx0, ggml_sqr(ctx0, ggml_sub(ctx0, x[0], ggml_repeat_back(ctx0, x[1], x[0]))));
  536. check_gradient("repeat back", ctx0, x, f, ndims, nargs, 1e-3f, 1e-2f, INFINITY);
  537. }
  538. }
  539. // abs (finite differences do not work)
  540. //{
  541. // const int nargs = 1;
  542. // for (int ndims = 1; ndims <= 2; ++ndims) {
  543. // for (int i = 0; i < nargs; ++i) {
  544. // x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
  545. // ggml_set_param(ctx0, x[i]);
  546. // }
  547. // struct ggml_tensor * f = ggml_sum(ctx0, ggml_abs(ctx0, x[0]));
  548. // check_gradient("abs", ctx0, x, f, ndims, nargs, 1e-3f, INFINITY, 1e-3f);
  549. // }
  550. //}
  551. // sgn
  552. {
  553. const int nargs = 1;
  554. for (int ndims = 1; ndims <= 4; ++ndims) {
  555. for (int i = 0; i < nargs; ++i) {
  556. x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
  557. ggml_set_param(ctx0, x[i]);
  558. }
  559. struct ggml_tensor* f = ggml_sum(ctx0, ggml_sgn(ctx0, x[0]));
  560. check_gradient("sgn", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f);
  561. }
  562. }
  563. // neg
  564. {
  565. const int nargs = 1;
  566. for (int ndims = 1; ndims <= 4; ++ndims) {
  567. for (int i = 0; i < nargs; ++i) {
  568. x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
  569. ggml_set_param(ctx0, x[i]);
  570. }
  571. struct ggml_tensor* f = ggml_sum(ctx0, ggml_neg(ctx0, x[0]));
  572. check_gradient("neg", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f);
  573. }
  574. }
  575. // step
  576. {
  577. const int nargs = 1;
  578. for (int ndims = 1; ndims <= 4; ++ndims) {
  579. for (int i = 0; i < nargs; ++i) {
  580. x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
  581. ggml_set_param(ctx0, x[i]);
  582. }
  583. struct ggml_tensor* f = ggml_sum(ctx0, ggml_step(ctx0, x[0]));
  584. check_gradient("step", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f);
  585. }
  586. }
  587. // tanh, not yet fully implemented
  588. if(0)
  589. {
  590. const int nargs = 1;
  591. for (int ndims = 1; ndims <= 4; ++ndims) {
  592. for (int i = 0; i < nargs; ++i) {
  593. x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
  594. ggml_set_param(ctx0, x[i]);
  595. }
  596. struct ggml_tensor* f = ggml_sum(ctx0, ggml_tanh(ctx0, x[0]));
  597. check_gradient("tanh", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f);
  598. }
  599. }
  600. // mul_mat
  601. {
  602. const int nargs = 2;
  603. for (int ndims = 2; ndims <= 2; ++ndims) {
  604. x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
  605. {
  606. int64_t ne2[4];
  607. get_random_dims(ne2, 4);
  608. ne2[0] = ne[0];
  609. x[1] = get_random_tensor_f32(ctx0, ndims, ne2, -1.0f, 1.0f);
  610. }
  611. ggml_set_param(ctx0, x[0]);
  612. ggml_set_param(ctx0, x[1]);
  613. struct ggml_tensor * m = ggml_mul_mat(ctx0, x[1], x[0]);
  614. struct ggml_tensor * f = ggml_sum(ctx0, m);
  615. GGML_PRINT_DEBUG("testing: mul_mat, [%lld, %lld] (%d) * [%lld, %lld] (%d)\n", x[1]->ne[0], x[1]->ne[1], x[1]->n_dims, x[0]->ne[0], x[0]->ne[1], x[0]->n_dims);
  616. check_gradient("mul_mat", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
  617. check_mat_mul(m, x[1], x[0]);
  618. }
  619. }
  620. // elu, not yet fully implemented
  621. if(0)
  622. {
  623. const int nargs = 1;
  624. for (int ndims = 1; ndims <= 4; ++ndims) {
  625. for (int i = 0; i < nargs; ++i) {
  626. x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
  627. ggml_set_param(ctx0, x[i]);
  628. }
  629. struct ggml_tensor* f = ggml_sum(ctx0, ggml_elu(ctx0, x[0]));
  630. check_gradient("elu", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f);
  631. }
  632. }
  633. // relu
  634. {
  635. const int nargs = 1;
  636. for (int ndims = 1; ndims <= 4; ++ndims) {
  637. for (int i = 0; i < nargs; ++i) {
  638. x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
  639. ggml_set_param(ctx0, x[i]);
  640. }
  641. struct ggml_tensor* f = ggml_sum(ctx0, ggml_relu(ctx0, x[0]));
  642. check_gradient("relu", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
  643. }
  644. }
  645. // gelu, not yet fully implemented
  646. if(0)
  647. {
  648. const int nargs = 1;
  649. for (int ndims = 1; ndims <= 4; ++ndims) {
  650. for (int i = 0; i < nargs; ++i) {
  651. x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
  652. ggml_set_param(ctx0, x[i]);
  653. }
  654. struct ggml_tensor* f = ggml_sum(ctx0, ggml_gelu(ctx0, x[0]));
  655. check_gradient("gelu", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f);
  656. }
  657. }
  658. // silu
  659. {
  660. const int nargs = 1;
  661. for (int ndims = 1; ndims <= 2; ++ndims) {
  662. for (int i = 0; i < nargs; ++i) {
  663. x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
  664. ggml_set_param(ctx0, x[i]);
  665. }
  666. struct ggml_tensor * f = ggml_sum(ctx0, ggml_silu(ctx0, x[0]));
  667. #ifdef GGML_SILU_FP16
  668. // due to GGML_SILU_FP16 the finite difference method will be slightly wrong -> increase error bounds.
  669. check_gradient("silu", ctx0, x, f, ndims, nargs, 1e-3f, 0.5, INFINITY);
  670. #else
  671. check_gradient("silu", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
  672. #endif
  673. }
  674. }
  675. // rms_norm
  676. {
  677. const int nargs = 1;
  678. for (int ndims = 1; ndims <= 2; ++ndims) {
  679. for (int i = 0; i < nargs; ++i) {
  680. x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
  681. ggml_set_param(ctx0, x[i]);
  682. }
  683. struct ggml_tensor * f = ggml_sum(ctx0, ggml_rms_norm(ctx0, x[0], 1e-6f));
  684. check_gradient("rms_norm", ctx0, x, f, ndims, nargs, 1e-4f, 1.0f, INFINITY);
  685. }
  686. }
  687. // scale
  688. {
  689. const int nargs = 2;
  690. int64_t ne2[4];
  691. ne2[0] = 1;
  692. for (int ndims = 1; ndims <= 2; ++ndims) {
  693. x[1] = get_random_tensor_f32(ctx0, 1, ne2, -1.0f, 1.0f);
  694. x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
  695. ggml_set_param(ctx0, x[0]);
  696. ggml_set_param(ctx0, x[1]);
  697. struct ggml_tensor * f = ggml_sum(ctx0, ggml_scale(ctx0, x[0], x[1]));
  698. check_gradient("scale", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
  699. }
  700. }
  701. // cpy f32
  702. {
  703. const int nargs = 2;
  704. for (int ndims = 1; ndims <= 2; ++ndims) {
  705. for (int i = 0; i < nargs; ++i) {
  706. x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
  707. ggml_set_param(ctx0, x[i]);
  708. }
  709. // x[1] is overwritten by x[0], so the gradients don't propagate to x[1]
  710. struct ggml_tensor * f = ggml_sum(ctx0, ggml_cpy(ctx0, x[0], x[1]));
  711. check_gradient("cpy f32", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
  712. }
  713. }
  714. // cpy f16
  715. {
  716. const int nargs = 2;
  717. for (int ndims = 1; ndims <= 2; ++ndims) {
  718. for (int i = 0; i < nargs; ++i) {
  719. x[i] = get_random_tensor_f16(ctx0, ndims, ne, -1.0f, 1.0f);
  720. ggml_set_param(ctx0, x[i]);
  721. }
  722. // x[1] is overwritten by x[0], so the gradients don't propagate to x[1]
  723. struct ggml_tensor * f = ggml_sum(ctx0, ggml_cpy(ctx0, x[0], x[1]));
  724. check_gradient("cpy f16", ctx0, x, f, ndims, nargs, 1e-1f, 1e-1f, INFINITY);
  725. }
  726. }
  727. // reshape (1d->nd)
  728. {
  729. const int nargs = 1;
  730. for (int ndims = 1; ndims <= 2; ++ndims) {
  731. int64_t ne2[4];
  732. ne2[0] = 1;
  733. ne2[1] = 1;
  734. ne2[2] = 1;
  735. ne2[3] = 1;
  736. for (int i = 0; i < ndims; ++i) {
  737. ne2[0] *= ne[i];
  738. }
  739. x[0] = get_random_tensor_f32(ctx0, 1, ne2, -1.0f, 1.0f);
  740. x[1] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
  741. ggml_set_param(ctx0, x[0]);
  742. struct ggml_tensor * f = ggml_sum(ctx0, ggml_reshape(ctx0, x[0], x[1]));
  743. check_gradient("reshape", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
  744. }
  745. }
  746. // reshape (nd->1d)
  747. {
  748. const int nargs = 1;
  749. for (int ndims = 1; ndims <= 2; ++ndims) {
  750. int64_t ne2[4];
  751. ne2[0] = 1;
  752. ne2[1] = 1;
  753. ne2[2] = 1;
  754. ne2[3] = 1;
  755. for (int i = 0; i < ndims; ++i) {
  756. ne2[0] *= ne[i];
  757. }
  758. x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
  759. x[1] = get_random_tensor_f32(ctx0, 1, ne2, -1.0f, 1.0f);
  760. ggml_set_param(ctx0, x[0]);
  761. struct ggml_tensor * f = ggml_sum(ctx0, ggml_reshape(ctx0, x[0], x[1]));
  762. check_gradient("reshape", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
  763. }
  764. }
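// the acc/set tests place x[1] at a random position inside x[0]. All offsets passed to
// ggml_acc()/ggml_set_*() are in bytes, which is why they are multiples of
// ggml_element_size(x[0]) or of the stride fields x[0]->nb[*].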
  765. // acc 1d
  766. {
  767. int64_t ne2[4] = { 1, 1, 1, 1 };
  768. const int nargs = 2;
  769. for (int ndims = 1; ndims <= 4; ++ndims) {
  770. x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
  771. ggml_set_param(ctx0, x[0]);
  772. get_random_dims(ne2, 1);
  773. while ((ne2[0] > ne[0]) || (ne2[0] > ggml_nelements(x[0]))) {
  774. get_random_dims(ne2, 1);
  775. }
  776. x[1] = get_random_tensor_f32(ctx0, 1, ne2, -1.0f, 1.0f);
  777. ggml_set_param(ctx0, x[1]);
  778. const int max_offset = MAX(0, ggml_nelements(x[0]) - ggml_nelements(x[1]));
  779. const int offset = irand(max_offset) * ggml_element_size(x[0]);
  780. struct ggml_tensor * f = ggml_sum(ctx0, ggml_acc(ctx0, x[0], x[1], x[0]->nb[1], x[0]->nb[2], x[0]->nb[3], offset));
  781. check_gradient("acc 1d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
  782. }
  783. }
  784. // acc 2d
  785. {
  786. int64_t ne2[4] = { 1, 1, 1, 1 };
  787. int64_t max_offsets[4] = { 0, 0, 0, 0 };
  788. int64_t offsets[4] = { 0, 0, 0, 0 };
  789. const int nargs = 2;
  790. for (int ndims = 2; ndims <= 4; ++ndims) {
  791. x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
  792. ggml_set_param(ctx0, x[0]);
  793. get_random_dims(ne2, 2);
  794. while ((ne2[0] > ne[0]) || (ne2[1] > ne[1]) || (ne2[0]*ne2[1] > ggml_nelements(x[0]))) {
  795. get_random_dims(ne2, 2);
  796. }
  797. x[1] = get_random_tensor_f32(ctx0, 2, ne2, -1.0f, 1.0f);
  798. ggml_set_param(ctx0, x[1]);
  799. max_offsets[0] = MAX(0, x[0]->ne[0] - x[1]->ne[0]);
  800. max_offsets[1] = MAX(0, x[0]->ne[1] - x[1]->ne[1]);
  801. offsets[0] = irand(max_offsets[0]) * x[0]->nb[0];
  802. offsets[1] = irand(max_offsets[1]) * x[0]->nb[1];
  803. const int offset = offsets[0] + offsets[1];
  804. struct ggml_tensor * f = ggml_sum(ctx0, ggml_acc(ctx0, x[0], x[1], x[0]->nb[1], x[0]->nb[2], x[0]->nb[3], offset));
  805. check_gradient("acc 2d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
  806. }
  807. }
  808. // acc 3d
  809. {
  810. int64_t ne2[4] = { 1, 1, 1, 1 };
  811. int64_t max_offsets[4] = { 0, 0, 0, 0 };
  812. int64_t offsets[4] = { 0, 0, 0, 0 };
  813. const int nargs = 2;
  814. for (int ndims = 3; ndims <= 4; ++ndims) {
  815. x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
  816. ggml_set_param(ctx0, x[0]);
  817. get_random_dims(ne2, 3);
  818. while ((ne2[0] > ne[0]) || (ne2[1] > ne[1]) || (ne2[2] > ne[2]) || (ne2[0]*ne2[1]*ne2[2] > ggml_nelements(x[0]))) {
  819. get_random_dims(ne2, 3);
  820. }
  821. x[1] = get_random_tensor_f32(ctx0, 3, ne2, -1.0f, 1.0f);
  822. ggml_set_param(ctx0, x[1]);
  823. max_offsets[0] = MAX(0, x[0]->ne[0] - x[1]->ne[0]);
  824. max_offsets[1] = MAX(0, x[0]->ne[1] - x[1]->ne[1]);
  825. max_offsets[2] = MAX(0, x[0]->ne[2] - x[1]->ne[2]);
  826. offsets[0] = irand(max_offsets[0]) * x[0]->nb[0];
  827. offsets[1] = irand(max_offsets[1]) * x[0]->nb[1];
  828. offsets[2] = irand(max_offsets[2]) * x[0]->nb[2];
  829. const int offset = offsets[0] + offsets[1] + offsets[2];
  830. struct ggml_tensor * f = ggml_sum(ctx0, ggml_acc(ctx0, x[0], x[1], x[0]->nb[1], x[0]->nb[2], x[0]->nb[3], offset));
  831. check_gradient("acc 3d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
  832. }
  833. }
  834. // acc 4d
  835. {
  836. int64_t ne2[4] = { 1, 1, 1, 1 };
  837. int64_t max_offsets[4] = { 0, 0, 0, 0 };
  838. int64_t offsets[4] = { 0, 0, 0, 0 };
  839. const int nargs = 2;
  840. for (int ndims = 4; ndims <= 4; ++ndims) {
  841. x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
  842. ggml_set_param(ctx0, x[0]);
  843. get_random_dims(ne2, 4);
  844. while ((ne2[0] > ne[0]) || (ne2[1] > ne[1]) || (ne2[2] > ne[2]) || (ne2[3] > ne[3]) || (ne2[0]*ne2[1]*ne2[2]*ne2[3] > ggml_nelements(x[0]))) {
  845. get_random_dims(ne2, 4);
  846. }
  847. x[1] = get_random_tensor_f32(ctx0, 4, ne2, -1.0f, 1.0f);
  848. ggml_set_param(ctx0, x[1]);
  849. max_offsets[0] = MAX(0, x[0]->ne[0] - x[1]->ne[0]);
  850. max_offsets[1] = MAX(0, x[0]->ne[1] - x[1]->ne[1]);
  851. max_offsets[2] = MAX(0, x[0]->ne[2] - x[1]->ne[2]);
  852. max_offsets[3] = MAX(0, x[0]->ne[3] - x[1]->ne[3]);
  853. offsets[0] = irand(max_offsets[0]) * x[0]->nb[0];
  854. offsets[1] = irand(max_offsets[1]) * x[0]->nb[1];
  855. offsets[2] = irand(max_offsets[2]) * x[0]->nb[2];
  856. offsets[3] = irand(max_offsets[3]) * x[0]->nb[3];
  857. const int offset = offsets[0] + offsets[1] + offsets[2] + offsets[3];
  858. struct ggml_tensor * f = ggml_sum(ctx0, ggml_acc(ctx0, x[0], x[1], x[0]->nb[1], x[0]->nb[2], x[0]->nb[3], offset));
  859. check_gradient("acc 4d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
  860. }
  861. }
  862. // set_1d
  863. {
  864. int64_t ne2[4];
  865. const int nargs = 2;
  866. for (int ndims = 1; ndims <= 4; ++ndims) {
  867. x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
  868. ggml_set_param(ctx0, x[0]);
  869. get_random_dims(ne2, 1);
  870. while ((ne2[0] > ne[0]) || (ne2[0] > ggml_nelements(x[0]))) {
  871. get_random_dims(ne2, 1);
  872. }
  873. x[1] = get_random_tensor_f32(ctx0, 1, ne2, -1.0f, 1.0f);
  874. ggml_set_param(ctx0, x[1]);
  875. const int max_offset = MAX(0, ggml_nelements(x[0]) - ggml_nelements(x[1]));
  876. const int offset = irand(max_offset) * ggml_element_size(x[0]);
  877. struct ggml_tensor * f = ggml_sum(ctx0, ggml_set_1d(ctx0, x[0], x[1], offset));
  878. check_gradient("set_1d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
  879. }
  880. }
  881. // set_2d
  882. {
  883. int64_t ne2[4];
  884. int64_t max_offsets[4] = { 0, 0, 0, 0 };
  885. int64_t offsets[4] = { 0, 0, 0, 0 };
  886. const int nargs = 1;
  887. for (int ndims = 2; ndims <= 4; ++ndims) {
  888. x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
  889. ggml_set_param(ctx0, x[0]);
  890. get_random_dims(ne2, 2);
  891. while ((ne2[0] > ne[0]) || (ne2[1] > ne[1]) || (ne2[0]*ne2[1] > ggml_nelements(x[0]))) {
  892. get_random_dims(ne2, 2);
  893. }
  894. x[1] = get_random_tensor_f32(ctx0, 2, ne2, -1.0f, 1.0f);
  895. ggml_set_param(ctx0, x[1]);
  896. max_offsets[0] = MAX(0, x[0]->ne[0] - x[1]->ne[0]);
  897. max_offsets[1] = MAX(0, x[0]->ne[1] - x[1]->ne[1]);
  898. offsets[0] = irand(max_offsets[0]) * x[0]->nb[0];
  899. offsets[1] = irand(max_offsets[1]) * x[0]->nb[1];
  900. const int offset = offsets[0] + offsets[1];
  901. struct ggml_tensor * f = ggml_sum(ctx0, ggml_set_2d(ctx0, x[0], x[1], x[1]->nb[1], offset));
  902. check_gradient("set_2d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
  903. }
  904. }
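// the view tests sum over a random slice of x[0]; since a view shares memory with its
// source, only the elements covered by the view should receive a non-zero gradient.
// view offsets are again given in bytes (hence the multiplication by sizeof(float)).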
  905. // view_1d
  906. {
  907. const int nargs = 1;
  908. for (int ndims = 1; ndims <= 4; ++ndims) {
  909. x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
  910. ggml_set_param(ctx0, x[0]);
  911. const int k0 = irand(ggml_nelements(x[0]));
  912. const int k1 = irand(ggml_nelements(x[0]));
  913. const int i0 = MIN(k0, k1);
  914. const int i1 = MAX(k0, k1);
  915. const int offset = i0 * sizeof(float);
  916. const int nelem = i1 - i0;
  917. struct ggml_tensor * f = ggml_sum(ctx0, ggml_view_1d(ctx0, x[0], nelem, offset));
  918. check_gradient("view_1d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
  919. }
  920. }
  921. // view_2d
  922. {
  923. int64_t ne2[4];
  924. int64_t nb2[4];
  925. const int nargs = 1;
  926. for (int ndims = 1; ndims <= 4; ++ndims) {
  927. x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
  928. get_random_dims(ne2, 2);
  929. while (ne2[0]*ne2[1] > ggml_nelements(x[0])) {
  930. get_random_dims(ne2, 2);
  931. }
  932. const int count = ne2[0]*ne2[1];
  933. nb2[0] = sizeof(float);
  934. nb2[1] = nb2[0]*ne2[0];
  935. ggml_set_param(ctx0, x[0]);
  936. const int max_offset = ggml_nelements(x[0]) - count;
  937. const int offset = irand(max_offset+1) * sizeof(float);
  938. struct ggml_tensor * f = ggml_sum(ctx0, ggml_view_2d(ctx0, x[0], ne2[0], ne2[1], nb2[1], offset));
  939. check_gradient("view_2d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
  940. }
  941. }
  942. // view_3d
  943. {
  944. int64_t ne2[4] = {1,1,1,1};
  945. int64_t nb2[4] = {0,0,0,0};
  946. const int nargs = 1;
  947. for (int ndims = 1; ndims <= 4; ++ndims) {
  948. x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
  949. get_random_dims(ne2, 3);
  950. while (ne2[0]*ne2[1]*ne2[2] > ggml_nelements(x[0])) {
  951. get_random_dims(ne2, 3);
  952. }
  953. const int count = ne2[0]*ne2[1]*ne2[2];
  954. nb2[0] = sizeof(float);
  955. nb2[1] = nb2[0]*ne2[0];
  956. nb2[2] = nb2[1]*ne2[1];
  957. ggml_set_param(ctx0, x[0]);
  958. const int max_offset = ggml_nelements(x[0]) - count;
  959. const int offset = irand(max_offset+1) * sizeof(float);
  960. struct ggml_tensor * f = ggml_sum(ctx0, ggml_view_3d(ctx0, x[0], ne2[0], ne2[1], ne2[2], nb2[1], nb2[2], offset));
  961. check_gradient("view_3d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
  962. }
  963. }
  964. // permute
  965. {
  966. int64_t ne2[4];
  967. const int nargs = 1;
  968. for (int ndims = 1; ndims <= 4; ++ndims)
  969. {
  970. // ggml_permute will set axes of dimensions below n_dims to 1.
  971. // to make ggml_permute work correctly on all axes,
  972. // the input tensor needs to have the maximum n_dims of 4.
  973. for (int i=0; i<ndims; ++i) {
  974. ne2[i] = ne[i];
  975. }
  976. for (int i=ndims; i<4; ++i) {
  977. ne2[i] = 1;
  978. }
  979. x[0] = get_random_tensor_f32(ctx0, 4, ne2, -1.0f, 1.0f);
  980. ggml_set_param(ctx0, x[0]);
  981. const int p = irand(NUM_PERMUTATIONS);
  982. const int ax0 = all_permutations[p*4+0];
  983. const int ax1 = all_permutations[p*4+1];
  984. const int ax2 = all_permutations[p*4+2];
  985. const int ax3 = all_permutations[p*4+3];
  986. // sum requires contiguous tensor rows
  987. struct ggml_tensor * f = ggml_sum(ctx0, ggml_cont(ctx0, ggml_permute(ctx0, x[0], ax0, ax1, ax2, ax3)));
  988. check_gradient("permute", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
  989. }
  990. }
  991. // transpose
  992. {
  993. int64_t ne2[4];
  994. const int nargs = 1;
  995. for (int ndims = 1; ndims <= 4; ++ndims)
  996. {
  997. // ggml_transpose will set axes of dimensions below n_dims to 1.
  998. // to make ggml_transpose work correctly on all axes,
  999. // the input tensor needs to have the maximum n_dims of 4.
  1000. for (int i=0; i<ndims; ++i) {
  1001. ne2[i] = ne[i];
  1002. }
  1003. for (int i=ndims; i<4; ++i) {
  1004. ne2[i] = 1;
  1005. }
  1006. x[0] = get_random_tensor_f32(ctx0, 4, ne2, -1.0f, 1.0f);
  1007. ggml_set_param(ctx0, x[0]);
  1008. // sum requires contiguous tensor rows
  1009. struct ggml_tensor * f = ggml_sum(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, x[0])));
  1010. check_gradient("transpose", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
  1011. }
  1012. }
  1013. // get_rows
  1014. {
  1015. int64_t ne2[4] = {ne[0], ne[1], 1, 1};
  1016. int64_t ne3[4] = {1+irand(ne[1]), 1, 1, 1};
  1017. const int nargs = 1;
  1018. const int ndims = 2;
  1019. x[0] = get_random_tensor_f32(ctx0, ndims, ne2, -1.0f, 1.0f);
  1020. x[1] = get_random_tensor_i32(ctx0, 1, ne3, 0, ne2[1]);
  1021. ggml_set_param(ctx0, x[0]);
  1022. struct ggml_tensor * f = ggml_sum(ctx0, ggml_get_rows(ctx0, x[0], x[1]));
  1023. check_gradient("get_rows", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
  1024. }
  1025. // diag_mask_inf
  1026. {
  1027. const int nargs = 1;
  1028. const int ndims = 2;
  1029. x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
  1030. ggml_set_param(ctx0, x[0]);
  1031. int n_past = irand(ne[0]);
  1032. struct ggml_tensor * f = ggml_sum(ctx0, ggml_diag_mask_inf(ctx0, x[0], n_past));
  1033. check_gradient("diag_mask_inf", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
  1034. }
  1035. // diag_mask_zero
  1036. {
  1037. const int nargs = 1;
  1038. const int ndims = 2;
  1039. x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
  1040. ggml_set_param(ctx0, x[0]);
  1041. int n_past = irand(ne[0]);
  1042. struct ggml_tensor * f = ggml_sum(ctx0, ggml_diag_mask_zero(ctx0, x[0], n_past));
  1043. check_gradient("diag_mask_zero", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
  1044. }
  1045. // softmax
  1046. {
  1047. const int nargs = 1;
  1048. int64_t ne2[4];
  1049. get_random_dims(ne2, 4);
  1050. for (int ndims = 1; ndims <= 3; ++ndims) {
  1051. x[0] = get_random_tensor_f32(ctx0, ndims, ne2, -1.0f, 1.0f);
  1052. ggml_set_param(ctx0, x[0]);
  1053. struct ggml_tensor * f = ggml_sum(ctx0, ggml_soft_max(ctx0, x[0]));
  1054. check_gradient("softmax", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
  1055. }
  1056. }
  1057. // cross_entropy_loss
  1058. {
  1059. const int nargs = 1;
  1060. int64_t ne2[4];
  1061. get_random_dims(ne2, 4);
  1062. for (int ndims = 1; ndims <= 3; ++ndims) {
  1063. x[0] = get_random_tensor_f32(ctx0, ndims, ne2, -1.0f, 1.0f);
  1064. x[1] = get_random_tensor_f32(ctx0, ndims, ne2, 0.0f, 1.0f);
  1065. ggml_set_param(ctx0, x[0]);
  1066. struct ggml_tensor * f = ggml_sum(ctx0, ggml_cross_entropy_loss(ctx0, x[0], x[1]));
  1067. check_gradient("cross_entropy_loss", ctx0, x, f, ndims, nargs, 1e-1f, 1e-2f, INFINITY);
  1068. // the finite difference check regularly fails here!
  1069. }
  1070. }
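// rope tests: ne2[0] is rounded up to an even value because RoPE rotates pairs of
// elements, and n_rot is set to the full head dimension. Modes with the low bit set
// (skip_past) are skipped, see the comment inside the loop.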
  1071. // rope f32
  1072. {
  1073. const int nargs = 1;
  1074. int64_t ne2[4];
  1075. get_random_dims(ne2, 4);
  1076. ne2[0] += ne2[0] % 2;
  1077. int n_rot = ne2[0];
  1078. for (int ndims = 3; ndims <= 4; ++ndims) {
  1079. for (int mode = 0; mode < 4; ++mode) {
  1080. for (int n_past = 1; n_past < ne2[2]; ++n_past) {
  1081. x[0] = get_random_tensor_f32(ctx0, ndims, ne2, -1.0f, 1.0f);
  1082. ggml_set_param(ctx0, x[0]);
  1083. const bool skip_past = (mode & 1);
  1084. if (skip_past) {
  1085. // we have no past, so this would have to work on uninitialized memory.
  1086. // we only test the gradients here;
  1087. // skip_past should have no influence on gradient computation.
  1088. // so when other modes work, we assume that this does as well.
  1089. continue;
  1090. }
  1091. struct ggml_tensor * f = ggml_sum(ctx0, ggml_rope(ctx0, x[0], n_past, n_rot, mode, 0));
  1092. GGML_PRINT_DEBUG("rope f32: n_past: %d n_rot: %d mode: %d\n", n_past, n_rot, mode);
  1093. check_gradient("rope f32", ctx0, x, f, ndims, nargs, 1e-2f, 1e-3f, INFINITY);
  1094. }
  1095. }
  1096. }
  1097. }
  1098. // rope f16
  1099. {
  1100. const int nargs = 1;
  1101. int64_t ne2[4];
  1102. get_random_dims(ne2, 4);
  1103. ne2[0] += ne2[0] % 2;
  1104. int n_rot = ne2[0];
  1105. for (int ndims = 3; ndims <= 4; ++ndims) {
  1106. for (int mode = 0; mode < 4; ++mode) {
  1107. for (int n_past = 1; n_past < ne2[2]; ++n_past) {
  1108. x[0] = get_random_tensor_f16(ctx0, ndims, ne2, -1.0f, 1.0f);
  1109. ggml_set_param(ctx0, x[0]);
  1110. const bool skip_past = (mode & 1);
  1111. if (skip_past) {
  1112. // we have no past, so this would have to work on uninitialized memory.
  1113. // we only test the gradients here;
  1114. // skip_past should have no influence on gradient computation.
  1115. // so when other modes work, we assume that this does as well.
  1116. continue;
  1117. }
  1118. struct ggml_tensor * f = ggml_sum(ctx0, ggml_rope(ctx0, x[0], n_past, n_rot, mode, 0));
  1119. GGML_PRINT_DEBUG("rope f16: n_past: %d n_rot: %d mode: %d\n", n_past, n_rot, mode);
  1120. check_gradient("rope f16", ctx0, x, f, ndims, nargs, 1e-1f, 1e-1f, INFINITY);
  1121. }
  1122. }
  1123. }
  1124. }
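// flash_attn tests: D is the head dimension, N the number of query positions, M (> N) the
// number of key/value positions and B the batch size. Q is [D, N, B], K is [D, M, B] and
// V is [M, D, B], i.e. V is laid out transposed, matching the shapes constructed below.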
  1125. // flash_attn f32
  1126. {
  1127. const int nargs = 3;
  1128. int64_t ne2[4];
  1129. get_random_dims(ne2, 4);
  1130. int64_t D = ne2[0];
  1131. int64_t N = ne2[1];
  1132. int64_t M = ne2[2] + N;
  1133. int64_t B = ne2[3];
  1134. for (int masked = 0; masked <= 1; ++masked) {
  1135. for (int ndims = 2; ndims <= 4; ++ndims) {
  1136. int64_t neq[4] = { D, N, B, ne[3] };
  1137. int64_t nek[4] = { D, M, B, ne[3] };
  1138. int64_t nev[4] = { M, D, B, ne[3] };
  1139. if (ndims == 2) {
  1140. neq[2] = 1; neq[3] = 1;
  1141. nek[2] = 1; nek[3] = 1;
  1142. nev[2] = 1; nev[3] = 1;
  1143. } else if (ndims == 3) {
  1144. neq[3] = 1;
  1145. nek[3] = 1;
  1146. nev[3] = 1;
  1147. }
  1148. x[0] = get_random_tensor_f32(ctx0, ndims, neq, -0.1250f, 0.1250f);
  1149. x[1] = get_random_tensor_f32(ctx0, ndims, nek, -0.1250f, 0.1250f);
  1150. x[2] = get_random_tensor_f32(ctx0, ndims, nev, -0.1250f, 0.1250f);
  1151. ggml_set_param(ctx0, x[0]);
  1152. ggml_set_param(ctx0, x[1]);
  1153. ggml_set_param(ctx0, x[2]);
  1154. struct ggml_tensor * f = ggml_sum(ctx0, ggml_flash_attn(ctx0, x[0], x[1], x[2], (masked == 0)));
  1155. check_gradient("flash_attn f32", ctx0, x, f, ndims, nargs, 1.5e-4f, INFINITY, 3.5f);
  1156. }
  1157. }
  1158. }
  1159. // flash_attn f16, not yet fully implemented
  1160. if(0)
  1161. {
  1162. const int nargs = 3;
  1163. int64_t ne2[4];
  1164. get_random_dims(ne2, 4);
  1165. int64_t D = ne2[0];
  1166. int64_t N = ne2[1];
  1167. int64_t M = ne2[2] + N;
  1168. int64_t B = ne2[3];
  1169. for (int masked = 0; masked <= 1; ++masked) {
  1170. for (int ndims = 2; ndims <= 4; ++ndims) {
  1171. int64_t neq[4] = { D, N, B, ne[3] };
  1172. int64_t nek[4] = { D, M, B, ne[3] };
  1173. int64_t nev[4] = { M, D, B, ne[3] };
  1174. if (ndims == 2) {
  1175. neq[2] = 1; neq[3] = 1;
  1176. nek[2] = 1; nek[3] = 1;
  1177. nev[2] = 1; nev[3] = 1;
  1178. } else if (ndims == 3) {
  1179. neq[3] = 1;
  1180. nek[3] = 1;
  1181. nev[3] = 1;
  1182. }
  1183. x[0] = get_random_tensor_f16(ctx0, ndims, neq, -0.1250f, 0.1250f);
  1184. x[1] = get_random_tensor_f16(ctx0, ndims, nek, -0.1250f, 0.1250f);
  1185. x[2] = get_random_tensor_f16(ctx0, ndims, nev, -0.1250f, 0.1250f);
  1186. ggml_set_param(ctx0, x[0]);
  1187. ggml_set_param(ctx0, x[1]);
  1188. ggml_set_param(ctx0, x[2]);
  1189. struct ggml_tensor * f = ggml_sum(ctx0, ggml_flash_attn(ctx0, x[0], x[1], x[2], (masked == 0)));
  1190. check_gradient("flash_attn f16", ctx0, x, f, ndims, nargs, 1.5e-4f, INFINITY, 3.5f);
  1191. }
  1192. }
  1193. }
  1194. ggml_free(ctx0);
  1195. }
  1196. return 0;
  1197. }