// test-backend-ops.cpp

#include <ggml.h>
#include <ggml-alloc.h>
#include <ggml-backend.h>
#include <ggml-backend-impl.h>

#include <algorithm>
#include <array>
#include <cfloat>
#include <cstring>
#include <functional>
#include <memory>
#include <random>
#include <stdio.h>
#include <stdlib.h>
#include <string>
#include <thread>
#include <vector>

static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float max = 1.0f) {
    // static RNG initialization (revisit if n_threads stops being constant)
    static const size_t n_threads = std::thread::hardware_concurrency();
    static std::vector<std::default_random_engine> generators = []() {
        std::random_device rd;
        std::vector<std::default_random_engine> vec;
        vec.reserve(n_threads);
        //for (size_t i = 0; i < n_threads; i++) { vec.emplace_back(1234 + i); } // fixed seed
        for (size_t i = 0; i < n_threads; i++) { vec.emplace_back(rd()); }
        return vec;
    }();

    size_t size = ggml_nelements(tensor);
    std::vector<float> data(size);

    auto init_thread = [&](size_t ith, size_t start, size_t end) {
        std::uniform_real_distribution<float> distribution(min, max);
        for (size_t i = start; i < end; i++) {
            data[i] = distribution(generators[ith]);
        }
    };

    std::vector<std::thread> threads;
    threads.reserve(n_threads);
    for (size_t i = 0; i < n_threads; i++) {
        size_t start = i*size/n_threads;
        size_t end = (i+1)*size/n_threads;
        threads.emplace_back(init_thread, i, start, end);
    }
    for (auto & t : threads) {
        t.join();
    }

    if (tensor->type == GGML_TYPE_F32 || tensor->type == GGML_TYPE_I32) {
        ggml_backend_tensor_set(tensor, data.data(), 0, size * sizeof(float));
    } else if (ggml_is_quantized(tensor->type) || tensor->type == GGML_TYPE_F16) {
        GGML_ASSERT(size % ggml_blck_size(tensor->type) == 0);
        std::vector<uint8_t> dataq(ggml_row_size(tensor->type, size));
        int64_t hist[16];
        std::vector<float> imatrix(tensor->ne[0], 1.0f); // dummy importance matrix
        const float * im = imatrix.data();
        if (!ggml_quantize_requires_imatrix(tensor->type)) {
            // when the imatrix is optional, we want to test both quantization with and without imatrix
            // use one of the random numbers to decide
            if (data[0] > 0.5f*(min + max)) {
                im = nullptr;
            }
        }
        ggml_quantize_chunk(tensor->type, data.data(), dataq.data(), 0, size/tensor->ne[0], tensor->ne[0], hist, im);
        ggml_backend_tensor_set(tensor, dataq.data(), 0, dataq.size());
    } else if (tensor->type == GGML_TYPE_I8 || tensor->type == GGML_TYPE_I16 || tensor->type == GGML_TYPE_I32) {
        // This is going to create some weird integers though.
        ggml_backend_tensor_set(tensor, data.data(), 0, ggml_nbytes(tensor));
    } else {
        GGML_ASSERT(false);
    }
}
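
// init_tensor_uniform fills a tensor with uniform random floats and uploads them to the backend
// buffer; quantized and F16 tensors are converted on the fly with ggml_quantize_chunk.
// A minimal usage sketch (assuming a tensor `t` already allocated in a backend buffer):
//
//     init_tensor_uniform(t);             // default range [-1, 1]
//     init_tensor_uniform(t, 1.0f, 2.0f); // e.g. to avoid zeros before a division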

static std::vector<float> tensor_to_float(const ggml_tensor * t) {
    std::vector<float> tv;
    tv.reserve(ggml_nelements(t));

    std::vector<uint8_t> buf(ggml_nbytes(t));
    ggml_backend_tensor_get(t, buf.data(), 0, ggml_nbytes(t));

    ggml_type_traits_t tt = ggml_internal_get_type_traits(t->type);
    size_t bs = ggml_blck_size(t->type);
    std::vector<float> vq(ggml_blck_size(t->type));
    bool quantized = ggml_is_quantized(t->type);

    // access elements by index to avoid gaps in views
    for (int64_t i3 = 0; i3 < t->ne[3]; i3++) {
        for (int64_t i2 = 0; i2 < t->ne[2]; i2++) {
            for (int64_t i1 = 0; i1 < t->ne[1]; i1++) {
                for (int64_t i0 = 0; i0 < t->ne[0]; i0 += bs) {
                    size_t i = i3*t->nb[3] + i2*t->nb[2] + i1*t->nb[1] + i0/bs*t->nb[0];
                    if (t->type == GGML_TYPE_F16) {
                        tv.push_back(ggml_fp16_to_fp32(*(ggml_fp16_t*)&buf[i]));
                    } else if (t->type == GGML_TYPE_F32) {
                        tv.push_back(*(float *) &buf[i]);
                    } else if (t->type == GGML_TYPE_I32) {
                        tv.push_back((float)*(int32_t *) &buf[i]);
                    } else if (t->type == GGML_TYPE_I16) {
                        tv.push_back((float)*(int16_t *) &buf[i]);
                    } else if (t->type == GGML_TYPE_I8) {
                        tv.push_back((float)*(int8_t *) &buf[i]);
                    } else if (quantized) {
                        tt.to_float(&buf[i], vq.data(), ggml_blck_size(t->type));
                        tv.insert(tv.end(), vq.begin(), vq.end());
                    } else {
                        GGML_ASSERT(false);
                    }
                }
            }
        }
    }

    return tv;
}

/*
static double cosine_similarity(const float * v1, const float * v2, size_t n) {
    double dot = 0.0;
    double mag1 = 0.0;
    double mag2 = 0.0;

    for (size_t i = 0; i < n; i++) {
        if (std::isnan(v1[i]) || std::isnan(v2[i])) {
            return -1.0f;
        }
        if (std::isinf(v1[i]) && std::isinf(v2[i])) {
            continue;
        }
        dot += v1[i]*v2[i];
        mag1 += v1[i]*v1[i];
        mag2 += v2[i]*v2[i];
    }

    return dot/sqrt(mag1*mag2);
}

static float distance(const float * v1, const float * v2, size_t n) {
    double d = 0.0;

    for (size_t i = 0; i < n; i++) {
        if (std::isnan(v1[i]) || std::isnan(v2[i])) {
            return INFINITY;
        }
        if (std::isinf(v1[i]) && std::isinf(v2[i])) {
            continue;
        }
        d += (v1[i] - v2[i])*(v1[i] - v2[i]);
    }

    return sqrt(d);
}

static float vec_len(const float * v, size_t n) {
    double d = 0.0;

    for (size_t i = 0; i < n; i++) {
        if (std::isnan(v[i])) {
            return INFINITY;
        }
        if (std::isinf(v[i])) {
            continue;
        }
        d += v[i]*v[i];
    }

    return sqrt(d);
}
*/

// normalized mean squared error = mse(a, b) / mse(a, 0)
static double nmse(const float * a, const float * b, size_t n) {
    double mse_a_b = 0.0;
    double mse_a_0 = 0.0;

    for (size_t i = 0; i < n; i++) {
        float a_i = a[i];
        float b_i = b[i];

        mse_a_b += (a_i - b_i) * (a_i - b_i);
        mse_a_0 += a_i * a_i;
    }

    return mse_a_b / mse_a_0;
}
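
// NMSE interpretation: nmse(a, b) = sum((a_i - b_i)^2) / sum(a_i^2), i.e. the squared error relative
// to the magnitude of the reference vector, so the thresholds returned by max_nmse_err() below are
// scale-invariant. Illustrative (hypothetical) numbers: a = {1, 2}, b = {1, 2.001} gives
// nmse = 1e-6 / 5 = 2e-7, which would just exceed the default 1e-7 threshold.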

// utils for printing the variables of the test cases
#define VAR_TO_STR(x) (#x "=" + var_to_str(x))

template<typename T>
static std::string var_to_str(const T & x) {
    return std::to_string(x);
}

template<typename T, size_t N>
static std::string var_to_str(const T (&x)[N]) {
    std::string s = "[";
    for (size_t i = 0; i < N; i++) {
        if (i > 0) {
            s += ",";
        }
        s += var_to_str(x[i]);
    }
    s += "]";
    return s;
}

template<typename T, size_t N>
static std::string var_to_str(const std::array<T, N> & x) {
    std::string s = "[";
    for (size_t i = 0; i < N; i++) {
        if (i > 0) {
            s += ",";
        }
        s += var_to_str(x[i]);
    }
    s += "]";
    return s;
}

//static std::string var_to_str(ggml_unary_op unary_op) {
//    return ggml_unary_op_name(unary_op);
//}

static std::string var_to_str(ggml_type type) {
    return ggml_type_name(type);
}

#define VARS_TO_STR1(a) VAR_TO_STR(a)
#define VARS_TO_STR2(a, b) VAR_TO_STR(a) + "," + VAR_TO_STR(b)
#define VARS_TO_STR3(a, b, c) VAR_TO_STR(a) + "," + VARS_TO_STR2(b, c)
#define VARS_TO_STR4(a, b, c, d) VAR_TO_STR(a) + "," + VARS_TO_STR3(b, c, d)
#define VARS_TO_STR5(a, b, c, d, e) VAR_TO_STR(a) + "," + VARS_TO_STR4(b, c, d, e)
#define VARS_TO_STR6(a, b, c, d, e, f) VAR_TO_STR(a) + "," + VARS_TO_STR5(b, c, d, e, f)
#define VARS_TO_STR7(a, b, c, d, e, f, g) VAR_TO_STR(a) + "," + VARS_TO_STR6(b, c, d, e, f, g)
#define VARS_TO_STR8(a, b, c, d, e, f, g, h) VAR_TO_STR(a) + "," + VARS_TO_STR7(b, c, d, e, f, g, h)
#define VARS_TO_STR9(a, b, c, d, e, f, g, h, i) VAR_TO_STR(a) + "," + VARS_TO_STR8(b, c, d, e, f, g, h, i)
#define VARS_TO_STR10(a, b, c, d, e, f, g, h, i, j) VAR_TO_STR(a) + "," + VARS_TO_STR9(b, c, d, e, f, g, h, i, j)
#define VARS_TO_STR11(a, b, c, d, e, f, g, h, i, j, k) VAR_TO_STR(a) + "," + VARS_TO_STR10(b, c, d, e, f, g, h, i, j, k)

#ifdef GGML_USE_SYCL
static bool inline _isinf(float f) {
    return (*(uint32_t *)&f & 0x7fffffff) == 0x7f800000;
}
#else
static bool inline _isinf(float f) { return std::isinf(f); }
#endif

// accept FLT_MAX as infinity
static bool isinf_or_max(float f) {
    return _isinf(f) || f == FLT_MAX || f == -FLT_MAX;
}

static bool ggml_is_view_op(enum ggml_op op) {
    return op == GGML_OP_VIEW || op == GGML_OP_RESHAPE || op == GGML_OP_PERMUTE || op == GGML_OP_TRANSPOSE;
}

enum test_mode {
    MODE_TEST,
    MODE_PERF,
};

struct test_case {
    virtual ~test_case() {}

    virtual std::string op_desc(ggml_tensor * t) {
        return ggml_op_desc(t);
    }

    virtual std::string vars() {
        return "";
    }

    virtual ggml_tensor * build_graph(ggml_context * ctx) = 0;

    virtual double max_nmse_err() {
        return 1e-7;
    }

    virtual void initialize_tensors(ggml_context * ctx) {
        for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) {
            init_tensor_uniform(t);
        }
    }

    virtual size_t op_size(ggml_tensor * t) {
        size_t size = ggml_nbytes(t);
        // add source tensors
        for (int i = 0; i < GGML_MAX_SRC; i++) {
            if (t->src[i] != NULL) {
                size += ggml_nbytes(t->src[i]);
            }
        }
        return size;
    }

    ggml_cgraph * gf = nullptr;

    static const int sentinel_size = 1024;

    test_mode mode;

    std::vector<ggml_tensor *> sentinels;

    void add_sentinel(ggml_context * ctx) {
        if (mode == MODE_PERF) {
            return;
        }
        ggml_tensor * sentinel = ::ggml_new_tensor_1d(ctx, GGML_TYPE_F32, sentinel_size);
        ggml_format_name(sentinel, "sent_%zu", sentinels.size());
        sentinels.push_back(sentinel);
    }

    // hijack ggml_new_tensor to add sentinels after each tensor to check for overflows in the backend
    ggml_tensor * ggml_new_tensor(ggml_context * ctx, ggml_type type, int n_dims, const int64_t * ne) {
        ggml_tensor * t = ::ggml_new_tensor(ctx, type, n_dims, ne);
        add_sentinel(ctx);
        return t;
    }

    ggml_tensor * ggml_new_tensor_1d(ggml_context * ctx, ggml_type type, int64_t ne0) {
        ggml_tensor * t = ::ggml_new_tensor_1d(ctx, type, ne0);
        add_sentinel(ctx);
        return t;
    }

    ggml_tensor * ggml_new_tensor_2d(ggml_context * ctx, ggml_type type, int64_t ne0, int64_t ne1) {
        ggml_tensor * t = ::ggml_new_tensor_2d(ctx, type, ne0, ne1);
        add_sentinel(ctx);
        return t;
    }

    ggml_tensor * ggml_new_tensor_3d(ggml_context * ctx, ggml_type type, int64_t ne0, int64_t ne1, int64_t ne2) {
        ggml_tensor * t = ::ggml_new_tensor_3d(ctx, type, ne0, ne1, ne2);
        add_sentinel(ctx);
        return t;
    }

    ggml_tensor * ggml_new_tensor_4d(ggml_context * ctx, ggml_type type, int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3) {
        ggml_tensor * t = ::ggml_new_tensor_4d(ctx, type, ne0, ne1, ne2, ne3);
        add_sentinel(ctx);
        return t;
    }
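
    // How the sentinel check works: the member ggml_new_tensor* wrappers above shadow the global
    // ggml API inside build_graph(), so every tensor a test case allocates is followed by a
    // 1024-float "sent_N" tensor. In MODE_TEST these sentinels are appended to the graph and
    // compared between the two backends; if a kernel writes past the end of its output, the
    // neighbouring sentinel changes and the mismatch is reported by name in the callback below.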

    bool eval(ggml_backend_t backend1, ggml_backend_t backend2, const char * op_name) {
        mode = MODE_TEST;

        ggml_init_params params = {
            /* .mem_size = */ ggml_tensor_overhead()*128 + ggml_graph_overhead(),
            /* .mem_base = */ NULL,
            /* .no_alloc = */ true,
        };
        ggml_context * ctx = ggml_init(params);

        gf = ggml_new_graph(ctx);

        // pre-graph sentinel
        add_sentinel(ctx);

        ggml_tensor * out = build_graph(ctx);

        if (op_name != nullptr && op_desc(out) != op_name) {
            //printf(" %s: skipping\n", op_desc(out).c_str());
            ggml_free(ctx);
            return true;
        }

        printf(" %s(%s): ", op_desc(out).c_str(), vars().c_str());
        fflush(stdout);

        // check if backends support op
        bool supported = true;
        for (ggml_backend_t backend : {backend1, backend2}) {
            if (!ggml_backend_supports_op(backend, out)) {
                printf("not supported [%s] ", ggml_backend_name(backend));
                supported = false;
            }
        }
        if (!supported) {
            printf("\n");
            ggml_free(ctx);
            return true;
        }

        // post-graph sentinel
        add_sentinel(ctx);

        // allocate
        ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(ctx, backend1);
        if (buf == NULL) {
            printf("failed to allocate tensors [%s] ", ggml_backend_name(backend1));
            ggml_free(ctx);
            return false;
        }

        // build graph
        ggml_build_forward_expand(gf, out);

        // add sentinels as graph nodes so that they are checked in the callback
        for (ggml_tensor * sentinel : sentinels) {
            gf->nodes[gf->n_nodes++] = sentinel;
        }

        // randomize tensors
        initialize_tensors(ctx);

        // compare
        struct callback_userdata {
            bool ok;
            double max_err;
            ggml_backend_t backend1;
            ggml_backend_t backend2;
        };

        callback_userdata ud {
            true,
            max_nmse_err(),
            backend1,
            backend2
        };

        auto callback = [](int index, ggml_tensor * t1, ggml_tensor * t2, void * user_data) -> bool {
            callback_userdata * ud = (callback_userdata *) user_data;
            const char * bn1 = ggml_backend_name(ud->backend1);
            const char * bn2 = ggml_backend_name(ud->backend2);

            if (t1->op == GGML_OP_NONE) {
                // sentinels must be unchanged
                std::vector<uint8_t> t1_data(ggml_nbytes(t1));
                std::vector<uint8_t> t2_data(ggml_nbytes(t2));
                ggml_backend_tensor_get(t1, t1_data.data(), 0, ggml_nbytes(t1));
                ggml_backend_tensor_get(t2, t2_data.data(), 0, ggml_nbytes(t2));

                if (memcmp(t1_data.data(), t2_data.data(), ggml_nbytes(t1)) != 0) {
                    printf("sentinel mismatch: %s ", t1->name);
                    ud->ok = false;
                    return true;
                }
            }

            std::vector<float> f1 = tensor_to_float(t1);
            std::vector<float> f2 = tensor_to_float(t2);

            for (size_t i = 0; i < f1.size(); i++) {
                // check for nans
                if (std::isnan(f1[i]) || std::isnan(f2[i])) {
                    printf("[%s] NaN at index %zu (%s=%f %s=%f) ", ggml_op_desc(t1), i, bn1, f1[i], bn2, f2[i]);
                    ud->ok = false;
                    return true;
                }
                // check for infs: both must be inf of the same sign, or both must be finite
                if (isinf_or_max(f1[i]) || isinf_or_max(f2[i])) {
                    if (isinf_or_max(f1[i]) && isinf_or_max(f2[i])) {
                        if (std::signbit(f1[i]) != std::signbit(f2[i])) {
                            printf("[%s] inf sign mismatch: %s=%f %s=%f ", ggml_op_desc(t1), bn1, f1[i], bn2, f2[i]);
                            ud->ok = false;
                            return true;
                        }
                    } else {
                        printf("[%s] inf mismatch: %s=%f %s=%f ", ggml_op_desc(t1), bn1, f1[i], bn2, f2[i]);
                        ud->ok = false;
                        return true;
                    }
                }
            }

            double err = nmse(f1.data(), f2.data(), f1.size());
            if (err > ud->max_err) {
                printf("[%s] NMSE = %.9f > %.9f ", ggml_op_desc(t1), err, ud->max_err);
                //for (int i = 0; i < (int) f1.size(); i++) {
                //    printf("%5d %9.6f %9.6f, diff = %9.6f\n", i, f1[i], f2[i], f1[i] - f2[i]);
                //}
                //printf("\n");
                //exit(1);
                ud->ok = false;
            }
            return true;

            GGML_UNUSED(index);
        };

        const bool cmp_ok = ggml_backend_compare_graph_backend(backend1, backend2, gf, callback, &ud);

        if (!cmp_ok) {
            printf("compare failed ");
        }

        ggml_backend_buffer_free(buf);

        ggml_free(ctx);

        if (ud.ok && cmp_ok) {
            printf("\033[1;32mOK\033[0m\n");
            return true;
        }

        printf("\033[1;31mFAIL\033[0m\n");
        return false;
    }
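
    // Note on the flow above: ggml_backend_compare_graph_backend() runs the same graph on both
    // backends and invokes the callback once per node with the two result tensors; the callback
    // always returns true so that every node is visited, while ud.ok records whether any node
    // exceeded the NMSE threshold or had a NaN / inf / sentinel mismatch.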

    bool eval_perf(ggml_backend_t backend, const char * op_name) {
        mode = MODE_PERF;

        static const size_t graph_nodes = 8192;

        ggml_init_params params = {
            /* .mem_size = */ ggml_tensor_overhead()*128 + ggml_graph_overhead_custom(graph_nodes, false),
            /* .mem_base = */ NULL,
            /* .no_alloc = */ true,
        };
        ggml_context * ctx = ggml_init(params);

        ggml_tensor * out = build_graph(ctx);

        if (op_name != nullptr && op_desc(out) != op_name) {
            //printf(" %s: skipping\n", op_desc(out).c_str());
            ggml_free(ctx);
            return true;
        }

        int len = printf(" %s(%s): ", op_desc(out).c_str(), vars().c_str());
        fflush(stdout);

        // check if backends support op
        if (!ggml_backend_supports_op(backend, out)) {
            printf("not supported\n");
            ggml_free(ctx);
            return true;
        }

        // align while also leaving some margin for variations in parameters
        int align = 20;
        int last = (len + align - 1) / align * align;
        if (last - len < 5) {
            last += align;
        }
        last = std::max(last, 60);
        printf("%*s", last - len, "");

        // allocate
        ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(ctx, backend);
        if (buf == NULL) {
            printf("failed to allocate tensors\n");
            ggml_free(ctx);
            return false;
        }

        // randomize tensors
        initialize_tensors(ctx);

        // build graph
        ggml_cgraph * gf = ggml_new_graph_custom(ctx, graph_nodes, false);
        ggml_build_forward_expand(gf, out);

        // warmup run
        ggml_backend_graph_compute(backend, gf);

        // duplicate the op
        size_t target_size = ggml_backend_is_cpu(backend) ? 1ULL << 33 : 1ULL << 35; // 8 GB CPU, 32 GB GPU
        int n_runs = std::min((size_t)gf->size - gf->n_nodes, target_size / op_size(out)) + 1;
        for (int i = 1; i < n_runs; i++) {
            gf->nodes[gf->n_nodes++] = out;
        }

        // calculate memory
        size_t mem = n_runs * op_size(out);
        auto tensor_op_size = [](ggml_tensor * t) {
            size_t size = ggml_nbytes(t);
            // add source tensors
            for (int i = 0; i < GGML_MAX_SRC; i++) {
                if (t->src[i] != NULL) {
                    size += ggml_nbytes(t->src[i]);
                }
            }
            return size;
        };
        for (int i = 0; i < gf->n_nodes; i++) {
            if (ggml_is_view_op(gf->nodes[i]->op) || gf->nodes[i] == out) {
                continue;
            }
            mem += tensor_op_size(gf->nodes[i]);
        }

        // run
        ggml_backend_synchronize(backend);

        int64_t start_time = ggml_time_us();
        ggml_backend_graph_compute(backend, gf);
        ggml_backend_synchronize(backend);
        int64_t end_time = ggml_time_us();
        double time_us = end_time - start_time;

        printf(" %5d runs - %8.2f us/run - %8zu kB/run - \033[1;34m%7.2f GB/s\033[0m\n",
            n_runs,
            time_us / n_runs,
            op_size(out) / 1024,
            mem / (time_us/1e6) / 1024.0 / 1024.0 / 1024.0);

        ggml_backend_buffer_free(buf);

        ggml_free(ctx);

        return true;
    }
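
    // The bandwidth figure printed by eval_perf is derived as follows: the output node is duplicated
    // in the graph until roughly target_size bytes of tensor data would be touched (8 GB on CPU
    // backends, 32 GB otherwise), `mem` sums op_size() over all duplicated runs plus the remaining
    // non-view nodes, and GB/s = mem / elapsed time. With illustrative (hypothetical) numbers,
    // 8 GB of traffic timed at 0.5 s would print ~16 GB/s.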
};

// GGML_OP_UNARY
struct test_unary : public test_case {
    const ggml_unary_op op;
    const ggml_type type;
    const std::array<int64_t, 4> ne;

    std::string vars() override {
        return VARS_TO_STR2(type, ne);
    }

    test_unary(ggml_unary_op op,
            ggml_type type = GGML_TYPE_F32,
            std::array<int64_t, 4> ne = {128, 10, 10, 10})
        : op(op), type(type), ne(ne) {}

    ggml_tensor * build_graph(ggml_context * ctx) override {
        ggml_tensor * in = ggml_new_tensor(ctx, type, 4, ne.data());
        ggml_tensor * out = ggml_unary(ctx, in, op);
        return out;
    }
};
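
// A test case like the one above is registered and run by test_backend() further down. A minimal
// sketch of driving one directly (hypothetical snippet; backend_cpu and backend_gpu are assumed to
// be backends created by the caller):
//
//     std::vector<std::unique_ptr<test_case>> cases;
//     cases.emplace_back(new test_unary(GGML_UNARY_OP_GELU));
//     cases.back()->eval(backend_cpu, backend_gpu, /*op_name=*/nullptr);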

// GGML_OP_GET_ROWS
struct test_get_rows : public test_case {
    const ggml_type type;
    const int n; // cols
    const int m; // rows
    const int r; // rows to get
    const int b; // batch size
    const bool v; // view (non-contiguous src1)

    std::string vars() override {
        return VARS_TO_STR6(type, n, m, r, b, v);
    }

    test_get_rows(ggml_type type = GGML_TYPE_F32, int n = 10, int m = 5, int r = 3, int b = 1, bool v = false)
        : type(type), n(n), m(m), r(r), b(b), v(v) {}

    ggml_tensor * build_graph(ggml_context * ctx) override {
        ggml_tensor * in = ggml_new_tensor_3d(ctx, type, n, m, b);
        ggml_tensor * rows = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, r, b);
        if (v) {
            rows = ggml_view_2d(ctx, rows, r/2, b, rows->nb[1], 0);
        }
        ggml_tensor * out = ggml_get_rows(ctx, in, rows);
        return out;
    }

    void initialize_tensors(ggml_context * ctx) override {
        for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
            if (t->type == GGML_TYPE_I32) {
                if (ggml_is_view_op(t->op)) { continue; }
                // rows
                std::vector<int> data(r*b);
                for (int i = 0; i < r*b; i++) {
                    data[i] = rand() % m;
                }
                ggml_backend_tensor_set(t, data.data(), 0, r * b * sizeof(int));
            } else {
                init_tensor_uniform(t);
            }
        }
    }
};

// GGML_OP_REPEAT
struct test_repeat : public test_case {
    const ggml_type type;
    const std::array<int64_t, 4> ne;
    const std::array<int, 4> nr;

    std::string vars() override {
        return VARS_TO_STR3(type, ne, nr);
    }

    size_t op_size(ggml_tensor * t) override {
        return ggml_nbytes(t) * 2;
    }

    test_repeat(ggml_type type = GGML_TYPE_F32,
            std::array<int64_t, 4> ne = {10, 10, 10, 10},
            std::array<int, 4> nr = {2, 2, 2, 2})
        : type(type), ne(ne), nr(nr) {}

    ggml_tensor * build_graph(ggml_context * ctx) override {
        ggml_tensor * target = ggml_new_tensor_4d(ctx, type, ne[0]*nr[0], ne[1]*nr[1], ne[2]*nr[2], ne[3]*nr[3]);
        ggml_tensor * src = ggml_new_tensor(ctx, type, 4, ne.data());
        ggml_tensor * out = ggml_repeat(ctx, src, target);
        return out;
    }
};

// GGML_OP_DUP
struct test_dup : public test_case {
    const ggml_type type;
    const std::array<int64_t, 4> ne;
    const std::array<int64_t, 4> permute;
    bool _use_permute;

    std::string vars() override {
        std::string v = VARS_TO_STR2(type, ne);
        if (_use_permute) v += "," + VAR_TO_STR(permute);
        return v;
    }

    test_dup(ggml_type type = GGML_TYPE_F32,
            std::array<int64_t, 4> ne = {10, 10, 10, 1},
            std::array<int64_t, 4> permute = {0, 0, 0, 0})
        : type(type), ne(ne), permute(permute),
            _use_permute(permute[0] + permute[1] + permute[2] + permute[3] > 0) {}

    ggml_tensor * build_graph(ggml_context * ctx) override {
        ggml_tensor * src = ggml_new_tensor(ctx, type, 4, ne.data());
        if (_use_permute) {
            src = ggml_permute(ctx, src, permute[0], permute[1], permute[2], permute[3]);
        }
        ggml_tensor * out = ggml_dup(ctx, src);
        return out;
    }
};

// GGML_OP_CPY
struct test_cpy : public test_case {
    const ggml_type type_src;
    const ggml_type type_dst;
    const std::array<int64_t, 4> ne;

    std::string vars() override {
        return VARS_TO_STR3(type_src, type_dst, ne);
    }

    size_t op_size(ggml_tensor * t) override {
        return ggml_nbytes(t) + ggml_nbytes(t->src[0]);
    }

    test_cpy(ggml_type type_src = GGML_TYPE_F32, ggml_type type_dst = GGML_TYPE_F32,
            std::array<int64_t, 4> ne = {10, 10, 10, 1})
        : type_src(type_src), type_dst(type_dst), ne(ne) {}

    ggml_tensor * build_graph(ggml_context * ctx) override {
        ggml_tensor * src = ggml_new_tensor(ctx, type_src, 4, ne.data());
        ggml_tensor * dst = ggml_new_tensor(ctx, type_dst, 4, ne.data());
        ggml_tensor * out = ggml_cpy(ctx, src, dst);
        return out;
    }
};

// GGML_OP_CONT
struct test_cont : public test_case {
    const ggml_type type;
    const std::array<int64_t, 4> ne;

    std::string vars() override {
        return VARS_TO_STR2(type, ne);
    }

    test_cont(ggml_type type = GGML_TYPE_F32,
            std::array<int64_t, 4> ne = {10, 10, 10, 1})
        : type(type), ne(ne) {}

    ggml_tensor * build_graph(ggml_context * ctx) override {
        ggml_tensor * src = ggml_new_tensor(ctx, type, 4, ne.data());
        src = ggml_transpose(ctx, src);
        ggml_tensor * out = ggml_cont(ctx, src);
        return out;
    }
};

// GGML_OP_ADD
// GGML_OP_MUL
// GGML_OP_DIV
struct test_bin_bcast : public test_case {
    using op_t = ggml_tensor * (*) (ggml_context *, ggml_tensor *, ggml_tensor *);
    op_t op;
    const ggml_type type;
    const std::array<int64_t, 4> ne;
    const std::array<int, 4> nr;

    std::string vars() override {
        return VARS_TO_STR3(type, ne, nr);
    }

    size_t op_size(ggml_tensor * t) override {
        return ggml_nbytes(t) * 3;
    }

    test_bin_bcast(op_t op, ggml_type type = GGML_TYPE_F32,
            std::array<int64_t, 4> ne = {10, 10, 1, 1},
            std::array<int, 4> nr = {1, 2, 1, 1})
        : op(op), type(type), ne(ne), nr(nr) {}

    ggml_tensor * build_graph(ggml_context * ctx) override {
        ggml_tensor * a = ggml_new_tensor_4d(ctx, type, ne[0]*nr[0], ne[1]*nr[1], ne[2]*nr[2], ne[3]*nr[3]);
        ggml_tensor * b = ggml_new_tensor(ctx, type, 4, ne.data());
        ggml_tensor * out = op(ctx, a, b);
        return out;
    }

    void initialize_tensors(ggml_context * ctx) override {
        for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
            if (op == ggml_div) {
                // avoid division by zero
                init_tensor_uniform(t, 1.0f, 2.0f);
            } else {
                init_tensor_uniform(t);
            }
        }
    }
};

// GGML_OP_SCALE
struct test_scale : public test_case {
    const ggml_type type;
    const std::array<int64_t, 4> ne;
    float scale;

    std::string vars() override {
        return VARS_TO_STR3(type, ne, scale);
    }

    test_scale(ggml_type type = GGML_TYPE_F32,
            std::array<int64_t, 4> ne = {10, 10, 10, 10},
            float scale = 2.0f)
        : type(type), ne(ne), scale(scale) {}

    ggml_tensor * build_graph(ggml_context * ctx) override {
        ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
        ggml_tensor * out = ggml_scale(ctx, a, scale);
        return out;
    }
};

// GGML_OP_NORM
struct test_norm : public test_case {
    const ggml_type type;
    const std::array<int64_t, 4> ne;
    float eps;

    std::string vars() override {
        return VARS_TO_STR3(type, ne, eps);
    }

    test_norm(ggml_type type = GGML_TYPE_F32,
            std::array<int64_t, 4> ne = {64, 10, 10, 10},
            float eps = 1e-6f)
        : type(type), ne(ne), eps(eps) {}

    ggml_tensor * build_graph(ggml_context * ctx) override {
        ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
        ggml_tensor * out = ggml_norm(ctx, a, eps);
        return out;
    }
};

// GGML_OP_RMS_NORM
struct test_rms_norm : public test_case {
    const ggml_type type;
    const std::array<int64_t, 4> ne;
    float eps;

    std::string vars() override {
        return VARS_TO_STR3(type, ne, eps);
    }

    test_rms_norm(ggml_type type = GGML_TYPE_F32,
            std::array<int64_t, 4> ne = {64, 10, 10, 10},
            float eps = 1e-6f)
        : type(type), ne(ne), eps(eps) {}

    ggml_tensor * build_graph(ggml_context * ctx) override {
        ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
        ggml_tensor * out = ggml_rms_norm(ctx, a, eps);
        return out;
    }
};

// GGML_OP_MUL_MAT
struct test_mul_mat : public test_case {
    const ggml_type type_a;
    const ggml_type type_b;
    const int64_t m;
    const int64_t n;
    const int64_t k;
    const std::array<int64_t, 2> bs; // dims 3 and 4
    const std::array<int64_t, 2> nr; // repeat in dims 3 and 4

    std::string vars() override {
        return VARS_TO_STR7(type_a, type_b, m, n, k, bs, nr);
    }

    double max_nmse_err() override {
        return 5e-4;
    }

    size_t op_size(ggml_tensor * t) override {
        size_t a = ggml_nbytes(t->src[0]) * n * nr[0] * nr[1];
        size_t b = ggml_nbytes(t->src[1]) * m;
        size_t c = ggml_nbytes(t);
        return a + b + c;

        GGML_UNUSED(t);
    }

    test_mul_mat(ggml_type type_a = GGML_TYPE_F32, ggml_type type_b = GGML_TYPE_F32,
            int64_t m = 32, int64_t n = 32, int64_t k = 32,
            std::array<int64_t, 2> bs = {10, 10},
            std::array<int64_t, 2> nr = {2, 2})
        : type_a(type_a), type_b(type_b), m(m), n(n), k(k), bs(bs), nr(nr) {}

    ggml_tensor * build_graph(ggml_context * ctx) override {
        // C^T = A * B^T: (k, m) * (k, n) => (m, n)
        ggml_tensor * a = ggml_new_tensor_4d(ctx, type_a, k, m, bs[0] , bs[1]);
        ggml_tensor * b = ggml_new_tensor_4d(ctx, type_b, k, n, bs[0]*nr[0], bs[1]*nr[1]);
        ggml_tensor * out = ggml_mul_mat(ctx, a, b);
        return out;
    }
};
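
// Shape convention used above: in ggml, ne[0] is the contiguous (row) dimension, so a matrix stored
// as (k, m) has m rows of length k. ggml_mul_mat(ctx, a, b) with a = (k, m, ...) and b = (k, n, ...)
// therefore produces out = (m, n, ...), i.e. the two inputs share the k dimension. The bs/nr batch
// dims in dims 3 and 4 additionally exercise broadcasting of a over b.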

// GGML_OP_MUL_MAT_ID
struct test_mul_mat_id : public test_case {
    const ggml_type type_a;
    const ggml_type type_b;
    const int n_mats;
    const int id;
    const int64_t m;
    const int64_t n;
    const int64_t k;
    const bool v; // view (non-contiguous ids)

    std::string vars() override {
        return VARS_TO_STR8(type_a, type_b, n_mats, id, m, n, k, v);
    }

    double max_nmse_err() override {
        return 5e-4;
    }

    size_t op_size(ggml_tensor * t) override {
        size_t a = ggml_nbytes(t->src[2]) * n;
        size_t b = ggml_nbytes(t->src[1]) * m;
        size_t c = ggml_nbytes(t);
        return a + b + c;

        GGML_UNUSED(t);
    }

    test_mul_mat_id(ggml_type type_a = GGML_TYPE_F32, ggml_type type_b = GGML_TYPE_F32,
            int n_mats = 2, int id = 0,
            int64_t m = 32, int64_t n = 32, int64_t k = 32, bool v = false)
        : type_a(type_a), type_b(type_b), n_mats(n_mats), id(id),
            m(m), n(n), k(k), v(v) {}

    ggml_tensor * build_graph(ggml_context * ctx) override {
        // C^T = A * B^T: (k, m) * (k, n) => (m, n)
        std::vector<ggml_tensor *> mats;
        for (int i = 0; i < n_mats; i++) {
            ggml_tensor * a = ggml_new_tensor_2d(ctx, type_a, k, m);
            mats.push_back(a);
        }
        ggml_tensor * ids = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, n_mats, n);
        if (v) {
            ids = ggml_view_2d(ctx, ids, n_mats/2, ids->ne[1], ids->nb[1], 0);
        }
        ggml_tensor * b = ggml_new_tensor_2d(ctx, type_b, k, n);
        ggml_tensor * out = ggml_mul_mat_id(ctx, mats.data(), n_mats, ids, v ? id/2 : id, b);
        return out;
    }

    void initialize_tensors(ggml_context * ctx) override {
        std::random_device rd;
        std::default_random_engine rng(rd());
        for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
            if (t->type == GGML_TYPE_I32) {
                if (ggml_is_view_op(t->op)) { continue; }
                // ids
                for (int64_t r = 0; r < ggml_nrows(t); r++) {
                    std::vector<int32_t> data(t->ne[0]);
                    for (int i = 0; i < t->ne[0]; i++) {
                        data[i] = i % n_mats;
                    }
                    std::shuffle(data.begin(), data.end(), rng);
                    ggml_backend_tensor_set(t, data.data(), r * t->nb[1], t->ne[0] * sizeof(int32_t));
                }
            } else {
                init_tensor_uniform(t);
            }
        }
    }
};
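
// GGML_OP_MUL_MAT_ID multiplies each row of b against one matrix chosen from `mats`: entry `id` of
// the corresponding row of the integer `ids` tensor selects which of the n_mats matrices to use.
// This is the building block behind the Mixtral-style MOE test further down, where the ids come
// from ggml_top_k over the gating probabilities.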

// GGML_OP_SQR
struct test_sqr : public test_case {
    const ggml_type type;
    const std::array<int64_t, 4> ne;

    std::string vars() override {
        return VARS_TO_STR2(type, ne);
    }

    test_sqr(ggml_type type = GGML_TYPE_F32,
            std::array<int64_t, 4> ne = {10, 10, 10, 10})
        : type(type), ne(ne) {}

    ggml_tensor * build_graph(ggml_context * ctx) override {
        ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
        ggml_tensor * out = ggml_sqr(ctx, a);
        return out;
    }
};

// GGML_OP_CLAMP
struct test_clamp : public test_case {
    const ggml_type type;
    const std::array<int64_t, 4> ne;
    float min;
    float max;

    std::string vars() override {
        return VARS_TO_STR4(type, ne, min, max);
    }

    test_clamp(ggml_type type = GGML_TYPE_F32,
            std::array<int64_t, 4> ne = {10, 10, 10, 10},
            float min = -0.5f, float max = 0.5f)
        : type(type), ne(ne), min(min), max(max) {}

    ggml_tensor * build_graph(ggml_context * ctx) override {
        ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
        ggml_tensor * out = ggml_clamp(ctx, a, min, max);
        return out;
    }
};

// GGML_OP_DIAG_MASK_INF
struct test_diag_mask_inf : public test_case {
    const ggml_type type;
    const std::array<int64_t, 4> ne;
    const int n_past;

    std::string vars() override {
        return VARS_TO_STR3(type, ne, n_past);
    }

    test_diag_mask_inf(ggml_type type = GGML_TYPE_F32,
            std::array<int64_t, 4> ne = {10, 10, 10, 10},
            int n_past = 5)
        : type(type), ne(ne), n_past(n_past) {}

    ggml_tensor * build_graph(ggml_context * ctx) override {
        ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
        ggml_tensor * out = ggml_diag_mask_inf(ctx, a, n_past);
        return out;
    }
};

// GGML_OP_SOFT_MAX
struct test_soft_max : public test_case {
    const ggml_type type;
    const std::array<int64_t, 4> ne;

    std::string vars() override {
        return VARS_TO_STR2(type, ne);
    }

    test_soft_max(ggml_type type = GGML_TYPE_F32,
            std::array<int64_t, 4> ne = {10, 10, 10, 10})
        : type(type), ne(ne) {}

    ggml_tensor * build_graph(ggml_context * ctx) override {
        ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
        ggml_tensor * out = ggml_soft_max(ctx, a);
        return out;
    }
};

// GGML_OP_ROPE
struct test_rope : public test_case {
    const ggml_type type;
    const std::array<int64_t, 4> ne;
    int n_dims;
    int mode;
    int n_ctx;

    std::string vars() override {
        return VARS_TO_STR5(type, ne, n_dims, mode, n_ctx);
    }

    test_rope(ggml_type type = GGML_TYPE_F32,
            std::array<int64_t, 4> ne = {10, 10, 10, 1},
            int n_dims = 10, int mode = 0, int n_ctx = 512)
        : type(type), ne(ne), n_dims(n_dims), mode(mode), n_ctx(n_ctx) {}

    ggml_tensor * build_graph(ggml_context * ctx) override {
        ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
        ggml_tensor * pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, ne[2]);
        ggml_tensor * out = ggml_rope(ctx, a, pos, n_dims, mode, n_ctx);
        return out;
    }

    void initialize_tensors(ggml_context * ctx) override {
        for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
            if (t->type == GGML_TYPE_I32) {
                // pos
                std::vector<int> data(ne[2]);
                for (int i = 0; i < ne[2]; i++) {
                    data[i] = rand() % n_ctx;
                }
                ggml_backend_tensor_set(t, data.data(), 0, ne[2] * sizeof(int));
            } else {
                init_tensor_uniform(t);
            }
        }
    }
};

// GGML_OP_ALIBI
struct test_alibi : public test_case {
    const ggml_type type;
    const std::array<int64_t, 4> ne;
    int n_past;
    int n_head;
    float bias_max;

    std::string vars() override {
        return VARS_TO_STR5(type, ne, n_past, n_head, bias_max);
    }

    test_alibi(ggml_type type = GGML_TYPE_F32,
            std::array<int64_t, 4> ne = {10, 10, 10, 10},
            int n_past = 512, int n_head = 10, float bias_max = 0.5f)
        : type(type), ne(ne), n_past(n_past), n_head(n_head), bias_max(bias_max) {}

    ggml_tensor * build_graph(ggml_context * ctx) override {
        ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
        ggml_tensor * out = ggml_alibi(ctx, a, n_past, n_head, bias_max);
        return out;
    }
};

// GGML_OP_IM2COL
struct test_im2col : public test_case {
    const ggml_type type_input;
    const ggml_type type_kernel;
    const std::array<int64_t, 4> ne_input;
    const std::array<int64_t, 4> ne_kernel;
    // stride
    const int s0;
    const int s1;
    // padding
    const int p0;
    const int p1;
    // dilation
    const int d0;
    const int d1;
    // mode
    const bool is_2D;

    std::string vars() override {
        return VARS_TO_STR11(type_input, type_kernel, ne_input, ne_kernel, s0, s1, p0, p1, d0, d1, is_2D);
    }

    test_im2col(ggml_type type_input = GGML_TYPE_F32, ggml_type type_kernel = GGML_TYPE_F16,
            std::array<int64_t, 4> ne_input = {10, 10, 3, 1}, // [input_width, input_height, input_channels, 1]
            std::array<int64_t, 4> ne_kernel = {3, 3, 3, 1}, // [kernel_width, kernel_height, input_channels, 1]
            int s0 = 1, int s1 = 1,
            int p0 = 1, int p1 = 1,
            int d0 = 1, int d1 = 1,
            bool is_2D = true)
        : type_input(type_input), type_kernel(type_kernel), ne_input(ne_input), ne_kernel(ne_kernel), s0(s0), s1(s1), p0(p0), p1(p1), d0(d0), d1(d1), is_2D(is_2D) {}

    ggml_tensor * build_graph(ggml_context * ctx) override {
        ggml_tensor * input = ggml_new_tensor(ctx, type_input, 4, ne_input.data());
        ggml_tensor * kernel = ggml_new_tensor(ctx, type_kernel, 4, ne_kernel.data());
        ggml_tensor * out = ggml_im2col(ctx, kernel, input, s0, s1, p0, p1, d0, d1, is_2D);
        return out;
    }
};
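
// ggml_im2col unfolds the input into a matrix of kernel-sized patches so that a convolution can be
// computed as a matrix multiplication; the kernel tensor is passed first, matching the argument
// order used above. s0/s1, p0/p1 and d0/d1 are the per-axis stride, padding and dilation, and
// is_2D toggles between 1D and 2D unfolding.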

// GGML_OP_CONCAT
struct test_concat : public test_case {
    const ggml_type type;
    const std::array<int64_t, 4> ne;
    const int64_t b_ne2;

    std::string vars() override {
        return VARS_TO_STR3(type, ne, b_ne2);
    }

    test_concat(ggml_type type = GGML_TYPE_F32,
            std::array<int64_t, 4> ne = {10, 10, 10, 10},
            int64_t b_ne2 = 10)
        : type(type), ne(ne), b_ne2(b_ne2) {}

    ggml_tensor * build_graph(ggml_context * ctx) override {
        ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
        ggml_tensor * b = ggml_new_tensor_4d(ctx, type, ne[0], ne[1], b_ne2, ne[3]);
        ggml_tensor * out = ggml_concat(ctx, a, b);
        return out;
    }
};

// GGML_OP_ARGSORT
struct test_argsort : public test_case {
    const ggml_type type;
    const std::array<int64_t, 4> ne;
    ggml_sort_order order;

    std::string vars() override {
        return VARS_TO_STR3(type, ne, order);
    }

    test_argsort(ggml_type type = GGML_TYPE_F32,
            std::array<int64_t, 4> ne = {16, 10, 10, 10},
            ggml_sort_order order = GGML_SORT_ASC)
        : type(type), ne(ne), order(order) {}

    ggml_tensor * build_graph(ggml_context * ctx) override {
        ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
        ggml_tensor * out = ggml_argsort(ctx, a, order);
        return out;
    }

    void initialize_tensors(ggml_context * ctx) override {
        std::random_device rd;
        std::default_random_engine rng(rd());
        for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
            if (t->type == GGML_TYPE_I32) {
                // indices
                std::vector<int> data(ggml_nelements(t));
                for (int i = 0; i < ggml_nelements(t); i++) {
                    data[i] = rand();
                }
                std::shuffle(data.begin(), data.end(), rng);
                ggml_backend_tensor_set(t, data.data(), 0, ne[0]*ne[1]*ne[2]*ne[3] * sizeof(int));
            } else if (t->type == GGML_TYPE_F32) {
                // initialize with unique values to avoid ties
                for (int64_t r = 0; r < ggml_nrows(t); r++) {
                    std::vector<float> data(t->ne[0]);
                    for (int i = 0; i < t->ne[0]; i++) {
                        data[i] = i;
                    }
                    std::shuffle(data.begin(), data.end(), rng);
                    ggml_backend_tensor_set(t, data.data(), r * t->nb[1], t->ne[0] * sizeof(float));
                }
            } else {
                GGML_ASSERT(false);
            }
        }
    }
};

// GGML_OP_SUM_ROWS
struct test_sum_rows : public test_case {
    const ggml_type type;
    const std::array<int64_t, 4> ne;

    std::string vars() override {
        return VARS_TO_STR2(type, ne);
    }

    test_sum_rows(ggml_type type = GGML_TYPE_F32,
            std::array<int64_t, 4> ne = {10, 10, 10, 10})
        : type(type), ne(ne) {}

    ggml_tensor * build_graph(ggml_context * ctx) override {
        ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
        ggml_tensor * out = ggml_sum_rows(ctx, a);
        return out;
    }
};

// GGML_OP_UPSCALE
struct test_upscale : public test_case {
    const ggml_type type;
    const std::array<int64_t, 4> ne;
    const int32_t scale_factor;

    std::string vars() override {
        return VARS_TO_STR3(type, ne, scale_factor);
    }

    test_upscale(ggml_type type = GGML_TYPE_F32,
            std::array<int64_t, 4> ne = {512, 512, 3, 1},
            int32_t scale_factor = 2)
        : type(type), ne(ne), scale_factor(scale_factor) {}

    ggml_tensor * build_graph(ggml_context * ctx) override {
        ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
        ggml_tensor * out = ggml_upscale(ctx, a, scale_factor);
        return out;
    }
};

// GGML_OP_GROUP_NORM
struct test_group_norm : public test_case {
    const ggml_type type;
    const std::array<int64_t, 4> ne;
    const int32_t num_groups;

    std::string vars() override {
        return VARS_TO_STR3(type, ne, num_groups);
    }

    test_group_norm(ggml_type type = GGML_TYPE_F32,
            std::array<int64_t, 4> ne = {64, 64, 320, 1},
            int32_t num_groups = 32)
        : type(type), ne(ne), num_groups(num_groups) {}

    ggml_tensor * build_graph(ggml_context * ctx) override {
        ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
        ggml_tensor * out = ggml_group_norm(ctx, a, num_groups);
        return out;
    }
};

// GGML_OP_ACC
struct test_acc : public test_case {
    const ggml_type type;
    const std::array<int64_t, 4> ne_a;
    const std::array<int64_t, 4> ne_b;

    std::string vars() override {
        return VARS_TO_STR3(type, ne_a, ne_b);
    }

    test_acc(ggml_type type = GGML_TYPE_F32,
            std::array<int64_t, 4> ne_a = {1024, 577, 1, 1},
            std::array<int64_t, 4> ne_b = {1024, 576, 1, 1})
        : type(type), ne_a(ne_a), ne_b(ne_b) {}

    ggml_tensor * build_graph(ggml_context * ctx) override {
        ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne_a.data());
        ggml_tensor * b = ggml_new_tensor(ctx, type, 4, ne_b.data());
        ggml_tensor * out = ggml_acc(ctx, a, b, a->nb[1], a->nb[2], a->nb[3], b->nb[1]);
        return out;
    }
};

// GGML_OP_PAD
struct test_pad : public test_case {
    const ggml_type type;
    const std::array<int64_t, 4> ne_a;
    const int pad_0;
    const int pad_1;

    std::string vars() override {
        return VARS_TO_STR4(type, ne_a, pad_0, pad_1);
    }

    test_pad(ggml_type type = GGML_TYPE_F32,
            std::array<int64_t, 4> ne_a = {512, 512, 1, 1},
            int pad_0 = 1, int pad_1 = 1)
        : type(type), ne_a(ne_a), pad_0(pad_0), pad_1(pad_1) {}

    ggml_tensor * build_graph(ggml_context * ctx) override {
        ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne_a.data());
        ggml_tensor * out = ggml_pad(ctx, a, pad_0, pad_1, 0, 0);
        return out;
    }
};

// GGML_OP_LEAKY_RELU
struct test_leaky_relu : public test_case {
    const ggml_type type;
    const std::array<int64_t, 4> ne_a;
    const float negative_slope;

    std::string vars() override {
        return VARS_TO_STR3(type, ne_a, negative_slope);
    }

    test_leaky_relu(ggml_type type = GGML_TYPE_F32,
            std::array<int64_t, 4> ne_a = {10, 10, 10, 10},
            float negative_slope = 0.1f)
        : type(type), ne_a(ne_a), negative_slope(negative_slope) {}

    ggml_tensor * build_graph(ggml_context * ctx) override {
        ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne_a.data());
        ggml_tensor * out = ggml_leaky_relu(ctx, a, negative_slope, true);
        return out;
    }
};

// Mixtral MOE
struct test_moe : public test_case {
    const int n_experts;
    const int n_experts_per_tok;
    const int n_tokens;
    const int n_embd;
    const int n_ff;

    std::string op_desc(ggml_tensor * t) override {
        return "MOE";

        GGML_UNUSED(t);
    }

    std::string vars() override {
        return VARS_TO_STR5(n_experts, n_experts_per_tok, n_tokens, n_embd, n_ff);
    }

    test_moe(int n_experts = 8, int n_experts_per_tok = 2, int n_tokens = 1, int n_embd = 4096, int n_ff = 14336)
        : n_experts(n_experts), n_experts_per_tok(n_experts_per_tok), n_tokens(n_tokens), n_embd(n_embd), n_ff(n_ff) {
    }

    ggml_tensor * build_graph(ggml_context * ctx) override {
        ggml_tensor * ffn_gate_inp = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_experts);

        std::vector<ggml_tensor *> ffn_up_exp(n_experts);
        std::vector<ggml_tensor *> ffn_gate_exp(n_experts);
        std::vector<ggml_tensor *> ffn_down_exp(n_experts);
        for (int i = 0; i < n_experts; ++i) {
            ffn_up_exp[i] = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ff);
            ffn_gate_exp[i] = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ff);
            ffn_down_exp[i] = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_ff, n_embd);
        }

        ggml_tensor * cur = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_tokens);

        ggml_tensor * logits = ggml_mul_mat(ctx, ffn_gate_inp, cur);
        ggml_tensor * probs = ggml_soft_max_ext(ctx, logits, nullptr, 1.0f/sqrtf(n_embd));

        // select experts
        ggml_tensor * selected_experts = ggml_top_k(ctx, probs, n_experts_per_tok);

        ggml_tensor * weights = ggml_get_rows(ctx,
                ggml_reshape_3d(ctx, probs, 1, n_experts, n_tokens), selected_experts);

        weights = ggml_reshape_2d(ctx, weights, n_experts_per_tok, n_tokens);

        ggml_tensor * weights_sum = ggml_sum_rows(ctx, weights);

        weights = ggml_div(ctx, weights, weights_sum);

        // compute expert outputs
        ggml_tensor * moe_out = nullptr;

        for (int i = 0; i < n_experts_per_tok; ++i) {
            ggml_tensor * cur_expert;

            ggml_tensor * cur_up = ggml_mul_mat_id(ctx, ffn_up_exp.data(), n_experts, selected_experts, i, cur);

            ggml_tensor * cur_gate = ggml_mul_mat_id(ctx, ffn_gate_exp.data(), n_experts, selected_experts, i, cur);

            cur_gate = ggml_silu(ctx, cur_gate);

            cur_expert = ggml_mul(ctx, cur_up, cur_gate);

            cur_expert = ggml_mul_mat_id(ctx, ffn_down_exp.data(), n_experts, selected_experts, i, cur_expert);

            cur_expert = ggml_mul(ctx, cur_expert,
                    ggml_view_2d(ctx, weights, 1, n_tokens, weights->nb[1], i*weights->nb[0]));

            if (i == 0) {
                moe_out = cur_expert;
            } else {
                moe_out = ggml_add(ctx, moe_out, cur_expert);
            }
        }

        cur = moe_out;

        return cur;
    }
};
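
// Routing recap for the MOE graph above: the gating matmul produces per-expert logits for each
// token, ggml_top_k keeps the n_experts_per_tok best experts, their softmax probabilities are
// renormalized to sum to 1 (ggml_div by weights_sum), and each selected expert's SwiGLU feed-forward
// output is scaled by its weight and accumulated into moe_out. This mirrors the Mixtral FFN block.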
  1227. static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op_name) {
  1228. std::vector<std::unique_ptr<test_case>> test_cases;
  1229. std::default_random_engine rng(0);
  1230. const ggml_type all_types[] = {
  1231. GGML_TYPE_F32, GGML_TYPE_F16,
  1232. GGML_TYPE_Q4_0, GGML_TYPE_Q4_1,
  1233. GGML_TYPE_Q5_0, GGML_TYPE_Q5_1,
  1234. GGML_TYPE_Q8_0,
  1235. GGML_TYPE_Q2_K, GGML_TYPE_Q3_K,
  1236. GGML_TYPE_Q4_K, GGML_TYPE_Q5_K,
  1237. GGML_TYPE_Q6_K,
  1238. GGML_TYPE_IQ2_XXS, GGML_TYPE_IQ2_XS,
  1239. };
  1240. // unary ops
  1241. for (int op = 0; op < GGML_UNARY_OP_COUNT; op++) {
  1242. test_cases.emplace_back(new test_unary((ggml_unary_op) op));
  1243. }
  1244. test_cases.emplace_back(new test_get_rows(GGML_TYPE_F32, 1, 8, 2, 1, false));
  1245. for (ggml_type type : all_types) {
  1246. for (int b : {1, 7}) {
  1247. for (bool v : {false, true}) {
  1248. test_cases.emplace_back(new test_get_rows(type, 256, 5, 4, b, v));
  1249. }
  1250. }
  1251. }
  1252. for (int b : {1, 7}) {
  1253. for (bool v : {false, true}) {
  1254. test_cases.emplace_back(new test_get_rows(GGML_TYPE_I32, 256, 5, 4, b, v));
  1255. }
  1256. }
  1257. test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 10, 10, 10}, {1, 1, 1, 1}));
  1258. test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 10, 10, 10}, {2, 1, 1, 1}));
  1259. test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 10, 10, 10}, {1, 2, 1, 1}));
  1260. test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 10, 10, 10}, {1, 1, 2, 1}));
  1261. test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 10, 10, 10}, {1, 1, 1, 2}));
  1262. test_cases.emplace_back(new test_repeat(GGML_TYPE_I32, {10, 10, 10, 10}, {2, 1, 1, 1}));
  1263. test_cases.emplace_back(new test_repeat(GGML_TYPE_I16, {10, 10, 10, 10}, {1, 1, 1, 2}));
  1264. test_cases.emplace_back(new test_dup(GGML_TYPE_F32));
  1265. test_cases.emplace_back(new test_dup(GGML_TYPE_F16));
  1266. test_cases.emplace_back(new test_dup(GGML_TYPE_I32));
  1267. test_cases.emplace_back(new test_dup(GGML_TYPE_I16));
  1268. test_cases.emplace_back(new test_dup(GGML_TYPE_I16, {10, 8, 3, 1}, {0, 2, 1, 3}));
  1269. test_cases.emplace_back(new test_dup(GGML_TYPE_I16, {10, 8, 3, 1}, {1, 2, 0, 3}));
  1270. for (ggml_type type : all_types) {
  1271. test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, type, {256, 10, 10, 1}));
  1272. }
  1273. test_cases.emplace_back(new test_cont());
    auto add_test_bin_bcast = [&](ggml_type type, std::array<int64_t, 4> ne, std::array<int, 4> nr) {
        for (auto op : {ggml_add, ggml_mul, ggml_div}) {
            test_cases.emplace_back(new test_bin_bcast(op, type, ne, nr));
        }
    };

    add_test_bin_bcast(GGML_TYPE_F32, {1, 1, 8, 1}, {1, 1, 1, 1});
    add_test_bin_bcast(GGML_TYPE_F32, {1, 1, 1, 1}, {32, 1, 1, 1});
    add_test_bin_bcast(GGML_TYPE_F32, {1, 1, 320, 320}, {1, 1, 1, 1});
    add_test_bin_bcast(GGML_TYPE_F32, {16, 10, 1, 1}, {1, 1, 1, 1});
    add_test_bin_bcast(GGML_TYPE_F32, {16, 10, 10, 1}, {1, 1, 1, 1});
    add_test_bin_bcast(GGML_TYPE_F32, {16, 10, 10, 10}, {1, 1, 1, 1});
    add_test_bin_bcast(GGML_TYPE_F32, {16, 10, 10, 10}, {2, 1, 1, 1});
    add_test_bin_bcast(GGML_TYPE_F32, {16, 10, 10, 10}, {1, 2, 1, 1});
    add_test_bin_bcast(GGML_TYPE_F32, {16, 10, 10, 10}, {1, 1, 2, 1});
    add_test_bin_bcast(GGML_TYPE_F32, {16, 10, 10, 10}, {1, 1, 1, 2});
    add_test_bin_bcast(GGML_TYPE_F32, {16, 10, 10, 10}, {1, 1, 2, 2});
    add_test_bin_bcast(GGML_TYPE_F32, {16, 10, 10, 10}, {1, 2, 2, 2});
    add_test_bin_bcast(GGML_TYPE_F32, {16, 10, 10, 10}, {2, 2, 2, 2});

    // stable diffusion
    add_test_bin_bcast(GGML_TYPE_F32, {1280, 1, 1, 1}, {1, 1, 1, 1});
    add_test_bin_bcast(GGML_TYPE_F32, {1280, 1, 1, 1}, {1, 16, 16, 1});
    add_test_bin_bcast(GGML_TYPE_F32, {1280, 16, 16, 1}, {1, 1, 1, 1});
    add_test_bin_bcast(GGML_TYPE_F32, {1280, 1, 1, 1}, {1, 256, 1, 1});
    add_test_bin_bcast(GGML_TYPE_F32, {1, 1, 1280, 1}, {16, 16, 1, 1});
    add_test_bin_bcast(GGML_TYPE_F32, {16, 16, 1280, 1}, {1, 1, 1, 1});
    add_test_bin_bcast(GGML_TYPE_F32, {1, 1, 1920, 1}, {16, 16, 1, 1});
    add_test_bin_bcast(GGML_TYPE_F32, {1, 1, 2560, 1}, {16, 16, 1, 1});
    add_test_bin_bcast(GGML_TYPE_F32, {1, 1, 1280, 1}, {32, 32, 1, 1});
    add_test_bin_bcast(GGML_TYPE_F32, {1, 1, 1920, 1}, {32, 32, 1, 1});
    add_test_bin_bcast(GGML_TYPE_F32, {1, 1, 640, 1}, {32, 32, 1, 1});
    add_test_bin_bcast(GGML_TYPE_F32, {5120, 1, 1, 1}, {1, 256, 1, 1});
    add_test_bin_bcast(GGML_TYPE_F32, {640, 1, 1, 1}, {1, 1, 1, 1});
    //add_test_bin_bcast(GGML_TYPE_F32, {3, 3, 2560, 1280}, {1, 1, 1, 1});
    //add_test_bin_bcast(GGML_TYPE_F32, {3, 3, 2560, 1280}, {2, 1, 1, 1});

    test_cases.emplace_back(new test_scale());
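
    // norm / rms_norm over a range of epsilon values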
    for (float eps : {1e-6f, 1e-5f, 1e-3f, 1e-1f}) {
        test_cases.emplace_back(new test_norm(GGML_TYPE_F32, {64, 10, 10, 10}, eps));
        test_cases.emplace_back(new test_rms_norm(GGML_TYPE_F32, {64, 10, 10, 10}, eps));
    }
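
    // mul_mat: every weight type against F32/F16 activations, with and without batch and broadcast dims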
    for (ggml_type type_a : all_types) {
        for (ggml_type type_b : {GGML_TYPE_F32, GGML_TYPE_F16}) {
            test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16,  1, 256, { 1,  1}, {1, 1}));
            test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16,  1, 256, {10,  1}, {1, 1}));
            test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16,  1, 256, {10,  1}, {2, 1}));
            test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16,  1, 256, {10, 10}, {1, 1}));
            test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16,  1, 256, {10, 10}, {2, 1}));
            test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16,  1, 256, {10, 10}, {1, 2}));
            test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16,  1, 256, {10, 10}, {2, 2}));

            test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, { 1,  1}, {1, 1}));
            test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, {10,  1}, {1, 1}));
            test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, {10,  1}, {2, 1}));
            test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, {10, 10}, {1, 1}));
            test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, {10, 10}, {2, 1}));
            test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, {10, 10}, {1, 2}));
            test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, {10, 10}, {2, 2}));
        }
    }
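
    // mul_mat_id: indirect matrix multiplication with expert selection (as used by MoE),
    // iterating over expert counts, the selected expert index, and both settings of v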
    for (ggml_type type_a : all_types) {
        for (ggml_type type_b : {GGML_TYPE_F32 /*, GGML_TYPE_F16 */}) {
            for (int n_mats : {2, 4, 8}) {
                for (int id = 0; id < n_mats; id++) {
                    for (bool v : {false, true}) {
                        test_cases.emplace_back(new test_mul_mat_id(type_a, type_b, n_mats, id, 16, 16, 256, v));
                    }
                }
            }
        }
    }
    test_cases.emplace_back(new test_sqr());
    test_cases.emplace_back(new test_clamp());

    test_cases.emplace_back(new test_diag_mask_inf(GGML_TYPE_F32, {10, 10,  1,  1}, 5));
    test_cases.emplace_back(new test_diag_mask_inf(GGML_TYPE_F32, {10, 10, 10,  1}, 5));
    test_cases.emplace_back(new test_diag_mask_inf(GGML_TYPE_F32, {10, 10, 10, 10}, 5));
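
    // soft_max over randomly sized inputs: ne1 drawn from [1, 50], ne0 drawn from ranges that double up to 2^17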
    std::uniform_int_distribution<> dist_ne1(1, 50);
    int exponent = 1;
    while (exponent < (1 << 17)) {
        std::uniform_int_distribution<> dist_ne0(exponent, 2*exponent);

        for (int n = 0; n < 10; ++n) {
            int64_t ne0 = dist_ne0(rng);
            int64_t ne1 = dist_ne1(rng);
            test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {ne0, ne1, 1, 1}));
        }

        exponent <<= 1;
    }
    for (ggml_type type : {GGML_TYPE_F32, GGML_TYPE_F16}) {
        test_cases.emplace_back(new test_rope(type, {128,  32, 10, 1}, 128, 0, 512)); // llama 7B
        test_cases.emplace_back(new test_rope(type, {128,  40, 10, 1}, 128, 0, 512)); // llama 13B
        test_cases.emplace_back(new test_rope(type, {128,  52, 10, 1}, 128, 0, 512)); // llama 30B
        test_cases.emplace_back(new test_rope(type, {128,  64, 10, 1}, 128, 0, 512)); // llama 65B
        test_cases.emplace_back(new test_rope(type, { 64,   1, 10, 1},  64, 2, 512)); // neox (falcon 7B)
        test_cases.emplace_back(new test_rope(type, { 64,  71, 10, 1},  64, 2, 512)); // neox (falcon 7B)
        test_cases.emplace_back(new test_rope(type, { 64,   8, 10, 1},  64, 2, 512)); // neox (falcon 40B)
        test_cases.emplace_back(new test_rope(type, { 64, 128, 10, 1},  64, 2, 512)); // neox (falcon 40B)
        test_cases.emplace_back(new test_rope(type, { 80,  32, 10, 1},  20, 2, 512)); // neox (stablelm)
        test_cases.emplace_back(new test_rope(type, { 80,  32, 10, 1},  32, 2, 512)); // neox (phi-2)
    }
    test_cases.emplace_back(new test_alibi());
    test_cases.emplace_back(new test_im2col());
    test_cases.emplace_back(new test_concat(GGML_TYPE_F32));
    test_cases.emplace_back(new test_concat(GGML_TYPE_I32));

    for (ggml_sort_order order : {GGML_SORT_ASC, GGML_SORT_DESC}) {
        test_cases.emplace_back(new test_argsort(GGML_TYPE_F32, {8, 1, 1, 1}, order));
        test_cases.emplace_back(new test_argsort(GGML_TYPE_F32, {16, 10, 10, 10}, order));
    }

    test_cases.emplace_back(new test_sum_rows());
    test_cases.emplace_back(new test_upscale());
    test_cases.emplace_back(new test_group_norm());
    test_cases.emplace_back(new test_acc());
    test_cases.emplace_back(new test_pad());
    test_cases.emplace_back(new test_leaky_relu());

#if !defined(__SANITIZE_THREAD__)
    // FIXME: these tests use too much memory with thread sanitizer
    test_cases.emplace_back(new test_moe(8, 2, 1, 4096, 8*1024));
    //test_cases.emplace_back(new test_moe(8, 2, 8, 4096, 14336));
#endif
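
    // MODE_TEST compares every case against the CPU backend for correctness;
    // MODE_PERF only measures the run time of each op on the given backend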
    // run tests
    if (mode == MODE_TEST) {
        ggml_backend_t backend_cpu = ggml_backend_cpu_init();

        size_t n_ok = 0;
        for (auto & test : test_cases) {
            if (test->eval(backend, backend_cpu, op_name)) {
                n_ok++;
            }
        }
        printf("  %zu/%zu tests passed\n", n_ok, test_cases.size());

        ggml_backend_free(backend_cpu);

        return n_ok == test_cases.size();
    }

    if (mode == MODE_PERF) {
        for (auto & test : test_cases) {
            test->eval_perf(backend, op_name);
        }
        return true;
    }

    GGML_ASSERT(false);
    return false;
}
static void usage(char ** argv) {
    printf("Usage: %s [mode] [-o op] [-b backend]\n", argv[0]);
    printf("    valid modes are: test (compare with CPU backend for correctness) or perf (performance evaluation)\n");
    printf("    op names are as given by ggml_op_desc()\n");
}
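
// illustrative invocations (backend names are those printed by the enumeration in main below):
//   ./test-backend-ops test                     # compare all ops on all backends against the CPU backend
//   ./test-backend-ops perf -o MUL_MAT -b CPU   # time only the MUL_MAT op on the CPU backend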
int main(int argc, char ** argv) {
    test_mode mode = MODE_TEST;
    const char * op_name = NULL;
    const char * backend = NULL;

    for (int i = 1; i < argc; i++) {
        if (strcmp(argv[i], "test") == 0) {
            mode = MODE_TEST;
        } else if (strcmp(argv[i], "perf") == 0) {
            mode = MODE_PERF;
        } else if (strcmp(argv[i], "-o") == 0) {
            if (i + 1 < argc) {
                op_name = argv[++i];
            } else {
                usage(argv);
                return 1;
            }
        } else if (strcmp(argv[i], "-b") == 0) {
            if (i + 1 < argc) {
                backend = argv[++i];
            } else {
                usage(argv);
                return 1;
            }
        } else {
            usage(argv);
            return 1;
        }
    }
    // enumerate backends
    printf("Testing %zu backends\n\n", ggml_backend_reg_get_count());

    size_t n_ok = 0;

    for (size_t i = 0; i < ggml_backend_reg_get_count(); i++) {
        printf("Backend %zu/%zu (%s)\n", i + 1, ggml_backend_reg_get_count(), ggml_backend_reg_get_name(i));

        if (backend != NULL && strcmp(backend, ggml_backend_reg_get_name(i)) != 0) {
            printf("  Skipping\n");
            n_ok++;
            continue;
        }
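
        // note: this ggml_backend_t shadows the backend name string parsed from the command line above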
        ggml_backend_t backend = ggml_backend_reg_init_backend(i, NULL);
        GGML_ASSERT(backend != NULL);

        printf("  Backend name: %s\n", ggml_backend_name(backend));

        bool ok = test_backend(backend, mode, op_name);

        printf("  Backend %s: ", ggml_backend_name(backend));
        if (ok) {
            printf("\033[1;32mOK\033[0m\n");
            n_ok++;
        } else {
            printf("\033[1;31mFAIL\033[0m\n");
        }

        printf("\n");

        ggml_backend_free(backend);
    }

    printf("%zu/%zu backends passed\n", n_ok, ggml_backend_reg_get_count());

    if (n_ok != ggml_backend_reg_get_count()) {
        printf("\033[1;31mFAIL\033[0m\n");
        return 1;
    }

    ggml_quantize_free();

    printf("\033[1;32mOK\033[0m\n");
    return 0;
}