test-backend-ops.cpp

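// Correctness and performance tests for ggml backend ops: each test case below
// builds a small compute graph, then either compares its outputs element-wise
// across two backends (MODE_TEST) or times repeated evaluation of the op on a
// single backend (MODE_PERF).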
  1. #include <ggml.h>
  2. #include <ggml-alloc.h>
  3. #include <ggml-backend.h>
  4. #include <ggml-backend-impl.h>
  5. #include <algorithm>
  6. #include <array>
  7. #include <cfloat>
  8. #include <cstring>
  9. #include <functional>
  10. #include <memory>
  11. #include <random>
  12. #include <stdio.h>
  13. #include <stdlib.h>
  14. #include <string>
  15. #include <thread>
  16. #include <vector>
  17. static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float max = 1.0f) {
  18. // static RNG initialization (revisit if n_threads stops being constant)
  19. static const size_t n_threads = std::thread::hardware_concurrency();
  20. static std::vector<std::default_random_engine> generators = []() {
  21. std::random_device rd;
  22. std::vector<std::default_random_engine> vec;
  23. vec.reserve(n_threads);
  24. //for (size_t i = 0; i < n_threads; i++) { vec.emplace_back(1234 + i); } // fixed seed
  25. for (size_t i = 0; i < n_threads; i++) { vec.emplace_back(rd()); }
  26. return vec;
  27. }();
  28. size_t size = ggml_nelements(tensor);
  29. std::vector<float> data(size);
  30. auto init_thread = [&](size_t ith, size_t start, size_t end) {
  31. std::uniform_real_distribution<float> distribution(min, max);
  32. for (size_t i = start; i < end; i++) {
  33. data[i] = distribution(generators[ith]);
  34. }
  35. };
  36. std::vector<std::thread> threads;
  37. threads.reserve(n_threads);
  38. for (size_t i = 0; i < n_threads; i++) {
  39. size_t start = i*size/n_threads;
  40. size_t end = (i+1)*size/n_threads;
  41. threads.emplace_back(init_thread, i, start, end);
  42. }
  43. for (auto & t : threads) {
  44. t.join();
  45. }
  46. if (tensor->type == GGML_TYPE_F32 || tensor->type == GGML_TYPE_I32) {
  47. ggml_backend_tensor_set(tensor, data.data(), 0, size * sizeof(float));
  48. } else if (ggml_is_quantized(tensor->type) || tensor->type == GGML_TYPE_F16) {
  49. GGML_ASSERT(size % ggml_blck_size(tensor->type) == 0);
  50. std::vector<uint8_t> dataq(ggml_row_size(tensor->type, size));
  51. int64_t hist[16];
  52. std::vector<float> imatrix(tensor->ne[0], 1.0f); // dummy importance matrix
  53. const float * im = imatrix.data();
  54. if (!ggml_quantize_requires_imatrix(tensor->type)) {
  55. // when the imatrix is optional, we want to test both quantization with and without imatrix
  56. // use one of the random numbers to decide
  57. if (data[0] > 0.5f*(min + max)) {
  58. im = nullptr;
  59. }
  60. }
  61. ggml_quantize_chunk(tensor->type, data.data(), dataq.data(), 0, size/tensor->ne[0], tensor->ne[0], hist, im);
  62. ggml_backend_tensor_set(tensor, dataq.data(), 0, dataq.size());
  63. } else if (tensor->type == GGML_TYPE_I8 || tensor->type == GGML_TYPE_I16 || tensor->type == GGML_TYPE_I32) {
  64. // This is going to create some weird integers though.
  65. ggml_backend_tensor_set(tensor, data.data(), 0, ggml_nbytes(tensor));
  66. } else {
  67. GGML_ASSERT(false);
  68. }
  69. }
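// Read a tensor back from backend memory and convert it to a flat vector of
// floats (dequantizing block-wise where needed), so that tensors of any type
// can be compared with a single code path.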
  70. static std::vector<float> tensor_to_float(const ggml_tensor * t) {
  71. std::vector<float> tv;
  72. tv.reserve(ggml_nelements(t));
  73. std::vector<uint8_t> buf(ggml_nbytes(t));
  74. ggml_backend_tensor_get(t, buf.data(), 0, ggml_nbytes(t));
  75. ggml_type_traits_t tt = ggml_internal_get_type_traits(t->type);
  76. size_t bs = ggml_blck_size(t->type);
  77. std::vector<float> vq(ggml_blck_size(t->type));
  78. bool quantized = ggml_is_quantized(t->type);
  79. // access elements by index to avoid gaps in views
  80. for (int64_t i3 = 0; i3 < t->ne[3]; i3++) {
  81. for (int64_t i2 = 0; i2 < t->ne[2]; i2++) {
  82. for (int64_t i1 = 0; i1 < t->ne[1]; i1++) {
  83. for (int64_t i0 = 0; i0 < t->ne[0]; i0 += bs) {
  84. size_t i = i3*t->nb[3] + i2*t->nb[2] + i1*t->nb[1] + i0/bs*t->nb[0];
  85. if (t->type == GGML_TYPE_F16) {
  86. tv.push_back(ggml_fp16_to_fp32(*(ggml_fp16_t*)&buf[i]));
  87. } else if (t->type == GGML_TYPE_F32) {
  88. tv.push_back(*(float *) &buf[i]);
  89. } else if (t->type == GGML_TYPE_I32) {
  90. tv.push_back((float)*(int32_t *) &buf[i]);
  91. } else if (t->type == GGML_TYPE_I16) {
  92. tv.push_back((float)*(int16_t *) &buf[i]);
  93. } else if (t->type == GGML_TYPE_I8) {
  94. tv.push_back((float)*(int8_t *) &buf[i]);
  95. } else if (quantized) {
97. tt.to_float(&buf[i], vq.data(), ggml_blck_size(t->type));
98. tv.insert(tv.end(), vq.begin(), vq.end());
  99. } else {
  100. GGML_ASSERT(false);
  101. }
  102. }
  103. }
  104. }
  105. }
  106. return tv;
  107. }
  108. /*
  109. static double cosine_similarity(const float * v1, const float * v2, size_t n) {
  110. double dot = 0.0;
  111. double mag1 = 0.0;
  112. double mag2 = 0.0;
  113. for (size_t i = 0; i < n; i++) {
  114. if (std::isnan(v1[i]) || std::isnan(v2[i])) {
  115. return -1.0f;
  116. }
  117. if (std::isinf(v1[i]) && std::isinf(v2[i])) {
  118. continue;
  119. }
  120. dot += v1[i]*v2[i];
  121. mag1 += v1[i]*v1[i];
  122. mag2 += v2[i]*v2[i];
  123. }
  124. return dot/sqrt(mag1*mag2);
  125. }
  126. static float distance(const float * v1, const float * v2, size_t n) {
  127. double d = 0.0;
  128. for (size_t i = 0; i < n; i++) {
  129. if (std::isnan(v1[i]) || std::isnan(v2[i])) {
  130. return INFINITY;
  131. }
  132. if (std::isinf(v1[i]) && std::isinf(v2[i])) {
  133. continue;
  134. }
  135. d += (v1[i] - v2[i])*(v1[i] - v2[i]);
  136. }
  137. return sqrt(d);
  138. }
  139. static float vec_len(const float * v, size_t n) {
  140. double d = 0.0;
  141. for (size_t i = 0; i < n; i++) {
  142. if (std::isnan(v[i])) {
  143. return INFINITY;
  144. }
  145. if (std::isinf(v[i])) {
  146. continue;
  147. }
  148. d += v[i]*v[i];
  149. }
  150. return sqrt(d);
  151. }
  152. */
  153. // normalized mean squared error = mse(a, b) / mse(a, 0)
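// e.g. a = {1, 2}, b = {1.1, 2.2}: sum of squared diffs = 0.05, sum of squares of a = 5, nmse = 0.01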
  154. static double nmse(const float * a, const float * b, size_t n) {
  155. double mse_a_b = 0.0;
  156. double mse_a_0 = 0.0;
  157. for (size_t i = 0; i < n; i++) {
  158. float a_i = a[i];
  159. float b_i = b[i];
  160. mse_a_b += (a_i - b_i) * (a_i - b_i);
  161. mse_a_0 += a_i * a_i;
  162. }
  163. return mse_a_b / mse_a_0;
  164. }
  165. // utils for printing the variables of the test cases
  166. #define VAR_TO_STR(x) (#x "=" + var_to_str(x))
  167. template<typename T>
  168. static std::string var_to_str(const T & x) {
  169. return std::to_string(x);
  170. }
  171. template<typename T, size_t N>
  172. static std::string var_to_str(const T (&x)[N]) {
  173. std::string s = "[";
  174. for (size_t i = 0; i < N; i++) {
  175. if (i > 0) {
  176. s += ",";
  177. }
  178. s += var_to_str(x[i]);
  179. }
  180. s += "]";
  181. return s;
  182. }
  183. template<typename T, size_t N>
  184. static std::string var_to_str(const std::array<T, N> & x) {
  185. std::string s = "[";
  186. for (size_t i = 0; i < N; i++) {
  187. if (i > 0) {
  188. s += ",";
  189. }
  190. s += var_to_str(x[i]);
  191. }
  192. s += "]";
  193. return s;
  194. }
  195. //static std::string var_to_str(ggml_unary_op unary_op) {
  196. // return ggml_unary_op_name(unary_op);
  197. //}
  198. static std::string var_to_str(ggml_type type) {
  199. return ggml_type_name(type);
  200. }
  201. #define VARS_TO_STR1(a) VAR_TO_STR(a)
  202. #define VARS_TO_STR2(a, b) VAR_TO_STR(a) + "," + VAR_TO_STR(b)
  203. #define VARS_TO_STR3(a, b, c) VAR_TO_STR(a) + "," + VARS_TO_STR2(b, c)
  204. #define VARS_TO_STR4(a, b, c, d) VAR_TO_STR(a) + "," + VARS_TO_STR3(b, c, d)
  205. #define VARS_TO_STR5(a, b, c, d, e) VAR_TO_STR(a) + "," + VARS_TO_STR4(b, c, d, e)
  206. #define VARS_TO_STR6(a, b, c, d, e, f) VAR_TO_STR(a) + "," + VARS_TO_STR5(b, c, d, e, f)
  207. #define VARS_TO_STR7(a, b, c, d, e, f, g) VAR_TO_STR(a) + "," + VARS_TO_STR6(b, c, d, e, f, g)
  208. #define VARS_TO_STR8(a, b, c, d, e, f, g, h) VAR_TO_STR(a) + "," + VARS_TO_STR7(b, c, d, e, f, g, h)
  209. #define VARS_TO_STR9(a, b, c, d, e, f, g, h, i) VAR_TO_STR(a) + "," + VARS_TO_STR8(b, c, d, e, f, g, h, i)
  210. #define VARS_TO_STR10(a, b, c, d, e, f, g, h, i, j) VAR_TO_STR(a) + "," + VARS_TO_STR9(b, c, d, e, f, g, h, i, j)
  211. #define VARS_TO_STR11(a, b, c, d, e, f, g, h, i, j, k) VAR_TO_STR(a) + "," + VARS_TO_STR10(b, c, d, e, f, g, h, i, j, k)
  212. // accept FLT_MAX as infinity
  213. static bool isinf_or_max(float f) {
  214. return std::isinf(f) || f == FLT_MAX || f == -FLT_MAX;
  215. }
  216. static bool ggml_is_view_op(enum ggml_op op) {
  217. return op == GGML_OP_VIEW || op == GGML_OP_RESHAPE || op == GGML_OP_PERMUTE || op == GGML_OP_TRANSPOSE;
  218. }
  219. enum test_mode {
  220. MODE_TEST,
  221. MODE_PERF,
  222. };
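// Base class for all op tests: subclasses override build_graph() to construct the
// op under test, and optionally vars(), max_nmse_err(), op_size() and
// initialize_tensors() to customize reporting, error tolerance, perf accounting
// and input data.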
  223. struct test_case {
  224. virtual ~test_case() {}
  225. virtual std::string op_desc(ggml_tensor * t) {
  226. return ggml_op_desc(t);
  227. }
  228. virtual std::string vars() {
  229. return "";
  230. }
  231. virtual ggml_tensor * build_graph(ggml_context * ctx) = 0;
  232. virtual double max_nmse_err() {
  233. return 1e-7;
  234. }
  235. virtual void initialize_tensors(ggml_context * ctx) {
  236. for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) {
  237. init_tensor_uniform(t);
  238. }
  239. }
  240. virtual size_t op_size(ggml_tensor * t) {
  241. size_t size = ggml_nbytes(t);
  242. // add source tensors
  243. for (int i = 0; i < GGML_MAX_SRC; i++) {
  244. if (t->src[i] != NULL) {
  245. size += ggml_nbytes(t->src[i]);
  246. }
  247. }
  248. return size;
  249. }
  250. ggml_cgraph * gf = nullptr;
  251. static const int sentinel_size = 1024;
  252. test_mode mode;
  253. std::vector<ggml_tensor *> sentinels;
  254. void add_sentinel(ggml_context * ctx) {
  255. if (mode == MODE_PERF) {
  256. return;
  257. }
  258. ggml_tensor * sentinel = ::ggml_new_tensor_1d(ctx, GGML_TYPE_F32, sentinel_size);
  259. ggml_format_name(sentinel, "sent_%zu", sentinels.size());
  260. sentinels.push_back(sentinel);
  261. }
  262. // hijack ggml_new_tensor to add sentinels after each tensor to check for overflows in the backend
  263. ggml_tensor * ggml_new_tensor(ggml_context * ctx, ggml_type type, int n_dims, const int64_t * ne) {
  264. ggml_tensor * t = ::ggml_new_tensor(ctx, type, n_dims, ne);
  265. add_sentinel(ctx);
  266. return t;
  267. }
  268. ggml_tensor * ggml_new_tensor_1d(ggml_context * ctx, ggml_type type, int64_t ne0) {
  269. ggml_tensor * t = ::ggml_new_tensor_1d(ctx, type, ne0);
  270. add_sentinel(ctx);
  271. return t;
  272. }
  273. ggml_tensor * ggml_new_tensor_2d(ggml_context * ctx, ggml_type type, int64_t ne0, int64_t ne1) {
  274. ggml_tensor * t = ::ggml_new_tensor_2d(ctx, type, ne0, ne1);
  275. add_sentinel(ctx);
  276. return t;
  277. }
  278. ggml_tensor * ggml_new_tensor_3d(ggml_context * ctx, ggml_type type, int64_t ne0, int64_t ne1, int64_t ne2) {
  279. ggml_tensor * t = ::ggml_new_tensor_3d(ctx, type, ne0, ne1, ne2);
  280. add_sentinel(ctx);
  281. return t;
  282. }
  283. ggml_tensor * ggml_new_tensor_4d(ggml_context * ctx, ggml_type type, int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3) {
  284. ggml_tensor * t = ::ggml_new_tensor_4d(ctx, type, ne0, ne1, ne2, ne3);
  285. add_sentinel(ctx);
  286. return t;
  287. }
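// Correctness mode: build the graph, allocate it on backend1, randomize the
// inputs, then let ggml_backend_compare_graph_backend() run it on both backends
// and call the callback below on every node to compare the results.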
  288. bool eval(ggml_backend_t backend1, ggml_backend_t backend2, const char * op_name) {
  289. mode = MODE_TEST;
  290. ggml_init_params params = {
  291. /* .mem_size = */ ggml_tensor_overhead()*128 + ggml_graph_overhead(),
  292. /* .mem_base = */ NULL,
  293. /* .no_alloc = */ true,
  294. };
  295. ggml_context * ctx = ggml_init(params);
  296. gf = ggml_new_graph(ctx);
  297. // pre-graph sentinel
  298. add_sentinel(ctx);
  299. ggml_tensor * out = build_graph(ctx);
  300. if (op_name != nullptr && op_desc(out) != op_name) {
  301. //printf(" %s: skipping\n", op_desc(out).c_str());
  302. ggml_free(ctx);
  303. return true;
  304. }
  305. printf(" %s(%s): ", op_desc(out).c_str(), vars().c_str());
  306. fflush(stdout);
  307. // check if backends support op
  308. bool supported = true;
  309. for (ggml_backend_t backend : {backend1, backend2}) {
  310. if (!ggml_backend_supports_op(backend, out)) {
  311. printf("not supported [%s] ", ggml_backend_name(backend));
  312. supported = false;
  313. }
  314. }
  315. if (!supported) {
  316. printf("\n");
  317. ggml_free(ctx);
  318. return true;
  319. }
  320. // post-graph sentinel
  321. add_sentinel(ctx);
  322. // allocate
  323. ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(ctx, backend1);
  324. if (buf == NULL) {
  325. printf("failed to allocate tensors [%s] ", ggml_backend_name(backend1));
  326. ggml_free(ctx);
  327. return false;
  328. }
  329. // build graph
  330. ggml_build_forward_expand(gf, out);
  331. // add sentinels as graph nodes so that they are checked in the callback
  332. for (ggml_tensor * sentinel : sentinels) {
  333. gf->nodes[gf->n_nodes++] = sentinel;
  334. }
  335. // randomize tensors
  336. initialize_tensors(ctx);
  337. // compare
  338. struct callback_userdata {
  339. bool ok;
  340. double max_err;
  341. ggml_backend_t backend1;
  342. ggml_backend_t backend2;
  343. };
  344. callback_userdata ud {
  345. true,
  346. max_nmse_err(),
  347. backend1,
  348. backend2
  349. };
  350. auto callback = [](int index, ggml_tensor * t1, ggml_tensor * t2, void * user_data) -> bool {
  351. callback_userdata * ud = (callback_userdata *) user_data;
  352. const char * bn1 = ggml_backend_name(ud->backend1);
  353. const char * bn2 = ggml_backend_name(ud->backend2);
  354. if (t1->op == GGML_OP_NONE) {
  355. // sentinels must be unchanged
  356. std::vector<uint8_t> t1_data(ggml_nbytes(t1));
  357. std::vector<uint8_t> t2_data(ggml_nbytes(t2));
  358. ggml_backend_tensor_get(t1, t1_data.data(), 0, ggml_nbytes(t1));
  359. ggml_backend_tensor_get(t2, t2_data.data(), 0, ggml_nbytes(t2));
  360. if (memcmp(t1_data.data(), t2_data.data(), ggml_nbytes(t1)) != 0) {
  361. printf("sentinel mismatch: %s ", t1->name);
  362. ud->ok = false;
  363. return true;
  364. }
  365. }
  366. std::vector<float> f1 = tensor_to_float(t1);
  367. std::vector<float> f2 = tensor_to_float(t2);
  368. for (size_t i = 0; i < f1.size(); i++) {
  369. // check for nans
  370. if (std::isnan(f1[i]) || std::isnan(f2[i])) {
  371. printf("[%s] NaN at index %zu (%s=%f %s=%f) ", ggml_op_desc(t1), i, bn1, f1[i], bn2, f2[i]);
  372. ud->ok = false;
  373. return true;
  374. }
  375. // check for infs: both must be inf of the same sign, or both must be finite
  376. if (isinf_or_max(f1[i]) || isinf_or_max(f2[i])) {
  377. if (isinf_or_max(f1[i]) && isinf_or_max(f2[i])) {
  378. if (std::signbit(f1[i]) != std::signbit(f2[i])) {
  379. printf("[%s] inf sign mismatch: %s=%f %s=%f ", ggml_op_desc(t1), bn1, f1[i], bn2, f2[i]);
  380. ud->ok = false;
  381. return true;
  382. }
  383. } else {
  384. printf("[%s] inf mismatch: %s=%f %s=%f ", ggml_op_desc(t1), bn1, f1[i], bn2, f2[i]);
  385. ud->ok = false;
  386. return true;
  387. }
  388. }
  389. }
  390. double err = nmse(f1.data(), f2.data(), f1.size());
  391. if (err > ud->max_err) {
  392. printf("[%s] NMSE = %.9f > %.9f ", ggml_op_desc(t1), err, ud->max_err);
  393. //for (int i = 0; i < (int) f1.size(); i++) {
  394. // printf("%5d %9.6f %9.6f, diff = %9.6f\n", i, f1[i], f2[i], f1[i] - f2[i]);
  395. //}
  396. //printf("\n");
  397. //exit(1);
  398. ud->ok = false;
  399. }
  400. return true;
  401. GGML_UNUSED(index);
  402. };
  403. const bool cmp_ok = ggml_backend_compare_graph_backend(backend1, backend2, gf, callback, &ud);
  404. if (!cmp_ok) {
  405. printf("compare failed ");
  406. }
  407. ggml_backend_buffer_free(buf);
  408. ggml_free(ctx);
  409. if (ud.ok && cmp_ok) {
  410. printf("\033[1;32mOK\033[0m\n");
  411. return true;
  412. }
  413. printf("\033[1;31mFAIL\033[0m\n");
  414. return false;
  415. }
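// Performance mode: build the graph on one backend, duplicate the output node
// until roughly 8 GB (CPU) or 32 GB (GPU) of data would be processed, then time
// a single graph computation and report the achieved memory bandwidth.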
  416. bool eval_perf(ggml_backend_t backend, const char * op_name) {
  417. mode = MODE_PERF;
  418. static const size_t graph_nodes = 8192;
  419. ggml_init_params params = {
  420. /* .mem_size = */ ggml_tensor_overhead()*128 + ggml_graph_overhead_custom(graph_nodes, false),
  421. /* .mem_base = */ NULL,
  422. /* .no_alloc = */ true,
  423. };
  424. ggml_context * ctx = ggml_init(params);
  425. ggml_tensor * out = build_graph(ctx);
  426. if (op_name != nullptr && op_desc(out) != op_name) {
  427. //printf(" %s: skipping\n", op_desc(out).c_str());
  428. ggml_free(ctx);
  429. return true;
  430. }
  431. int len = printf(" %s(%s): ", op_desc(out).c_str(), vars().c_str());
  432. fflush(stdout);
  433. // check if backends support op
  434. if (!ggml_backend_supports_op(backend, out)) {
  435. printf("not supported\n");
  436. ggml_free(ctx);
  437. return true;
  438. }
  439. // align while also leaving some margin for variations in parameters
  440. int align = 20;
  441. int last = (len + align - 1) / align * align;
  442. if (last - len < 5) {
  443. last += align;
  444. }
  445. last = std::max(last, 60);
  446. printf("%*s", last - len, "");
  447. // allocate
  448. ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(ctx, backend);
  449. if (buf == NULL) {
  450. printf("failed to allocate tensors\n");
  451. ggml_free(ctx);
  452. return false;
  453. }
  454. // randomize tensors
  455. initialize_tensors(ctx);
  456. // build graph
  457. ggml_cgraph * gf = ggml_new_graph_custom(ctx, graph_nodes, false);
  458. ggml_build_forward_expand(gf, out);
  459. // warmup run
  460. ggml_backend_graph_compute(backend, gf);
  461. // duplicate the op
  462. size_t target_size = ggml_backend_is_cpu(backend) ? 1ULL << 33 : 1ULL << 35; // 8 GB CPU, 32 GB GPU
  463. int n_runs = std::min((size_t)gf->size - gf->n_nodes, target_size / op_size(out)) + 1;
  464. for (int i = 1; i < n_runs; i++) {
  465. gf->nodes[gf->n_nodes++] = out;
  466. }
  467. // calculate memory
  468. size_t mem = n_runs * op_size(out);
  469. auto tensor_op_size = [](ggml_tensor * t) {
  470. size_t size = ggml_nbytes(t);
  471. // add source tensors
  472. for (int i = 0; i < GGML_MAX_SRC; i++) {
  473. if (t->src[i] != NULL) {
  474. size += ggml_nbytes(t->src[i]);
  475. }
  476. }
  477. return size;
  478. };
  479. for (int i = 0; i < gf->n_nodes; i++) {
  480. if (ggml_is_view_op(gf->nodes[i]->op) || gf->nodes[i] == out) {
  481. continue;
  482. }
  483. mem += tensor_op_size(gf->nodes[i]);
  484. }
  485. // run
  486. ggml_backend_synchronize(backend);
  487. int64_t start_time = ggml_time_us();
  488. ggml_backend_graph_compute(backend, gf);
  489. ggml_backend_synchronize(backend);
  490. int64_t end_time = ggml_time_us();
  491. double time_us = end_time - start_time;
  492. printf(" %5d runs - %8.2f us/run - %8zu kB/run - \033[1;34m%7.2f GB/s\033[0m\n",
  493. n_runs,
  494. time_us / n_runs,
  495. op_size(out) / 1024,
  496. mem / (time_us/1e6) / 1024.0 / 1024.0 / 1024.0);
  497. ggml_backend_buffer_free(buf);
  498. ggml_free(ctx);
  499. return true;
  500. }
  501. };
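// Adding coverage for a new op follows the same pattern as the cases below;
// a minimal sketch (using ggml_sqr as a stand-in for the op under test):
//
//   struct test_my_op : public test_case {
//       ggml_tensor * build_graph(ggml_context * ctx) override {
//           ggml_tensor * a = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 10, 10, 10, 10);
//           return ggml_sqr(ctx, a); // replace with the op under test
//       }
//   };
//
// An instance is then registered in test_backend() with
//   test_cases.emplace_back(new test_my_op());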
  502. // GGML_OP_UNARY
  503. struct test_unary : public test_case {
  504. const ggml_unary_op op;
  505. const ggml_type type;
  506. const std::array<int64_t, 4> ne;
  507. std::string vars() override {
  508. return VARS_TO_STR2(type, ne);
  509. }
  510. test_unary(ggml_unary_op op,
  511. ggml_type type = GGML_TYPE_F32,
  512. std::array<int64_t, 4> ne = {128, 10, 10, 10})
  513. : op(op), type(type), ne(ne) {}
  514. ggml_tensor * build_graph(ggml_context * ctx) override {
  515. ggml_tensor * in = ggml_new_tensor(ctx, type, 4, ne.data());
  516. ggml_tensor * out = ggml_unary(ctx, in, op);
  517. return out;
  518. }
  519. };
  520. // GGML_OP_GET_ROWS
  521. struct test_get_rows : public test_case {
  522. const ggml_type type;
  523. const int n; // cols
  524. const int m; // rows
  525. const int r; // rows to get
  526. const int b; // batch size
  527. const bool v; // view (non-contiguous src1)
  528. std::string vars() override {
  529. return VARS_TO_STR6(type, n, m, r, b, v);
  530. }
  531. test_get_rows(ggml_type type = GGML_TYPE_F32, int n = 10, int m = 5, int r = 3, int b = 1, bool v = false)
  532. : type(type), n(n), m(m), r(r), b(b), v(v) {}
  533. ggml_tensor * build_graph(ggml_context * ctx) override {
  534. ggml_tensor * in = ggml_new_tensor_3d(ctx, type, n, m, b);
  535. ggml_tensor * rows = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, r, b);
  536. if (v) {
  537. rows = ggml_view_2d(ctx, rows, r/2, b, rows->nb[1], 0);
  538. }
  539. ggml_tensor * out = ggml_get_rows(ctx, in, rows);
  540. return out;
  541. }
  542. void initialize_tensors(ggml_context * ctx) override {
  543. for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
  544. if (t->type == GGML_TYPE_I32) {
  545. if (ggml_is_view_op(t->op)) { continue; }
  546. // rows
  547. std::vector<int> data(r*b);
  548. for (int i = 0; i < r*b; i++) {
  549. data[i] = rand() % m;
  550. }
  551. ggml_backend_tensor_set(t, data.data(), 0, r * b * sizeof(int));
  552. } else {
  553. init_tensor_uniform(t);
  554. }
  555. }
  556. }
  557. };
  558. // GGML_OP_REPEAT
  559. struct test_repeat : public test_case {
  560. const ggml_type type;
  561. const std::array<int64_t, 4> ne;
  562. const std::array<int, 4> nr;
  563. std::string vars() override {
  564. return VARS_TO_STR3(type, ne, nr);
  565. }
  566. size_t op_size(ggml_tensor * t) override {
  567. return ggml_nbytes(t) * 2;
  568. }
  569. test_repeat(ggml_type type = GGML_TYPE_F32,
  570. std::array<int64_t, 4> ne = {10, 10, 10, 10},
  571. std::array<int, 4> nr = {2, 2, 2, 2})
  572. : type(type), ne(ne), nr(nr) {}
  573. ggml_tensor * build_graph(ggml_context * ctx) override {
  574. ggml_tensor * target = ggml_new_tensor_4d(ctx, type, ne[0]*nr[0], ne[1]*nr[1], ne[2]*nr[2], ne[3]*nr[3]);
  575. ggml_tensor * src = ggml_new_tensor(ctx, type, 4, ne.data());
  576. ggml_tensor * out = ggml_repeat(ctx, src, target);
  577. return out;
  578. }
  579. };
  580. // GGML_OP_DUP
  581. struct test_dup : public test_case {
  582. const ggml_type type;
  583. const std::array<int64_t, 4> ne;
  584. const std::array<int64_t, 4> permute;
  585. bool _use_permute;
  586. std::string vars() override {
  587. std::string v = VARS_TO_STR2(type, ne);
  588. if (_use_permute) v += "," + VAR_TO_STR(permute);
  589. return v;
  590. }
  591. test_dup(ggml_type type = GGML_TYPE_F32,
  592. std::array<int64_t, 4> ne = {10, 10, 10, 1},
  593. std::array<int64_t, 4> permute = {0, 0, 0, 0})
  594. : type(type), ne(ne), permute(permute),
  595. _use_permute(permute[0] + permute[1] + permute[2] + permute[3] > 0) {}
  596. ggml_tensor * build_graph(ggml_context * ctx) override {
  597. ggml_tensor * src = ggml_new_tensor(ctx, type, 4, ne.data());
  598. if (_use_permute) {
  599. src = ggml_permute(ctx, src, permute[0], permute[1], permute[2], permute[3]);
  600. }
  601. ggml_tensor * out = ggml_dup(ctx, src);
  602. return out;
  603. }
  604. };
  605. // GGML_OP_CPY
  606. struct test_cpy : public test_case {
  607. const ggml_type type_src;
  608. const ggml_type type_dst;
  609. const std::array<int64_t, 4> ne;
  610. std::string vars() override {
  611. return VARS_TO_STR3(type_src, type_dst, ne);
  612. }
  613. size_t op_size(ggml_tensor * t) override {
  614. return ggml_nbytes(t) + ggml_nbytes(t->src[0]);
  615. }
  616. test_cpy(ggml_type type_src = GGML_TYPE_F32, ggml_type type_dst = GGML_TYPE_F32,
  617. std::array<int64_t, 4> ne = {10, 10, 10, 1})
  618. : type_src(type_src), type_dst(type_dst), ne(ne) {}
  619. ggml_tensor * build_graph(ggml_context * ctx) override {
  620. ggml_tensor * src = ggml_new_tensor(ctx, type_src, 4, ne.data());
  621. ggml_tensor * dst = ggml_new_tensor(ctx, type_dst, 4, ne.data());
  622. ggml_tensor * out = ggml_cpy(ctx, src, dst);
  623. return out;
  624. }
  625. };
  626. // GGML_OP_CONT
  627. struct test_cont : public test_case {
  628. const ggml_type type;
  629. const std::array<int64_t, 4> ne;
  630. std::string vars() override {
  631. return VARS_TO_STR2(type, ne);
  632. }
  633. test_cont(ggml_type type = GGML_TYPE_F32,
  634. std::array<int64_t, 4> ne = {10, 10, 10, 1})
  635. : type(type), ne(ne) {}
  636. ggml_tensor * build_graph(ggml_context * ctx) override {
  637. ggml_tensor * src = ggml_new_tensor(ctx, type, 4, ne.data());
  638. src = ggml_transpose(ctx, src);
  639. ggml_tensor * out = ggml_cont(ctx, src);
  640. return out;
  641. }
  642. };
  643. // GGML_OP_ADD
  644. // GGML_OP_MUL
  645. // GGML_OP_DIV
  646. struct test_bin_bcast : public test_case {
  647. using op_t = ggml_tensor * (*) (ggml_context *, ggml_tensor *, ggml_tensor *);
  648. op_t op;
  649. const ggml_type type;
  650. const std::array<int64_t, 4> ne;
  651. const std::array<int, 4> nr;
  652. std::string vars() override {
  653. return VARS_TO_STR3(type, ne, nr);
  654. }
  655. size_t op_size(ggml_tensor * t) override {
  656. return ggml_nbytes(t) * 3;
  657. }
  658. test_bin_bcast(op_t op, ggml_type type = GGML_TYPE_F32,
  659. std::array<int64_t, 4> ne = {10, 10, 1, 1},
  660. std::array<int, 4> nr = {1, 2, 1, 1})
  661. : op(op), type(type), ne(ne), nr(nr) {}
  662. ggml_tensor * build_graph(ggml_context * ctx) override {
  663. ggml_tensor * a = ggml_new_tensor_4d(ctx, type, ne[0]*nr[0], ne[1]*nr[1], ne[2]*nr[2], ne[3]*nr[3]);
  664. ggml_tensor * b = ggml_new_tensor(ctx, type, 4, ne.data());
  665. ggml_tensor * out = op(ctx, a, b);
  666. return out;
  667. }
  668. void initialize_tensors(ggml_context * ctx) override {
  669. for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
  670. if (op == ggml_div) {
  671. // avoid division by zero
  672. init_tensor_uniform(t, 1.0f, 2.0f);
  673. } else {
  674. init_tensor_uniform(t);
  675. }
  676. }
  677. }
  678. };
  679. // GGML_OP_SCALE
  680. struct test_scale : public test_case {
  681. const ggml_type type;
  682. const std::array<int64_t, 4> ne;
  683. float scale;
  684. std::string vars() override {
  685. return VARS_TO_STR3(type, ne, scale);
  686. }
  687. test_scale(ggml_type type = GGML_TYPE_F32,
  688. std::array<int64_t, 4> ne = {10, 10, 10, 10},
  689. float scale = 2.0f)
  690. : type(type), ne(ne), scale(scale) {}
  691. ggml_tensor * build_graph(ggml_context * ctx) override {
  692. ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
  693. ggml_tensor * out = ggml_scale(ctx, a, scale);
  694. return out;
  695. }
  696. };
  697. // GGML_OP_NORM
  698. struct test_norm : public test_case {
  699. const ggml_type type;
  700. const std::array<int64_t, 4> ne;
  701. float eps;
  702. std::string vars() override {
  703. return VARS_TO_STR3(type, ne, eps);
  704. }
  705. test_norm(ggml_type type = GGML_TYPE_F32,
  706. std::array<int64_t, 4> ne = {64, 10, 10, 10},
  707. float eps = 1e-6f)
  708. : type(type), ne(ne), eps(eps) {}
  709. ggml_tensor * build_graph(ggml_context * ctx) override {
  710. ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
  711. ggml_tensor * out = ggml_norm(ctx, a, eps);
  712. return out;
  713. }
  714. };
  715. // GGML_OP_RMS_NORM
  716. struct test_rms_norm : public test_case {
  717. const ggml_type type;
  718. const std::array<int64_t, 4> ne;
  719. float eps;
  720. std::string vars() override {
  721. return VARS_TO_STR3(type, ne, eps);
  722. }
  723. test_rms_norm(ggml_type type = GGML_TYPE_F32,
  724. std::array<int64_t, 4> ne = {64, 10, 10, 10},
  725. float eps = 1e-6f)
  726. : type(type), ne(ne), eps(eps) {}
  727. ggml_tensor * build_graph(ggml_context * ctx) override {
  728. ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
  729. ggml_tensor * out = ggml_rms_norm(ctx, a, eps);
  730. return out;
  731. }
  732. };
  733. // GGML_OP_MUL_MAT
  734. struct test_mul_mat : public test_case {
  735. const ggml_type type_a;
  736. const ggml_type type_b;
  737. const int64_t m;
  738. const int64_t n;
  739. const int64_t k;
  740. const std::array<int64_t, 2> bs; // dims 3 and 4
  741. const std::array<int64_t, 2> nr; // repeat in dims 3 and 4
  742. std::string vars() override {
  743. return VARS_TO_STR7(type_a, type_b, m, n, k, bs, nr);
  744. }
  745. double max_nmse_err() override {
  746. return 5e-4;
  747. }
  748. size_t op_size(ggml_tensor * t) override {
  749. size_t a = ggml_nbytes(t->src[0]) * n * nr[0] * nr[1];
  750. size_t b = ggml_nbytes(t->src[1]) * m;
  751. size_t c = ggml_nbytes(t);
  752. return a + b + c;
  753. GGML_UNUSED(t);
  754. }
  755. test_mul_mat(ggml_type type_a = GGML_TYPE_F32, ggml_type type_b = GGML_TYPE_F32,
  756. int64_t m = 32, int64_t n = 32, int64_t k = 32,
  757. std::array<int64_t, 2> bs = {10, 10},
  758. std::array<int64_t, 2> nr = {2, 2})
  759. : type_a(type_a), type_b(type_b), m(m), n(n), k(k), bs(bs), nr(nr) {}
  760. ggml_tensor * build_graph(ggml_context * ctx) override {
  761. // C^T = A * B^T: (k, m) * (k, n) => (m, n)
  762. ggml_tensor * a = ggml_new_tensor_4d(ctx, type_a, k, m, bs[0] , bs[1]);
  763. ggml_tensor * b = ggml_new_tensor_4d(ctx, type_b, k, n, bs[0]*nr[0], bs[1]*nr[1]);
  764. ggml_tensor * out = ggml_mul_mat(ctx, a, b);
  765. return out;
  766. }
  767. };
  768. // GGML_OP_MUL_MAT_ID
  769. struct test_mul_mat_id : public test_case {
  770. const ggml_type type_a;
  771. const ggml_type type_b;
  772. const int n_mats;
  773. const int id;
  774. const int64_t m;
  775. const int64_t n;
  776. const int64_t k;
  777. const bool v; // view (non-contiguous ids)
  778. std::string vars() override {
  779. return VARS_TO_STR8(type_a, type_b, n_mats, id, m, n, k, v);
  780. }
  781. double max_nmse_err() override {
  782. return 5e-4;
  783. }
  784. size_t op_size(ggml_tensor * t) override {
  785. size_t a = ggml_nbytes(t->src[2]) * n;
  786. size_t b = ggml_nbytes(t->src[1]) * m;
  787. size_t c = ggml_nbytes(t);
  788. return a + b + c;
  789. GGML_UNUSED(t);
  790. }
  791. test_mul_mat_id(ggml_type type_a = GGML_TYPE_F32, ggml_type type_b = GGML_TYPE_F32,
  792. int n_mats = 2, int id = 0,
  793. int64_t m = 32, int64_t n = 32, int64_t k = 32, bool v = false)
  794. : type_a(type_a), type_b(type_b), n_mats(n_mats), id(id),
  795. m(m), n(n), k(k), v(v) {}
  796. ggml_tensor * build_graph(ggml_context * ctx) override {
  797. // C^T = A * B^T: (k, m) * (k, n) => (m, n)
  798. std::vector<ggml_tensor *> mats;
  799. for (int i = 0; i < n_mats; i++) {
  800. ggml_tensor * a = ggml_new_tensor_2d(ctx, type_a, k, m);
  801. mats.push_back(a);
  802. }
  803. ggml_tensor * ids = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, n_mats, n);
  804. if (v) {
  805. ids = ggml_view_2d(ctx, ids, n_mats/2, ids->ne[1], ids->nb[1], 0);
  806. }
  807. ggml_tensor * b = ggml_new_tensor_2d(ctx, type_b, k, n);
  808. ggml_tensor * out = ggml_mul_mat_id(ctx, mats.data(), n_mats, ids, v ? id/2 : id, b);
  809. return out;
  810. }
  811. void initialize_tensors(ggml_context * ctx) override {
  812. std::random_device rd;
  813. std::default_random_engine rng(rd());
  814. for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
  815. if (t->type == GGML_TYPE_I32) {
  816. if (ggml_is_view_op(t->op)) { continue; }
  817. // ids
  818. for (int64_t r = 0; r < ggml_nrows(t); r++) {
  819. std::vector<int32_t> data(t->ne[0]);
  820. for (int i = 0; i < t->ne[0]; i++) {
  821. data[i] = i % n_mats;
  822. }
  823. std::shuffle(data.begin(), data.end(), rng);
  824. ggml_backend_tensor_set(t, data.data(), r * t->nb[1], t->ne[0] * sizeof(int32_t));
  825. }
  826. } else {
  827. init_tensor_uniform(t);
  828. }
  829. }
  830. }
  831. };
  832. // GGML_OP_SQR
  833. struct test_sqr : public test_case {
  834. const ggml_type type;
  835. const std::array<int64_t, 4> ne;
  836. std::string vars() override {
  837. return VARS_TO_STR2(type, ne);
  838. }
  839. test_sqr(ggml_type type = GGML_TYPE_F32,
  840. std::array<int64_t, 4> ne = {10, 10, 10, 10})
  841. : type(type), ne(ne) {}
  842. ggml_tensor * build_graph(ggml_context * ctx) override {
  843. ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
  844. ggml_tensor * out = ggml_sqr(ctx, a);
  845. return out;
  846. }
  847. };
  848. // GGML_OP_CLAMP
  849. struct test_clamp : public test_case {
  850. const ggml_type type;
  851. const std::array<int64_t, 4> ne;
  852. float min;
  853. float max;
  854. std::string vars() override {
  855. return VARS_TO_STR4(type, ne, min, max);
  856. }
  857. test_clamp(ggml_type type = GGML_TYPE_F32,
  858. std::array<int64_t, 4> ne = {10, 10, 10, 10},
  859. float min = -0.5f, float max = 0.5f)
  860. : type(type), ne(ne), min(min), max(max) {}
  861. ggml_tensor * build_graph(ggml_context * ctx) override {
  862. ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
  863. ggml_tensor * out = ggml_clamp(ctx, a, min, max);
  864. return out;
  865. }
  866. };
  867. // GGML_OP_DIAG_MASK_INF
  868. struct test_diag_mask_inf : public test_case {
  869. const ggml_type type;
  870. const std::array<int64_t, 4> ne;
  871. const int n_past;
  872. std::string vars() override {
  873. return VARS_TO_STR3(type, ne, n_past);
  874. }
  875. test_diag_mask_inf(ggml_type type = GGML_TYPE_F32,
  876. std::array<int64_t, 4> ne = {10, 10, 10, 10},
  877. int n_past = 5)
  878. : type(type), ne(ne), n_past(n_past) {}
  879. ggml_tensor * build_graph(ggml_context * ctx) override {
  880. ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
  881. ggml_tensor * out = ggml_diag_mask_inf(ctx, a, n_past);
  882. return out;
  883. }
  884. };
  885. // GGML_OP_SOFT_MAX
  886. struct test_soft_max : public test_case {
  887. const ggml_type type;
  888. const std::array<int64_t, 4> ne;
  889. std::string vars() override {
  890. return VARS_TO_STR2(type, ne);
  891. }
  892. test_soft_max(ggml_type type = GGML_TYPE_F32,
  893. std::array<int64_t, 4> ne = {10, 10, 10, 10})
  894. : type(type), ne(ne) {}
  895. ggml_tensor * build_graph(ggml_context * ctx) override {
  896. ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
  897. ggml_tensor * out = ggml_soft_max(ctx, a);
  898. return out;
  899. }
  900. };
  901. // GGML_OP_ROPE
  902. struct test_rope : public test_case {
  903. const ggml_type type;
  904. const std::array<int64_t, 4> ne;
  905. int n_dims;
  906. int mode;
  907. int n_ctx;
  908. std::string vars() override {
  909. return VARS_TO_STR5(type, ne, n_dims, mode, n_ctx);
  910. }
  911. test_rope(ggml_type type = GGML_TYPE_F32,
  912. std::array<int64_t, 4> ne = {10, 10, 10, 1},
  913. int n_dims = 10, int mode = 0, int n_ctx = 512)
  914. : type(type), ne(ne), n_dims(n_dims), mode(mode), n_ctx(n_ctx) {}
  915. ggml_tensor * build_graph(ggml_context * ctx) override {
  916. ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
  917. ggml_tensor * pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, ne[2]);
  918. ggml_tensor * out = ggml_rope(ctx, a, pos, n_dims, mode, n_ctx);
  919. return out;
  920. }
  921. void initialize_tensors(ggml_context * ctx) override {
  922. for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
  923. if (t->type == GGML_TYPE_I32) {
  924. // pos
  925. std::vector<int> data(ne[2]);
  926. for (int i = 0; i < ne[2]; i++) {
  927. data[i] = rand() % n_ctx;
  928. }
  929. ggml_backend_tensor_set(t, data.data(), 0, ne[2] * sizeof(int));
  930. } else {
  931. init_tensor_uniform(t);
  932. }
  933. }
  934. }
  935. };
  936. // GGML_OP_ALIBI
  937. struct test_alibi : public test_case {
  938. const ggml_type type;
  939. const std::array<int64_t, 4> ne;
  940. int n_past;
  941. int n_head;
  942. float bias_max;
  943. std::string vars() override {
  944. return VARS_TO_STR5(type, ne, n_past, n_head, bias_max);
  945. }
  946. test_alibi(ggml_type type = GGML_TYPE_F32,
  947. std::array<int64_t, 4> ne = {10, 10, 10, 10},
  948. int n_past = 512, int n_head = 10, float bias_max = 0.5f)
  949. : type(type), ne(ne), n_past(n_past), n_head(n_head), bias_max(bias_max) {}
  950. ggml_tensor * build_graph(ggml_context * ctx) override {
  951. ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
  952. ggml_tensor * out = ggml_alibi(ctx, a, n_past, n_head, bias_max);
  953. return out;
  954. }
  955. };
  956. // GGML_OP_IM2COL
  957. struct test_im2col : public test_case {
  958. const ggml_type type_input;
  959. const ggml_type type_kernel;
  960. const std::array<int64_t, 4> ne_input;
  961. const std::array<int64_t, 4> ne_kernel;
  962. // stride
  963. const int s0;
  964. const int s1;
  965. // padding
  966. const int p0;
  967. const int p1;
969. // dilation
  969. const int d0;
  970. const int d1;
  971. // mode
  972. const bool is_2D;
  973. std::string vars() override {
  974. return VARS_TO_STR11(type_input, type_kernel, ne_input, ne_kernel, s0, s1, p0, p1, d0, d1, is_2D);
  975. }
  976. test_im2col(ggml_type type_input = GGML_TYPE_F32, ggml_type type_kernel = GGML_TYPE_F16,
  977. std::array<int64_t, 4> ne_input = {10, 10, 3, 1}, // [input_width, input_height, input_channels, 1]
  978. std::array<int64_t, 4> ne_kernel = {3, 3, 3, 1}, // [kernel_width, kernel_height, input_channels, 1]
  979. int s0 = 1, int s1 = 1,
  980. int p0 = 1, int p1 = 1,
  981. int d0 = 1, int d1 = 1,
  982. bool is_2D = true)
  983. : type_input(type_input), type_kernel(type_kernel), ne_input(ne_input), ne_kernel(ne_kernel), s0(s0), s1(s1), p0(p0), p1(p1), d0(d0), d1(d1), is_2D(is_2D) {}
  984. ggml_tensor * build_graph(ggml_context * ctx) override {
  985. ggml_tensor * input = ggml_new_tensor(ctx, type_input, 4, ne_input.data());
  986. ggml_tensor * kernel = ggml_new_tensor(ctx, type_kernel, 4, ne_kernel.data());
  987. ggml_tensor * out = ggml_im2col(ctx, kernel, input, s0, s1, p0, p1, d0, d1, is_2D);
  988. return out;
  989. }
  990. };
  991. // GGML_OP_CONCAT
  992. struct test_concat : public test_case {
  993. const ggml_type type;
  994. const std::array<int64_t, 4> ne;
  995. const int64_t b_ne2;
  996. std::string vars() override {
  997. return VARS_TO_STR3(type, ne, b_ne2);
  998. }
  999. test_concat(ggml_type type = GGML_TYPE_F32,
  1000. std::array<int64_t, 4> ne = {10, 10, 10, 10},
  1001. int64_t b_ne2 = 10)
  1002. : type(type), ne(ne), b_ne2(b_ne2) {}
  1003. ggml_tensor * build_graph(ggml_context * ctx) override {
  1004. ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
  1005. ggml_tensor * b = ggml_new_tensor_4d(ctx, type, ne[0], ne[1], b_ne2, ne[3]);
  1006. ggml_tensor * out = ggml_concat(ctx, a, b);
  1007. return out;
  1008. }
  1009. };
  1010. // GGML_OP_ARGSORT
  1011. struct test_argsort : public test_case {
  1012. const ggml_type type;
  1013. const std::array<int64_t, 4> ne;
  1014. ggml_sort_order order;
  1015. std::string vars() override {
  1016. return VARS_TO_STR3(type, ne, order);
  1017. }
  1018. test_argsort(ggml_type type = GGML_TYPE_F32,
  1019. std::array<int64_t, 4> ne = {16, 10, 10, 10},
  1020. ggml_sort_order order = GGML_SORT_ASC)
  1021. : type(type), ne(ne), order(order) {}
  1022. ggml_tensor * build_graph(ggml_context * ctx) override {
  1023. ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
  1024. ggml_tensor * out = ggml_argsort(ctx, a, order);
  1025. return out;
  1026. }
  1027. void initialize_tensors(ggml_context * ctx) override {
  1028. std::random_device rd;
  1029. std::default_random_engine rng(rd());
  1030. for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
  1031. if (t->type == GGML_TYPE_I32) {
  1032. // indices
  1033. std::vector<int> data(ggml_nelements(t));
  1034. for (int i = 0; i < ggml_nelements(t); i++) {
  1035. data[i] = rand();
  1036. }
  1037. std::shuffle(data.begin(), data.end(), rng);
  1038. ggml_backend_tensor_set(t, data.data(), 0, ne[0]*ne[1]*ne[2]*ne[3] * sizeof(int));
  1039. } else if (t->type == GGML_TYPE_F32) {
  1040. // initialize with unique values to avoid ties
  1041. for (int64_t r = 0; r < ggml_nrows(t); r++) {
  1042. std::vector<float> data(t->ne[0]);
  1043. for (int i = 0; i < t->ne[0]; i++) {
  1044. data[i] = i;
  1045. }
  1046. std::shuffle(data.begin(), data.end(), rng);
  1047. ggml_backend_tensor_set(t, data.data(), r * t->nb[1], t->ne[0] * sizeof(float));
  1048. }
  1049. } else {
  1050. GGML_ASSERT(false);
  1051. }
  1052. }
  1053. }
  1054. };
  1055. // GGML_OP_SUM_ROWS
  1056. struct test_sum_rows : public test_case {
  1057. const ggml_type type;
  1058. const std::array<int64_t, 4> ne;
  1059. std::string vars() override {
  1060. return VARS_TO_STR2(type, ne);
  1061. }
  1062. test_sum_rows(ggml_type type = GGML_TYPE_F32,
  1063. std::array<int64_t, 4> ne = {10, 10, 10, 10})
  1064. : type(type), ne(ne) {}
  1065. ggml_tensor * build_graph(ggml_context * ctx) override {
  1066. ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
  1067. ggml_tensor * out = ggml_sum_rows(ctx, a);
  1068. return out;
  1069. }
  1070. };
  1071. // GGML_OP_UPSCALE
  1072. struct test_upscale : public test_case {
  1073. const ggml_type type;
  1074. const std::array<int64_t, 4> ne;
  1075. const int32_t scale_factor;
  1076. std::string vars() override {
  1077. return VARS_TO_STR3(type, ne, scale_factor);
  1078. }
  1079. test_upscale(ggml_type type = GGML_TYPE_F32,
  1080. std::array<int64_t, 4> ne = {512, 512, 3, 1},
  1081. int32_t scale_factor = 2)
  1082. : type(type), ne(ne), scale_factor(scale_factor) {}
  1083. ggml_tensor * build_graph(ggml_context * ctx) override {
  1084. ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
  1085. ggml_tensor * out = ggml_upscale(ctx, a, scale_factor);
  1086. return out;
  1087. }
  1088. };
  1089. // GGML_OP_GROUP_NORM
  1090. struct test_group_norm : public test_case {
  1091. const ggml_type type;
  1092. const std::array<int64_t, 4> ne;
  1093. const int32_t num_groups;
  1094. std::string vars() override {
  1095. return VARS_TO_STR3(type, ne, num_groups);
  1096. }
  1097. test_group_norm(ggml_type type = GGML_TYPE_F32,
  1098. std::array<int64_t, 4> ne = {64, 64, 320, 1},
  1099. int32_t num_groups = 32)
  1100. : type(type), ne(ne), num_groups(num_groups) {}
  1101. ggml_tensor * build_graph(ggml_context * ctx) override {
  1102. ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
  1103. ggml_tensor * out = ggml_group_norm(ctx, a, num_groups);
  1104. return out;
  1105. }
  1106. };
  1107. // GGML_OP_ACC
  1108. struct test_acc : public test_case {
  1109. const ggml_type type;
  1110. const std::array<int64_t, 4> ne_a;
  1111. const std::array<int64_t, 4> ne_b;
  1112. std::string vars() override {
  1113. return VARS_TO_STR3(type, ne_a, ne_b);
  1114. }
  1115. test_acc(ggml_type type = GGML_TYPE_F32,
  1116. std::array<int64_t, 4> ne_a = {1024, 577, 1, 1},
  1117. std::array<int64_t, 4> ne_b = {1024, 576, 1, 1})
  1118. : type(type), ne_a(ne_a), ne_b(ne_b) {}
  1119. ggml_tensor * build_graph(ggml_context * ctx) override {
  1120. ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne_a.data());
  1121. ggml_tensor * b = ggml_new_tensor(ctx, type, 4, ne_b.data());
  1122. ggml_tensor * out = ggml_acc(ctx, a, b, a->nb[1], a->nb[2], a->nb[3], b->nb[1]);
  1123. return out;
  1124. }
  1125. };
  1126. // GGML_OP_PAD
  1127. struct test_pad : public test_case {
  1128. const ggml_type type;
  1129. const std::array<int64_t, 4> ne_a;
  1130. const int pad_0;
  1131. const int pad_1;
  1132. std::string vars() override {
  1133. return VARS_TO_STR4(type, ne_a, pad_0, pad_1);
  1134. }
  1135. test_pad(ggml_type type = GGML_TYPE_F32,
  1136. std::array<int64_t, 4> ne_a = {512, 512, 1, 1},
  1137. int pad_0 = 1, int pad_1 = 1)
  1138. : type(type), ne_a(ne_a), pad_0(pad_0), pad_1(pad_1) {}
  1139. ggml_tensor * build_graph(ggml_context * ctx) override {
  1140. ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne_a.data());
  1141. ggml_tensor * out = ggml_pad(ctx, a, pad_0, pad_1, 0, 0);
  1142. return out;
  1143. }
  1144. };
  1145. // GGML_OP_LEAKY_RELU
  1146. struct test_leaky_relu : public test_case {
  1147. const ggml_type type;
  1148. const std::array<int64_t, 4> ne_a;
  1149. const float negative_slope;
  1150. std::string vars() override {
  1151. return VARS_TO_STR3(type, ne_a, negative_slope);
  1152. }
  1153. test_leaky_relu(ggml_type type = GGML_TYPE_F32,
  1154. std::array<int64_t, 4> ne_a = {10, 10, 10, 10},
  1155. float negative_slope = 0.1f)
  1156. : type(type), ne_a(ne_a), negative_slope(negative_slope) {}
  1157. ggml_tensor * build_graph(ggml_context * ctx) override {
  1158. ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne_a.data());
  1159. ggml_tensor * out = ggml_leaky_relu(ctx, a, negative_slope, true);
  1160. return out;
  1161. }
  1162. };
  1163. // Mixtral MOE
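// End-to-end test of a Mixtral-style MoE feed-forward block: soft-max the router
// logits, take the top-k experts per token, run each selected expert's up/gate/down
// projections via ggml_mul_mat_id, and sum the expert outputs weighted by the
// normalized router probabilities.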
  1164. struct test_moe : public test_case {
  1165. const int n_experts;
  1166. const int n_experts_per_tok;
  1167. const int n_tokens;
  1168. const int n_embd;
  1169. const int n_ff;
  1170. std::string op_desc(ggml_tensor * t) override {
  1171. return "MOE";
  1172. GGML_UNUSED(t);
  1173. }
  1174. std::string vars() override {
  1175. return VARS_TO_STR5(n_experts, n_experts_per_tok, n_tokens, n_embd, n_ff);
  1176. }
  1177. test_moe(int n_experts = 8, int n_experts_per_tok = 2, int n_tokens = 1, int n_embd = 4096, int n_ff = 14336)
  1178. : n_experts(n_experts), n_experts_per_tok(n_experts_per_tok), n_tokens(n_tokens), n_embd(n_embd), n_ff(n_ff) {
  1179. }
  1180. ggml_tensor * build_graph(ggml_context * ctx) override {
  1181. ggml_tensor * ffn_gate_inp = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_experts);
  1182. std::vector<ggml_tensor *> ffn_up_exp(n_experts);
  1183. std::vector<ggml_tensor *> ffn_gate_exp(n_experts);
  1184. std::vector<ggml_tensor *> ffn_down_exp(n_experts);
  1185. for (int i = 0; i < n_experts; ++i) {
  1186. ffn_up_exp[i] = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ff);
  1187. ffn_gate_exp[i] = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ff);
  1188. ffn_down_exp[i] = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_ff, n_embd);
  1189. }
  1190. ggml_tensor * cur = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_tokens);
  1191. ggml_tensor * logits = ggml_mul_mat(ctx, ffn_gate_inp, cur);
  1192. ggml_tensor * probs = ggml_soft_max_ext(ctx, logits, nullptr, 1.0f/sqrtf(n_embd));
  1193. // select experts
  1194. ggml_tensor * selected_experts = ggml_top_k(ctx, probs, n_experts_per_tok);
  1195. ggml_tensor * weights = ggml_get_rows(ctx,
  1196. ggml_reshape_3d(ctx, probs, 1, n_experts, n_tokens), selected_experts);
  1197. weights = ggml_reshape_2d(ctx, weights, n_experts_per_tok, n_tokens);
  1198. ggml_tensor * weights_sum = ggml_sum_rows(ctx, weights);
  1199. weights = ggml_div(ctx, weights, weights_sum);
  1200. // compute expert outputs
  1201. ggml_tensor * moe_out = nullptr;
  1202. for (int i = 0; i < n_experts_per_tok; ++i) {
  1203. ggml_tensor * cur_expert;
  1204. ggml_tensor * cur_up = ggml_mul_mat_id(ctx, ffn_up_exp.data(), n_experts, selected_experts, i, cur);
  1205. ggml_tensor * cur_gate = ggml_mul_mat_id(ctx, ffn_gate_exp.data(), n_experts, selected_experts, i, cur);
  1206. cur_gate = ggml_silu(ctx, cur_gate);
  1207. cur_expert = ggml_mul(ctx, cur_up, cur_gate);
  1208. cur_expert = ggml_mul_mat_id(ctx, ffn_down_exp.data(), n_experts, selected_experts, i, cur_expert);
  1209. cur_expert = ggml_mul(ctx, cur_expert,
  1210. ggml_view_2d(ctx, weights, 1, n_tokens, weights->nb[1], i*weights->nb[0]));
  1211. if (i == 0) {
  1212. moe_out = cur_expert;
  1213. } else {
  1214. moe_out = ggml_add(ctx, moe_out, cur_expert);
  1215. }
  1216. }
  1217. cur = moe_out;
  1218. return cur;
  1219. }
  1220. };
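// Enumerate the full list of test cases (optionally filtered by op_name) and run
// them on the given backend in the requested mode.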
  1221. static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op_name) {
  1222. std::vector<std::unique_ptr<test_case>> test_cases;
  1223. std::default_random_engine rng(0);
  1224. const ggml_type all_types[] = {
  1225. GGML_TYPE_F32, GGML_TYPE_F16,
  1226. GGML_TYPE_Q4_0, GGML_TYPE_Q4_1,
  1227. GGML_TYPE_Q5_0, GGML_TYPE_Q5_1,
  1228. GGML_TYPE_Q8_0,
  1229. GGML_TYPE_Q2_K, GGML_TYPE_Q3_K,
  1230. GGML_TYPE_Q4_K, GGML_TYPE_Q5_K,
  1231. GGML_TYPE_Q6_K,
  1232. GGML_TYPE_IQ2_XXS, GGML_TYPE_IQ2_XS,
  1233. };
  1234. // unary ops
  1235. for (int op = 0; op < GGML_UNARY_OP_COUNT; op++) {
  1236. test_cases.emplace_back(new test_unary((ggml_unary_op) op));
  1237. }
  1238. test_cases.emplace_back(new test_get_rows(GGML_TYPE_F32, 1, 8, 2, 1, false));
  1239. for (ggml_type type : all_types) {
  1240. for (int b : {1, 7}) {
  1241. for (bool v : {false, true}) {
  1242. test_cases.emplace_back(new test_get_rows(type, 256, 5, 4, b, v));
  1243. }
  1244. }
  1245. }
  1246. for (int b : {1, 7}) {
  1247. for (bool v : {false, true}) {
  1248. test_cases.emplace_back(new test_get_rows(GGML_TYPE_I32, 256, 5, 4, b, v));
  1249. }
  1250. }
  1251. test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 10, 10, 10}, {1, 1, 1, 1}));
  1252. test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 10, 10, 10}, {2, 1, 1, 1}));
  1253. test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 10, 10, 10}, {1, 2, 1, 1}));
  1254. test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 10, 10, 10}, {1, 1, 2, 1}));
  1255. test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 10, 10, 10}, {1, 1, 1, 2}));
  1256. test_cases.emplace_back(new test_repeat(GGML_TYPE_I32, {10, 10, 10, 10}, {2, 1, 1, 1}));
  1257. test_cases.emplace_back(new test_repeat(GGML_TYPE_I16, {10, 10, 10, 10}, {1, 1, 1, 2}));
  1258. test_cases.emplace_back(new test_dup(GGML_TYPE_F32));
  1259. test_cases.emplace_back(new test_dup(GGML_TYPE_F16));
  1260. test_cases.emplace_back(new test_dup(GGML_TYPE_I32));
  1261. test_cases.emplace_back(new test_dup(GGML_TYPE_I16));
  1262. test_cases.emplace_back(new test_dup(GGML_TYPE_I16, {10, 8, 3, 1}, {0, 2, 1, 3}));
  1263. test_cases.emplace_back(new test_dup(GGML_TYPE_I16, {10, 8, 3, 1}, {1, 2, 0, 3}));
  1264. for (ggml_type type : all_types) {
  1265. test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, type, {256, 10, 10, 1}));
  1266. }
  1267. test_cases.emplace_back(new test_cont());
    auto add_test_bin_bcast = [&](ggml_type type, std::array<int64_t, 4> ne, std::array<int, 4> nr) {
        for (auto op : {ggml_add, ggml_mul, ggml_div}) {
            test_cases.emplace_back(new test_bin_bcast(op, type, ne, nr));
        }
    };

    add_test_bin_bcast(GGML_TYPE_F32, {1, 1, 8, 1}, {1, 1, 1, 1});
    add_test_bin_bcast(GGML_TYPE_F32, {1, 1, 1, 1}, {32, 1, 1, 1});
    add_test_bin_bcast(GGML_TYPE_F32, {1, 1, 320, 320}, {1, 1, 1, 1});
    add_test_bin_bcast(GGML_TYPE_F32, {16, 10, 1, 1}, {1, 1, 1, 1});
    add_test_bin_bcast(GGML_TYPE_F32, {16, 10, 10, 1}, {1, 1, 1, 1});
    add_test_bin_bcast(GGML_TYPE_F32, {16, 10, 10, 10}, {1, 1, 1, 1});
    add_test_bin_bcast(GGML_TYPE_F32, {16, 10, 10, 10}, {2, 1, 1, 1});
    add_test_bin_bcast(GGML_TYPE_F32, {16, 10, 10, 10}, {1, 2, 1, 1});
    add_test_bin_bcast(GGML_TYPE_F32, {16, 10, 10, 10}, {1, 1, 2, 1});
    add_test_bin_bcast(GGML_TYPE_F32, {16, 10, 10, 10}, {1, 1, 1, 2});
    add_test_bin_bcast(GGML_TYPE_F32, {16, 10, 10, 10}, {1, 1, 2, 2});
    add_test_bin_bcast(GGML_TYPE_F32, {16, 10, 10, 10}, {1, 2, 2, 2});
    add_test_bin_bcast(GGML_TYPE_F32, {16, 10, 10, 10}, {2, 2, 2, 2});

    // stable diffusion
    add_test_bin_bcast(GGML_TYPE_F32, {1280, 1, 1, 1}, {1, 1, 1, 1});
    add_test_bin_bcast(GGML_TYPE_F32, {1280, 1, 1, 1}, {1, 16, 16, 1});
    add_test_bin_bcast(GGML_TYPE_F32, {1280, 16, 16, 1}, {1, 1, 1, 1});
    add_test_bin_bcast(GGML_TYPE_F32, {1280, 1, 1, 1}, {1, 256, 1, 1});
    add_test_bin_bcast(GGML_TYPE_F32, {1, 1, 1280, 1}, {16, 16, 1, 1});
    add_test_bin_bcast(GGML_TYPE_F32, {16, 16, 1280, 1}, {1, 1, 1, 1});
    add_test_bin_bcast(GGML_TYPE_F32, {1, 1, 1920, 1}, {16, 16, 1, 1});
    add_test_bin_bcast(GGML_TYPE_F32, {1, 1, 2560, 1}, {16, 16, 1, 1});
    add_test_bin_bcast(GGML_TYPE_F32, {1, 1, 1280, 1}, {32, 32, 1, 1});
    add_test_bin_bcast(GGML_TYPE_F32, {1, 1, 1920, 1}, {32, 32, 1, 1});
    add_test_bin_bcast(GGML_TYPE_F32, {1, 1, 640, 1}, {32, 32, 1, 1});
    add_test_bin_bcast(GGML_TYPE_F32, {5120, 1, 1, 1}, {1, 256, 1, 1});
    add_test_bin_bcast(GGML_TYPE_F32, {640, 1, 1, 1}, {1, 1, 1, 1});
    //add_test_bin_bcast(GGML_TYPE_F32, {3, 3, 2560, 1280}, {1, 1, 1, 1});
    //add_test_bin_bcast(GGML_TYPE_F32, {3, 3, 2560, 1280}, {2, 1, 1, 1});

    test_cases.emplace_back(new test_scale());

    for (float eps : {1e-6f, 1e-5f, 1e-3f, 1e-1f}) {
        test_cases.emplace_back(new test_norm(GGML_TYPE_F32, {64, 10, 10, 10}, eps));
        test_cases.emplace_back(new test_rms_norm(GGML_TYPE_F32, {64, 10, 10, 10}, eps));
    }
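
    // mul_mat: every type in all_types as type_a against F32/F16 type_b,
    // with various batch dimensions and broadcast repeats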
    for (ggml_type type_a : all_types) {
        for (ggml_type type_b : {GGML_TYPE_F32, GGML_TYPE_F16}) {
            test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, { 1, 1}, {1, 1}));
            test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, {10, 1}, {1, 1}));
            test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, {10, 1}, {2, 1}));
            test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, {10, 10}, {1, 1}));
            test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, {10, 10}, {2, 1}));
            test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, {10, 10}, {1, 2}));
            test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, {10, 10}, {2, 2}));

            test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, { 1, 1}, {1, 1}));
            test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, {10, 1}, {1, 1}));
            test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, {10, 1}, {2, 1}));
            test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, {10, 10}, {1, 1}));
            test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, {10, 10}, {2, 1}));
            test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, {10, 10}, {1, 2}));
            test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, {10, 10}, {2, 2}));
        }
    }

    for (ggml_type type_a : all_types) {
        for (ggml_type type_b : {GGML_TYPE_F32 /*, GGML_TYPE_F16 */}) {
            for (int n_mats : {2, 4, 8}) {
                for (int id = 0; id < n_mats; id++) {
                    for (bool v : {false, true}) {
                        test_cases.emplace_back(new test_mul_mat_id(type_a, type_b, n_mats, id, 16, 16, 256, v));
                    }
                }
            }
        }
    }

    test_cases.emplace_back(new test_sqr());
    test_cases.emplace_back(new test_clamp());

    test_cases.emplace_back(new test_diag_mask_inf(GGML_TYPE_F32, {10, 10, 1, 1}, 5));
    test_cases.emplace_back(new test_diag_mask_inf(GGML_TYPE_F32, {10, 10, 10, 1}, 5));
    test_cases.emplace_back(new test_diag_mask_inf(GGML_TYPE_F32, {10, 10, 10, 10}, 5));
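
    // soft_max over randomly sized inputs: for each power-of-two bucket up to 2^16,
    // ne0 is drawn from [exponent, 2*exponent] and ne1 from [1, 50]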
    std::uniform_int_distribution<> dist_ne1(1, 50);
    int exponent = 1;
    while (exponent < (1 << 17)) {
        std::uniform_int_distribution<> dist_ne0(exponent, 2*exponent);

        for (int n = 0; n < 10; ++n) {
            int64_t ne0 = dist_ne0(rng);
            int64_t ne1 = dist_ne1(rng);
            test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {ne0, ne1, 1, 1}));
        }

        exponent <<= 1;
    }
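
    // rope shapes mirror real models: mode 0 is the standard llama-style rotation, mode 2 the neox variant,
    // as noted in the per-case comments below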
    for (ggml_type type : {GGML_TYPE_F32, GGML_TYPE_F16}) {
        test_cases.emplace_back(new test_rope(type, {128, 32, 10, 1}, 128, 0, 512)); // llama 7B
        test_cases.emplace_back(new test_rope(type, {128, 40, 10, 1}, 128, 0, 512)); // llama 13B
        test_cases.emplace_back(new test_rope(type, {128, 52, 10, 1}, 128, 0, 512)); // llama 30B
        test_cases.emplace_back(new test_rope(type, {128, 64, 10, 1}, 128, 0, 512)); // llama 65B
        test_cases.emplace_back(new test_rope(type, { 64, 1, 10, 1}, 64, 2, 512)); // neox (falcon 7B)
        test_cases.emplace_back(new test_rope(type, { 64, 71, 10, 1}, 64, 2, 512)); // neox (falcon 7B)
        test_cases.emplace_back(new test_rope(type, { 64, 8, 10, 1}, 64, 2, 512)); // neox (falcon 40B)
        test_cases.emplace_back(new test_rope(type, { 64, 128, 10, 1}, 64, 2, 512)); // neox (falcon 40B)
        test_cases.emplace_back(new test_rope(type, { 80, 32, 10, 1}, 20, 2, 512)); // neox (stablelm)
        test_cases.emplace_back(new test_rope(type, { 80, 32, 10, 1}, 32, 2, 512)); // neox (phi-2)
    }

    test_cases.emplace_back(new test_alibi());
    test_cases.emplace_back(new test_im2col());
    test_cases.emplace_back(new test_concat(GGML_TYPE_F32));
    test_cases.emplace_back(new test_concat(GGML_TYPE_I32));

    for (ggml_sort_order order : {GGML_SORT_ASC, GGML_SORT_DESC}) {
        test_cases.emplace_back(new test_argsort(GGML_TYPE_F32, {8, 1, 1, 1}, order));
        test_cases.emplace_back(new test_argsort(GGML_TYPE_F32, {16, 10, 10, 10}, order));
    }

    test_cases.emplace_back(new test_sum_rows());
    test_cases.emplace_back(new test_upscale());
    test_cases.emplace_back(new test_group_norm());
    test_cases.emplace_back(new test_acc());
    test_cases.emplace_back(new test_pad());
    test_cases.emplace_back(new test_leaky_relu());

#if !defined(__SANITIZE_THREAD__)
    // FIXME: these tests use too much memory with thread sanitizer
    test_cases.emplace_back(new test_moe(8, 2, 1, 4096, 8*1024));
    //test_cases.emplace_back(new test_moe(8, 2, 8, 4096, 14336));
#endif
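
    // in test mode each case is evaluated on the target backend and on a freshly initialized CPU backend
    // and the outputs are compared; in perf mode the cases are only timed on the target backend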
    // run tests
    if (mode == MODE_TEST) {
        ggml_backend_t backend_cpu = ggml_backend_cpu_init();

        size_t n_ok = 0;
        for (auto & test : test_cases) {
            if (test->eval(backend, backend_cpu, op_name)) {
                n_ok++;
            }
        }
        printf(" %zu/%zu tests passed\n", n_ok, test_cases.size());

        ggml_backend_free(backend_cpu);

        return n_ok == test_cases.size();
    }

    if (mode == MODE_PERF) {
        for (auto & test : test_cases) {
            test->eval_perf(backend, op_name);
        }
        return true;
    }

    GGML_ASSERT(false);
    return false;
}

static void usage(char ** argv) {
    printf("Usage: %s [mode] [-o op] [-b backend]\n", argv[0]);
    printf(" valid modes are: test (compare with CPU backend for correctness) or perf (performance evaluation)\n");
    printf(" op names are as given by ggml_op_desc()\n");
}
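
// example invocations (the op and backend names below are illustrative; actual names come from
// ggml_op_desc() and ggml_backend_reg_get_name()):
//   ./test-backend-ops test              - check every op on every registered backend against the CPU backend
//   ./test-backend-ops perf -o MUL_MAT   - benchmark a single op
//   ./test-backend-ops test -b CUDA0     - restrict the run to one backend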
int main(int argc, char ** argv) {
    test_mode mode = MODE_TEST;
    const char * op_name = NULL;
    const char * backend = NULL;

    for (int i = 1; i < argc; i++) {
        if (strcmp(argv[i], "test") == 0) {
            mode = MODE_TEST;
        } else if (strcmp(argv[i], "perf") == 0) {
            mode = MODE_PERF;
        } else if (strcmp(argv[i], "-o") == 0) {
            if (i + 1 < argc) {
                op_name = argv[++i];
            } else {
                usage(argv);
                return 1;
            }
        } else if (strcmp(argv[i], "-b") == 0) {
            if (i + 1 < argc) {
                backend = argv[++i];
            } else {
                usage(argv);
                return 1;
            }
        } else {
            usage(argv);
            return 1;
        }
    }
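
    // backends filtered out with -b are skipped but still counted as passing,
    // so the final summary covers every registered backend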
    // enumerate backends
    printf("Testing %zu backends\n\n", ggml_backend_reg_get_count());

    size_t n_ok = 0;

    for (size_t i = 0; i < ggml_backend_reg_get_count(); i++) {
        printf("Backend %zu/%zu (%s)\n", i + 1, ggml_backend_reg_get_count(), ggml_backend_reg_get_name(i));

        if (backend != NULL && strcmp(backend, ggml_backend_reg_get_name(i)) != 0) {
            printf(" Skipping\n");
            n_ok++;
            continue;
        }

        ggml_backend_t backend = ggml_backend_reg_init_backend(i, NULL);
        GGML_ASSERT(backend != NULL);
        printf(" Backend name: %s\n", ggml_backend_name(backend));

        bool ok = test_backend(backend, mode, op_name);

        printf(" Backend %s: ", ggml_backend_name(backend));
        if (ok) {
            printf("\033[1;32mOK\033[0m\n");
            n_ok++;
        } else {
            printf("\033[1;31mFAIL\033[0m\n");
        }

        printf("\n");

        ggml_backend_free(backend);
    }

    printf("%zu/%zu backends passed\n", n_ok, ggml_backend_reg_get_count());
    if (n_ok != ggml_backend_reg_get_count()) {
        printf("\033[1;31mFAIL\033[0m\n");
        return 1;
    }

    ggml_quantize_free();

    printf("\033[1;32mOK\033[0m\n");
    return 0;
}