test-quantize-perf.cpp 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310
  1. // Benchmark quantization specific functions on synthetic data
#include "ggml.h"
#undef NDEBUG
#include <algorithm>
#include <assert.h>
#include <functional>
#include <inttypes.h>
#include <math.h>
#include <memory>
#include <stdexcept>
#include <stdio.h>
#include <string>
#include <vector>
  13. #define MAX_ALIGNMENT 64
  14. #define QK 32
  15. #define WARMUP 5
  16. #define ITERATIONS 10
  17. #define L1_SIZE 32*128
  18. #define L2_SIZE 32*2048
  19. #define L3_SIZE 32*20480
  20. #define MEM_SIZE 32*2048000
  21. struct quantize_perf_params {
  22. std::vector<std::string> include_types;
  23. std::vector<size_t> test_sizes;
  24. size_t alignment_offset = 0;
  25. bool op_quantize_row_q_reference = false;
  26. bool op_quantize_row_q = false;
  27. bool op_dequantize_row_q = false;
  28. bool op_quantize_row_q_dot = false;
  29. bool op_vec_dot_q = false;
  30. };
  31. #if defined(__x86_64__) || defined(__i386__)
  32. #include <x86intrin.h>
  33. inline int64_t cpu_cycles() {
  34. // Rough way to detect new-ish CPUs
  35. #ifdef __POPCNT__
  36. unsigned int dummy;
  37. return __rdtscp(&dummy);
  38. #else
  39. return __rdtsc();
  40. #endif
  41. }
  42. #else
  43. #define cpu_cycles() 0
  44. #endif
  45. // Generate synthetic data
  46. void generate_data(float offset, size_t n, float * dst) {
  47. for (size_t i = 0; i < n; i++) {
  48. dst[i] = 0.1 + 2*cosf(i + offset);
  49. }
  50. }
  51. float gigabytes_per_second(size_t bytes, int64_t usecs) {
  52. return bytes / (float) usecs * 1000000 / (1024*1024*1024);
  53. }
  54. void * align_with_offset(void * ptr, int offset) {
  55. size_t dummy_size = MAX_ALIGNMENT * 4;
  56. return (char *) std::align(MAX_ALIGNMENT, MAX_ALIGNMENT, ptr, dummy_size) + offset;
  57. }
  58. void benchmark_function(size_t size, size_t q_size, std::function<size_t(void)> function) {
  59. int64_t min_time_us = INT64_MAX;
  60. int64_t total_time_us = 0;
  61. int64_t min_time_cycles = INT64_MAX;
  62. int64_t total_time_cycles = 0;
  63. for (int i = 0; i < WARMUP; i++) {
  64. function();
  65. }
  66. for (int i = 0; i < ITERATIONS; i++) {
  67. const int64_t start_time = ggml_time_us();
  68. const int64_t start_cycles = cpu_cycles();
  69. function();
  70. const int64_t end_cycles = cpu_cycles();
  71. const int64_t end_time = ggml_time_us();
  72. total_time_cycles += end_cycles - start_cycles;
  73. min_time_cycles = std::min(min_time_cycles, end_cycles - start_cycles);
  74. total_time_us += end_time - start_time;
  75. min_time_us = std::min(min_time_us, end_time - start_time);
  76. }
  77. printf(" min cycles/%d vals : %9.2f\n", QK, QK * min_time_cycles / (float) size);
  78. printf(" avg cycles/%d vals : %9.2f\n", QK, QK * total_time_cycles / (float) (size * ITERATIONS));
  79. printf(" float32 throughput : %9.2f GB/s\n", gigabytes_per_second(4 * size * ITERATIONS, total_time_us));
  80. printf(" quantized throughput : %9.2f GB/s\n", gigabytes_per_second(q_size * ITERATIONS, total_time_us));
  81. }
  82. int main(int argc, char * argv[]) {
  83. quantize_perf_params params {};
  84. // read command line
  85. bool invalid_param = false;
  86. std::string arg;
  87. for (int i = 1; i < argc; i++) {
  88. arg = argv[i];
  89. if (arg == "--size") {
  90. if (++i >= argc) {
  91. invalid_param = true;
  92. break;
  93. }
  94. size_t size = std::stoi(argv[i]);
  95. if (size % 32 != 0) {
  96. fprintf(stderr, "error: size %zu not divisible by 32\n", size);
  97. invalid_param = true;
  98. break;
  99. }
  100. params.test_sizes.push_back(size);
  101. } else if (arg == "-3") {
  102. // quick select sizes that probably fit in CPU caches
  103. params.test_sizes.push_back(L1_SIZE);
  104. params.test_sizes.push_back(L2_SIZE);
  105. params.test_sizes.push_back(L3_SIZE);
  106. } else if (arg == "-4") {
  107. // quick select cache sizes + memory
  108. params.test_sizes.push_back(L1_SIZE);
  109. params.test_sizes.push_back(L2_SIZE);
  110. params.test_sizes.push_back(L3_SIZE);
  111. params.test_sizes.push_back(MEM_SIZE);
  112. } else if (arg == "--op") {
  113. if (++i >= argc) {
  114. invalid_param = true;
  115. break;
  116. }
  117. std::string op {argv[i]};
  118. if (op == "quantize_row_q_reference") {
  119. params.op_quantize_row_q_reference = true;
  120. } else if (op == "quantize_row_q") {
  121. params.op_quantize_row_q = true;
  122. } else if (op == "dequantize_row_q") {
  123. params.op_dequantize_row_q = true;
  124. } else if (op == "quantize_row_q_dot") {
  125. params.op_quantize_row_q_dot = true;
  126. } else if (op == "vec_dot_q") {
  127. params.op_vec_dot_q = true;
  128. } else {
  129. invalid_param = true;
  130. break;
  131. }
  132. } else if (arg == "--type") {
  133. if (++i >= argc) {
  134. invalid_param = true;
  135. break;
  136. }
  137. params.include_types.push_back(argv[i]);
  138. } else if (arg == "--alignment-offset") {
  139. if (++i >= argc) {
  140. invalid_param = true;
  141. break;
  142. }
  143. int alignment = std::stoi(argv[i]);
  144. if (alignment < 0 || alignment > MAX_ALIGNMENT) {
  145. fprintf(stderr, "error: aligment-offset must be less than %d\n", MAX_ALIGNMENT);
  146. invalid_param = true;
  147. break;
  148. }
  149. params.alignment_offset = alignment;
  150. } else {
  151. fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
  152. return 1;
  153. }
  154. }
  155. if (invalid_param) {
  156. fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
  157. return 1;
  158. }
  159. if (params.test_sizes.empty()) {
  160. params.test_sizes.push_back(L1_SIZE);
  161. }
  162. if (!(params.op_quantize_row_q_reference || params.op_quantize_row_q || params.op_dequantize_row_q || params.op_quantize_row_q_dot || params.op_vec_dot_q)) {
  163. params.op_quantize_row_q_reference = params.op_quantize_row_q = params.op_dequantize_row_q = params.op_quantize_row_q_dot = params.op_vec_dot_q = true;
  164. }
  165. std::sort(params.test_sizes.begin(), params.test_sizes.end());
  166. size_t largest = params.test_sizes.back();
  167. std::vector<uint8_t> test_data1_v(largest*4 + MAX_ALIGNMENT*2);
  168. std::vector<uint8_t> test_data2_v(largest*4 + MAX_ALIGNMENT*2);
  169. std::vector<uint8_t> test_q1_v(largest*4 + MAX_ALIGNMENT*2);
  170. std::vector<uint8_t> test_q2_v(largest*4 + MAX_ALIGNMENT*2);
  171. std::vector<uint8_t> test_out_v(largest*4 + MAX_ALIGNMENT*2);
  172. float * test_data1 = (float *) align_with_offset(test_data1_v.data(), params.alignment_offset);
  173. float * test_data2 = (float *) align_with_offset(test_data2_v.data(), params.alignment_offset);
  174. float * test_q1 = (float *) align_with_offset(test_q1_v.data(), params.alignment_offset);
  175. float * test_q2 = (float *) align_with_offset(test_q2_v.data(), params.alignment_offset);
  176. float * test_out = (float *) align_with_offset(test_out_v.data(), params.alignment_offset);
  177. generate_data(0, largest, test_data1);
  178. generate_data(1, largest, test_data2);
  179. // Initialize GGML, ensures float conversion tables are initialized
  180. struct ggml_init_params ggml_params = {
  181. /* .mem_size = */ 1*1024,
  182. /* .mem_buffer = */ NULL,
  183. /* .no_alloc = */ true,
  184. };
  185. struct ggml_context * ctx = ggml_init(ggml_params);
  186. for (int i = 0; i < GGML_TYPE_COUNT; i++) {
  187. ggml_type type = (ggml_type) i;
  188. quantize_fns_t qfns = ggml_internal_get_quantize_fn(i);
  189. if (!params.include_types.empty() && std::find(params.include_types.begin(), params.include_types.end(), ggml_type_name(type)) == params.include_types.end()) {
  190. continue;
  191. }
  192. if (qfns.quantize_row_q) {
  193. printf("%s\n", ggml_type_name(type));
  194. if (params.op_quantize_row_q_reference) {
  195. printf(" quantize_row_q_reference\n");
  196. for (size_t size : params.test_sizes) {
  197. printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
  198. auto quantize_fn = [&](void ) {
  199. qfns.quantize_row_q_reference(test_data1, test_q1, size);
  200. return test_q1[0];
  201. };
  202. size_t quantized_size = size / ggml_blck_size(type) * ggml_type_size(type);
  203. benchmark_function(size, quantized_size, quantize_fn);
  204. }
  205. printf("\n");
  206. }
  207. if (params.op_quantize_row_q) {
  208. printf(" quantize_row_q\n");
  209. for (size_t size : params.test_sizes) {
  210. printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
  211. auto quantize_fn = [&](void ) {
  212. qfns.quantize_row_q(test_data1, test_q1, size);
  213. return test_q1[0];
  214. };
  215. size_t quantized_size = size / ggml_blck_size(type) * ggml_type_size(type);
  216. benchmark_function(size, quantized_size, quantize_fn);
  217. }
  218. printf("\n");
  219. }
  220. if (params.op_dequantize_row_q) {
  221. printf(" dequantize_row_q\n");
  222. qfns.quantize_row_q(test_data1, test_q1, largest);
  223. for (size_t size : params.test_sizes) {
  224. printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
  225. auto quantize_fn = [&](void ) {
  226. qfns.dequantize_row_q(test_q1, test_out, size);
  227. return test_out[0];
  228. };
  229. size_t quantized_size = size / ggml_blck_size(type) * ggml_type_size(type);
  230. benchmark_function(size, quantized_size, quantize_fn);
  231. }
  232. printf("\n");
  233. }
  234. if (params.op_quantize_row_q_dot) {
  235. printf(" quantize_row_q_dot\n");
  236. for (size_t size : params.test_sizes) {
  237. printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
  238. auto quantize_fn = [&](void ) {
  239. qfns.quantize_row_q_dot(test_data1, test_q1, size);
  240. return test_q1[0];
  241. };
  242. size_t quantized_size = size / ggml_blck_size(type) * ggml_type_size(type);
  243. benchmark_function(size, quantized_size, quantize_fn);
  244. }
  245. printf("\n");
  246. }
  247. if (params.op_vec_dot_q) {
  248. printf(" vec_dot_q\n");
  249. qfns.quantize_row_q(test_data1, test_q1, largest);
  250. qfns.quantize_row_q(test_data2, test_q2, largest);
  251. for (size_t size : params.test_sizes) {
  252. printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
  253. auto quantize_fn = [&](void ) {
  254. float result;
  255. qfns.vec_dot_q(size, &result, test_q1, test_q2);
  256. return result;
  257. };
  258. size_t quantized_size = size / ggml_blck_size(type) * ggml_type_size(type);
  259. benchmark_function(size, quantized_size, quantize_fn);
  260. }
  261. printf("\n");
  262. }
  263. }
  264. }
  265. ggml_free(ctx);
  266. return 0;
  267. }