1
0

vdot.cpp 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311
  1. #include <cstdio>
  2. #include <vector>
  3. #include <random>
  4. #include <chrono>
  5. #include <cstdlib>
  6. #include <cmath>
  7. #include <cassert>
  8. #include <cstring>
  9. #include <array>
  10. #include <ggml.h>
  11. #include <ggml-cpu.h>
  12. #if defined(_MSC_VER)
  13. #pragma warning(disable: 4244 4267) // possible loss of data
  14. #endif
  15. constexpr int kVecSize = 1 << 18;
  16. static float drawFromGaussianPdf(std::mt19937& rndm) {
  17. constexpr double kScale = 1./(1. + std::mt19937::max());
  18. constexpr double kTwoPiTimesScale = 6.28318530717958647692*kScale;
  19. static float lastX;
  20. static bool haveX = false;
  21. if (haveX) { haveX = false; return lastX; }
  22. auto r = sqrt(-2*log(1 - kScale*rndm()));
  23. auto phi = kTwoPiTimesScale * rndm();
  24. lastX = r*sin(phi);
  25. haveX = true;
  26. return r*cos(phi);
  27. }
  28. static void fillRandomGaussianFloats(std::vector<float>& values, std::mt19937& rndm, float mean = 0) {
  29. for (auto& v : values) v = mean + drawFromGaussianPdf(rndm);
  30. }
  31. // Copy-pasted from ggml.c
  32. #define QK4_0 32
  33. typedef struct {
  34. float d; // delta
  35. uint8_t qs[QK4_0 / 2]; // nibbles / quants
  36. } block_q4_0;
  37. static_assert(sizeof(block_q4_0) == sizeof(float) + QK4_0 / 2, "wrong q4_0 block size/padding");
  38. #define QK4_1 32
  39. typedef struct {
  40. float d; // delta
  41. float m; // min
  42. uint8_t qs[QK4_1 / 2]; // nibbles / quants
  43. } block_q4_1;
  44. static_assert(sizeof(block_q4_1) == sizeof(float) * 2 + QK4_1 / 2, "wrong q4_1 block size/padding");
  45. // Copy-pasted from ggml.c
  46. #define QK8_0 32
  47. typedef struct {
  48. float d; // delta
  49. int8_t qs[QK8_0]; // quants
  50. } block_q8_0;
  51. static_assert(sizeof(block_q8_0) == sizeof(float) + QK8_0, "wrong q8_0 block size/padding");
  52. // "Scalar" dot product between the quantized vector x and float vector y
  53. inline double dot(int n, const block_q4_0* x, const float* y) {
  54. const static float kValues[16] = {-8.f, -7.f, -6.f, -5.f, -4.f, -3.f, -2.f, -1.f, 0.f, 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f};
  55. constexpr uint32_t kMask1 = 0x0f0f0f0f;
  56. uint32_t u1, u2;
  57. auto q1 = (const uint8_t*)&u1;
  58. auto q2 = (const uint8_t*)&u2;
  59. double sum = 0;
  60. for (int i=0; i<n; ++i) {
  61. float d = x->d;
  62. auto u = (const uint32_t*)x->qs;
  63. float s = 0;
  64. for (int k=0; k<4; ++k) {
  65. u1 = u[k] & kMask1;
  66. u2 = (u[k] >> 4) & kMask1;
  67. s += y[0]*kValues[q1[0]] + y[1]*kValues[q2[0]] +
  68. y[2]*kValues[q1[1]] + y[3]*kValues[q2[1]] +
  69. y[4]*kValues[q1[2]] + y[5]*kValues[q2[2]] +
  70. y[6]*kValues[q1[3]] + y[7]*kValues[q2[3]];
  71. y += 8;
  72. }
  73. sum += s*d;
  74. ++x;
  75. }
  76. return sum;
  77. }
  78. // Alternative version of the above. Faster on my Mac (~45 us vs ~55 us per dot product),
  79. // but about the same on X86_64 (Ryzen 7950X CPU).
  80. inline double dot3(int n, const block_q4_0* x, const float* y) {
  81. const static std::pair<float,float> kValues[256] = {
  82. {-8.f, -8.f}, {-7.f, -8.f}, {-6.f, -8.f}, {-5.f, -8.f}, {-4.f, -8.f}, {-3.f, -8.f}, {-2.f, -8.f}, {-1.f, -8.f},
  83. { 0.f, -8.f}, { 1.f, -8.f}, { 2.f, -8.f}, { 3.f, -8.f}, { 4.f, -8.f}, { 5.f, -8.f}, { 6.f, -8.f}, { 7.f, -8.f},
  84. {-8.f, -7.f}, {-7.f, -7.f}, {-6.f, -7.f}, {-5.f, -7.f}, {-4.f, -7.f}, {-3.f, -7.f}, {-2.f, -7.f}, {-1.f, -7.f},
  85. { 0.f, -7.f}, { 1.f, -7.f}, { 2.f, -7.f}, { 3.f, -7.f}, { 4.f, -7.f}, { 5.f, -7.f}, { 6.f, -7.f}, { 7.f, -7.f},
  86. {-8.f, -6.f}, {-7.f, -6.f}, {-6.f, -6.f}, {-5.f, -6.f}, {-4.f, -6.f}, {-3.f, -6.f}, {-2.f, -6.f}, {-1.f, -6.f},
  87. { 0.f, -6.f}, { 1.f, -6.f}, { 2.f, -6.f}, { 3.f, -6.f}, { 4.f, -6.f}, { 5.f, -6.f}, { 6.f, -6.f}, { 7.f, -6.f},
  88. {-8.f, -5.f}, {-7.f, -5.f}, {-6.f, -5.f}, {-5.f, -5.f}, {-4.f, -5.f}, {-3.f, -5.f}, {-2.f, -5.f}, {-1.f, -5.f},
  89. { 0.f, -5.f}, { 1.f, -5.f}, { 2.f, -5.f}, { 3.f, -5.f}, { 4.f, -5.f}, { 5.f, -5.f}, { 6.f, -5.f}, { 7.f, -5.f},
  90. {-8.f, -4.f}, {-7.f, -4.f}, {-6.f, -4.f}, {-5.f, -4.f}, {-4.f, -4.f}, {-3.f, -4.f}, {-2.f, -4.f}, {-1.f, -4.f},
  91. { 0.f, -4.f}, { 1.f, -4.f}, { 2.f, -4.f}, { 3.f, -4.f}, { 4.f, -4.f}, { 5.f, -4.f}, { 6.f, -4.f}, { 7.f, -4.f},
  92. {-8.f, -3.f}, {-7.f, -3.f}, {-6.f, -3.f}, {-5.f, -3.f}, {-4.f, -3.f}, {-3.f, -3.f}, {-2.f, -3.f}, {-1.f, -3.f},
  93. { 0.f, -3.f}, { 1.f, -3.f}, { 2.f, -3.f}, { 3.f, -3.f}, { 4.f, -3.f}, { 5.f, -3.f}, { 6.f, -3.f}, { 7.f, -3.f},
  94. {-8.f, -2.f}, {-7.f, -2.f}, {-6.f, -2.f}, {-5.f, -2.f}, {-4.f, -2.f}, {-3.f, -2.f}, {-2.f, -2.f}, {-1.f, -2.f},
  95. { 0.f, -2.f}, { 1.f, -2.f}, { 2.f, -2.f}, { 3.f, -2.f}, { 4.f, -2.f}, { 5.f, -2.f}, { 6.f, -2.f}, { 7.f, -2.f},
  96. {-8.f, -1.f}, {-7.f, -1.f}, {-6.f, -1.f}, {-5.f, -1.f}, {-4.f, -1.f}, {-3.f, -1.f}, {-2.f, -1.f}, {-1.f, -1.f},
  97. { 0.f, -1.f}, { 1.f, -1.f}, { 2.f, -1.f}, { 3.f, -1.f}, { 4.f, -1.f}, { 5.f, -1.f}, { 6.f, -1.f}, { 7.f, -1.f},
  98. {-8.f, 0.f}, {-7.f, 0.f}, {-6.f, 0.f}, {-5.f, 0.f}, {-4.f, 0.f}, {-3.f, 0.f}, {-2.f, 0.f}, {-1.f, 0.f},
  99. { 0.f, 0.f}, { 1.f, 0.f}, { 2.f, 0.f}, { 3.f, 0.f}, { 4.f, 0.f}, { 5.f, 0.f}, { 6.f, 0.f}, { 7.f, 0.f},
  100. {-8.f, 1.f}, {-7.f, 1.f}, {-6.f, 1.f}, {-5.f, 1.f}, {-4.f, 1.f}, {-3.f, 1.f}, {-2.f, 1.f}, {-1.f, 1.f},
  101. { 0.f, 1.f}, { 1.f, 1.f}, { 2.f, 1.f}, { 3.f, 1.f}, { 4.f, 1.f}, { 5.f, 1.f}, { 6.f, 1.f}, { 7.f, 1.f},
  102. {-8.f, 2.f}, {-7.f, 2.f}, {-6.f, 2.f}, {-5.f, 2.f}, {-4.f, 2.f}, {-3.f, 2.f}, {-2.f, 2.f}, {-1.f, 2.f},
  103. { 0.f, 2.f}, { 1.f, 2.f}, { 2.f, 2.f}, { 3.f, 2.f}, { 4.f, 2.f}, { 5.f, 2.f}, { 6.f, 2.f}, { 7.f, 2.f},
  104. {-8.f, 3.f}, {-7.f, 3.f}, {-6.f, 3.f}, {-5.f, 3.f}, {-4.f, 3.f}, {-3.f, 3.f}, {-2.f, 3.f}, {-1.f, 3.f},
  105. { 0.f, 3.f}, { 1.f, 3.f}, { 2.f, 3.f}, { 3.f, 3.f}, { 4.f, 3.f}, { 5.f, 3.f}, { 6.f, 3.f}, { 7.f, 3.f},
  106. {-8.f, 4.f}, {-7.f, 4.f}, {-6.f, 4.f}, {-5.f, 4.f}, {-4.f, 4.f}, {-3.f, 4.f}, {-2.f, 4.f}, {-1.f, 4.f},
  107. { 0.f, 4.f}, { 1.f, 4.f}, { 2.f, 4.f}, { 3.f, 4.f}, { 4.f, 4.f}, { 5.f, 4.f}, { 6.f, 4.f}, { 7.f, 4.f},
  108. {-8.f, 5.f}, {-7.f, 5.f}, {-6.f, 5.f}, {-5.f, 5.f}, {-4.f, 5.f}, {-3.f, 5.f}, {-2.f, 5.f}, {-1.f, 5.f},
  109. { 0.f, 5.f}, { 1.f, 5.f}, { 2.f, 5.f}, { 3.f, 5.f}, { 4.f, 5.f}, { 5.f, 5.f}, { 6.f, 5.f}, { 7.f, 5.f},
  110. {-8.f, 6.f}, {-7.f, 6.f}, {-6.f, 6.f}, {-5.f, 6.f}, {-4.f, 6.f}, {-3.f, 6.f}, {-2.f, 6.f}, {-1.f, 6.f},
  111. { 0.f, 6.f}, { 1.f, 6.f}, { 2.f, 6.f}, { 3.f, 6.f}, { 4.f, 6.f}, { 5.f, 6.f}, { 6.f, 6.f}, { 7.f, 6.f},
  112. {-8.f, 7.f}, {-7.f, 7.f}, {-6.f, 7.f}, {-5.f, 7.f}, {-4.f, 7.f}, {-3.f, 7.f}, {-2.f, 7.f}, {-1.f, 7.f},
  113. { 0.f, 7.f}, { 1.f, 7.f}, { 2.f, 7.f}, { 3.f, 7.f}, { 4.f, 7.f}, { 5.f, 7.f}, { 6.f, 7.f}, { 7.f, 7.f}
  114. };
  115. double sum = 0;
  116. for (int i=0; i<n; ++i) {
  117. float d = x->d;
  118. auto q = x->qs;
  119. float s = 0;
  120. for (int k=0; k<4; ++k) {
  121. s += y[0]*kValues[q[0]].first + y[1]*kValues[q[0]].second +
  122. y[2]*kValues[q[1]].first + y[3]*kValues[q[1]].second +
  123. y[4]*kValues[q[2]].first + y[5]*kValues[q[2]].second +
  124. y[6]*kValues[q[3]].first + y[7]*kValues[q[3]].second;
  125. y += 8; q += 4;
  126. }
  127. sum += s*d;
  128. ++x;
  129. }
  130. return sum;
  131. }
  132. inline double dot41(int n, const block_q4_1* x, const float* y) {
  133. const static float kValues[16] = {0.f, 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f, 9.f, 10.f, 11.f, 12.f, 13.f, 14.f, 15.f};
  134. constexpr uint32_t kMask1 = 0x0f0f0f0f;
  135. uint32_t u1, u2;
  136. auto q1 = (const uint8_t*)&u1;
  137. auto q2 = (const uint8_t*)&u2;
  138. double sum = 0;
  139. for (int i=0; i<n; ++i) {
  140. auto u = (const uint32_t*)x->qs;
  141. float s = 0, s1 = 0;
  142. for (int k=0; k<4; ++k) {
  143. u1 = u[k] & kMask1;
  144. u2 = (u[k] >> 4) & kMask1;
  145. s += y[0]*kValues[q1[0]] + y[1]*kValues[q2[0]] +
  146. y[2]*kValues[q1[1]] + y[3]*kValues[q2[1]] +
  147. y[4]*kValues[q1[2]] + y[5]*kValues[q2[2]] +
  148. y[6]*kValues[q1[3]] + y[7]*kValues[q2[3]];
  149. s1 += y[0] + y[1] + y[2] + y[3] + y[4] + y[5] + y[6] + y[7];
  150. y += 8;
  151. }
  152. sum += s*x->d + s1*x->m;
  153. ++x;
  154. }
  155. return sum;
  156. }
  157. // Copy-pasted from ggml.c
  158. static void quantize_row_q8_0_reference(const float *x, block_q8_0 *y, int k) {
  159. assert(k % QK8_0 == 0);
  160. const int nb = k / QK8_0;
  161. for (int i = 0; i < nb; i++) {
  162. float amax = 0.0f; // absolute max
  163. for (int l = 0; l < QK8_0; l++) {
  164. const float v = x[i*QK8_0 + l];
  165. amax = std::max(amax, fabsf(v));
  166. }
  167. const float d = amax / ((1 << 7) - 1);
  168. const float id = d ? 1.0f/d : 0.0f;
  169. y[i].d = d;
  170. for (int l = 0; l < QK8_0; ++l) {
  171. const float v = x[i*QK8_0 + l]*id;
  172. y[i].qs[l] = roundf(v);
  173. }
  174. }
  175. }
  176. // Copy-pasted from ggml.c
  177. static void dot_q4_q8(const int n, float* s, const void* vx, const void* vy) {
  178. const int nb = n / QK8_0;
  179. const block_q4_0* x = (const block_q4_0*)vx;
  180. const block_q8_0* y = (const block_q8_0*)vy;
  181. float sumf = 0;
  182. for (int i = 0; i < nb; i++) {
  183. const float d0 = x[i].d;
  184. const float d1 = y[i].d;
  185. const uint8_t * p0 = x[i].qs;
  186. const int8_t * p1 = y[i].qs;
  187. int sumi = 0;
  188. for (int j = 0; j < QK8_0/2; j++) {
  189. const uint8_t v0 = p0[j];
  190. const int i0 = (int8_t) (v0 & 0xf) - 8;
  191. const int i1 = (int8_t) (v0 >> 4) - 8;
  192. const int i2 = p1[2*j + 0];
  193. const int i3 = p1[2*j + 1];
  194. sumi += i0*i2 + i1*i3;
  195. }
  196. sumf += d0*d1*sumi;
  197. }
  198. *s = sumf;
  199. }
  200. int main(int argc, char** argv) {
  201. int nloop = argc > 1 ? atoi(argv[1]) : 10;
  202. bool scalar = argc > 2 ? atoi(argv[2]) : false;
  203. bool useQ4_1 = argc > 3 ? atoi(argv[3]) : false;
  204. if (scalar && useQ4_1) {
  205. printf("It is not possible to use Q4_1 quantization and scalar implementations\n");
  206. return 1;
  207. }
  208. std::mt19937 rndm(1234);
  209. std::vector<float> x1(kVecSize), y1(kVecSize);
  210. int n4 = useQ4_1 ? kVecSize / QK4_1 : kVecSize / QK4_0; n4 = 64*((n4 + 63)/64);
  211. int n8 = kVecSize / QK8_0; n8 = 64*((n8 + 63)/64);
  212. const auto * funcs_cpu = ggml_get_type_traits_cpu(useQ4_1 ? GGML_TYPE_Q4_1 : GGML_TYPE_Q4_0);
  213. std::vector<block_q4_0> q40;
  214. std::vector<block_q4_1> q41;
  215. if (useQ4_1) q41.resize(n4);
  216. else q40.resize(n4);
  217. std::vector<block_q8_0> q8(n8);
  218. double sumt = 0, sumt2 = 0, maxt = 0;
  219. double sumqt = 0, sumqt2 = 0, maxqt = 0;
  220. double sum = 0, sumq = 0, exactSum = 0;
  221. for (int iloop=0; iloop<nloop; ++iloop) {
  222. // Fill vector x with random numbers
  223. fillRandomGaussianFloats(x1, rndm);
  224. // Fill vector y with random numbers
  225. fillRandomGaussianFloats(y1, rndm);
  226. // Compute the exact dot product
  227. for (int k=0; k<kVecSize; ++k) exactSum += x1[k]*y1[k];
  228. // quantize x.
  229. // Note, we do not include this in the timing as in practical application
  230. // we already have the quantized model weights.
  231. if (useQ4_1) {
  232. funcs_cpu->from_float(x1.data(), q41.data(), kVecSize);
  233. } else {
  234. funcs_cpu->from_float(x1.data(), q40.data(), kVecSize);
  235. }
  236. // Now measure time the dot product needs using the "scalar" version above
  237. auto t1 = std::chrono::high_resolution_clock::now();
  238. if (useQ4_1) sum += dot41(kVecSize / QK4_1, q41.data(), y1.data());
  239. else sum += dot(kVecSize / QK4_0, q40.data(), y1.data());
  240. auto t2 = std::chrono::high_resolution_clock::now();
  241. auto t = 1e-3*std::chrono::duration_cast<std::chrono::nanoseconds>(t2-t1).count();
  242. sumt += t; sumt2 += t*t; maxt = std::max(maxt, t);
  243. // And now measure the time needed to quantize y and perform the dot product with the quantized y
  244. t1 = std::chrono::high_resolution_clock::now();
  245. float result;
  246. if (scalar) {
  247. quantize_row_q8_0_reference(y1.data(), q8.data(), kVecSize);
  248. dot_q4_q8(kVecSize, &result, q40.data(), q8.data());
  249. }
  250. else {
  251. const auto * vdot = ggml_get_type_traits_cpu(funcs_cpu->vec_dot_type);
  252. vdot->from_float(y1.data(), q8.data(), kVecSize);
  253. if (useQ4_1) funcs_cpu->vec_dot(kVecSize, &result, 0, q41.data(), 0, q8.data(), 0, 1);
  254. else funcs_cpu->vec_dot(kVecSize, &result, 0, q40.data(), 0, q8.data(), 0, 1);
  255. }
  256. sumq += result;
  257. t2 = std::chrono::high_resolution_clock::now();
  258. t = 1e-3*std::chrono::duration_cast<std::chrono::nanoseconds>(t2-t1).count();
  259. sumqt += t; sumqt2 += t*t; maxqt = std::max(maxqt, t);
  260. }
  261. // Report the time (and the average of the dot products so the compiler does not come up with the idea
  262. // of optimizing away the function calls after figuring that the result is not used).
  263. sum /= nloop; sumq /= nloop;
  264. exactSum /= nloop;
  265. printf("Exact result: <dot> = %g\n",exactSum);
  266. printf("<dot> = %g, %g\n",sum,sumq);
  267. sumt /= nloop; sumt2 /= nloop; sumt2 -= sumt*sumt;
  268. if (sumt2 > 0) sumt2 = sqrt(sumt2);
  269. printf("time = %g +/- %g us. maxt = %g us\n",sumt,sumt2,maxt);
  270. sumqt /= nloop; sumqt2 /= nloop; sumqt2 -= sumqt*sumqt;
  271. if (sumqt2 > 0) sumqt2 = sqrt(sumqt2);
  272. printf("timeq = %g +/- %g us. maxt = %g us\n",sumqt,sumqt2,maxqt);
  273. return 0;
  274. }