// cpu-feats-x86.cpp
  1. #include "ggml-backend-impl.h"
  2. #if defined(__x86_64__) || (defined(_MSC_VER) && defined(_M_AMD64))
  3. #ifdef _MSC_VER
  4. #include <intrin.h>
  5. #endif
  6. #include <cstring>
  7. #include <vector>
  8. #include <bitset>
  9. #include <array>
  10. #include <string>
  11. // ref: https://cdrdv2-public.intel.com/782156/325383-sdm-vol-2abcd.pdf
  // Snapshot of the host CPU's CPUID feature flags, captured once in the
  // constructor.
  //
  // Every accessor reports a single bit from one of the cached registers below.
  // Accessors are const so the snapshot can be queried through a const object.
  // Note that these bits only describe what the CPU advertises; whether the OS
  // has enabled the corresponding register state is not checked here.
  struct cpuid_x86 {
      // ---- CPUID.(EAX=01H):ECX ----
      bool SSE3()        const { return f_1_ecx[0]; }
      bool PCLMULQDQ()   const { return f_1_ecx[1]; }
      bool MONITOR()     const { return f_1_ecx[3]; }
      bool SSSE3()       const { return f_1_ecx[9]; }
      bool FMA()         const { return f_1_ecx[12]; }
      bool CMPXCHG16B()  const { return f_1_ecx[13]; }
      bool SSE41()       const { return f_1_ecx[19]; }
      bool SSE42()       const { return f_1_ecx[20]; }
      bool MOVBE()       const { return f_1_ecx[22]; }
      bool POPCNT()      const { return f_1_ecx[23]; }
      bool AES()         const { return f_1_ecx[25]; }
      bool XSAVE()       const { return f_1_ecx[26]; }
      bool OSXSAVE()     const { return f_1_ecx[27]; }
      bool AVX()         const { return f_1_ecx[28]; }
      bool F16C()        const { return f_1_ecx[29]; }
      bool RDRAND()      const { return f_1_ecx[30]; }

      // ---- CPUID.(EAX=01H):EDX ----
      bool MSR()         const { return f_1_edx[5]; }
      bool CX8()         const { return f_1_edx[8]; }
      bool SEP()         const { return f_1_edx[11]; }
      bool CMOV()        const { return f_1_edx[15]; }
      bool CLFSH()       const { return f_1_edx[19]; }
      bool MMX()         const { return f_1_edx[23]; }
      bool FXSR()        const { return f_1_edx[24]; }
      bool SSE()         const { return f_1_edx[25]; }
      bool SSE2()        const { return f_1_edx[26]; }

      // ---- CPUID.(EAX=07H,ECX=0):EBX ----
      bool FSGSBASE()    const { return f_7_ebx[0]; }
      bool BMI1()        const { return f_7_ebx[3]; }
      bool HLE()         const { return is_intel && f_7_ebx[4]; }   // reported for Intel only
      bool AVX2()        const { return f_7_ebx[5]; }
      bool BMI2()        const { return f_7_ebx[8]; }
      bool ERMS()        const { return f_7_ebx[9]; }
      bool INVPCID()     const { return f_7_ebx[10]; }
      bool RTM()         const { return is_intel && f_7_ebx[11]; }  // reported for Intel only
      bool AVX512F()     const { return f_7_ebx[16]; }
      bool AVX512DQ()    const { return f_7_ebx[17]; }
      bool RDSEED()      const { return f_7_ebx[18]; }
      bool ADX()         const { return f_7_ebx[19]; }
      bool AVX512PF()    const { return f_7_ebx[26]; }
      bool AVX512ER()    const { return f_7_ebx[27]; }
      bool AVX512CD()    const { return f_7_ebx[28]; }
      bool AVX512BW()    const { return f_7_ebx[30]; }
      bool AVX512VL()    const { return f_7_ebx[31]; }
      bool SHA()         const { return f_7_ebx[29]; }

      // ---- CPUID.(EAX=07H,ECX=0):ECX ----
      bool PREFETCHWT1() const { return f_7_ecx[0]; }

      // ---- CPUID.(EAX=80000001H):ECX / EDX ----
      // Bit 5 of ECX is exposed as LZCNT() on Intel and as ABM() on AMD.
      bool LAHF()        const { return f_81_ecx[0]; }
      bool LZCNT()       const { return is_intel && f_81_ecx[5]; }
      bool ABM()         const { return is_amd && f_81_ecx[5]; }
      bool SSE4a()       const { return is_amd && f_81_ecx[6]; }
      bool XOP()         const { return is_amd && f_81_ecx[11]; }
      bool TBM()         const { return is_amd && f_81_ecx[21]; }
      bool SYSCALL()     const { return is_intel && f_81_edx[11]; }
      bool MMXEXT()      const { return is_amd && f_81_edx[22]; }
      bool RDTSCP()      const { return is_intel && f_81_edx[27]; }
      bool _3DNOWEXT()   const { return is_amd && f_81_edx[30]; }
      bool _3DNOW()      const { return is_amd && f_81_edx[31]; }

      // ---- CPUID.(EAX=07H,ECX=0):ECX / EDX and CPUID.(EAX=07H,ECX=1):EAX ----
      bool AVX512_VBMI() const { return f_7_ecx[1]; }
      bool AVX512_VNNI() const { return f_7_ecx[11]; }
      bool AVX512_FP16() const { return f_7_edx[23]; }
      bool AVX512_BF16() const { return f_7_1_eax[5]; }
      bool AVX_VNNI()    const { return f_7_1_eax[4]; }
      bool AMX_TILE()    const { return f_7_edx[24]; }
      bool AMX_INT8()    const { return f_7_edx[25]; }
      bool AMX_FP16()    const { return f_7_1_eax[21]; }
      bool AMX_BF16()    const { return f_7_edx[22]; }

      // Executes CPUID for leaf `eax` with sub-leaf 0 and stores
      // EAX, EBX, ECX, EDX into cpu_info[0..3].
      // Executes CPUID for leaf `eax`, sub-leaf `ecx` (cpuidex variant).
  #ifdef _MSC_VER
      static void cpuid(int cpu_info[4], int eax) {
          __cpuid(cpu_info, eax);
      }
      static void cpuidex(int cpu_info[4], int eax, int ecx) {
          __cpuidex(cpu_info, eax, ecx);
      }
  #else
      static void cpuid(int cpu_info[4], int eax) {
          __asm__ __volatile__(
              "cpuid"
              : "=a"(cpu_info[0]), "=b"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3])
              : "a"(eax), "c"(0));
      }
      static void cpuidex(int cpu_info[4], int eax, int ecx) {
          __asm__ __volatile__(
              "cpuid"
              : "=a"(cpu_info[0]), "=b"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3])
              : "a"(eax), "c"(ecx));
      }
  #endif

      // Queries only the leaves that are actually consumed (0, 1, 7 and
      // 0x80000000..0x80000004). Leaf counts are compared as unsigned values so
      // that an unexpected EAX can neither index an empty table nor make a loop
      // run forever.
      cpuid_x86() {
          std::array<int, 4> regs = {};

          // Leaf 0: EAX = highest basic leaf; EBX, EDX, ECX = vendor string
          // (in that order).
          cpuidex(regs.data(), 0, 0);
          const unsigned int max_leaf = static_cast<unsigned int>(regs[0]);

          // memcpy instead of casting the char buffer to int*, which would be an
          // aliasing/alignment hazard.
          char vendor_buf[0x20] = {};
          std::memcpy(vendor_buf + 0, &regs[1], sizeof(int));
          std::memcpy(vendor_buf + 4, &regs[3], sizeof(int));
          std::memcpy(vendor_buf + 8, &regs[2], sizeof(int));
          vendor = vendor_buf;

          if (vendor == "GenuineIntel") {
              is_intel = true;
          } else if (vendor == "AuthenticAMD") {
              is_amd = true;
          }

          // Leaf 1: basic feature flags.
          if (max_leaf >= 1) {
              cpuidex(regs.data(), 1, 0);
              f_1_ecx = static_cast<unsigned int>(regs[2]);
              f_1_edx = static_cast<unsigned int>(regs[3]);
          }

          // Leaf 7: structured extended feature flags.
          if (max_leaf >= 7) {
              cpuidex(regs.data(), 7, 0);
              // Sub-leaf 0 reports the highest valid sub-leaf index in EAX.
              const unsigned int max_subleaf = static_cast<unsigned int>(regs[0]);
              f_7_ebx = static_cast<unsigned int>(regs[1]);
              f_7_ecx = static_cast<unsigned int>(regs[2]);
              f_7_edx = static_cast<unsigned int>(regs[3]);

              if (max_subleaf >= 1) {
                  cpuidex(regs.data(), 7, 1);
                  f_7_1_eax = static_cast<unsigned int>(regs[0]);
              }
          }

          // Leaf 0x80000000: EAX = highest extended leaf.
          cpuid(regs.data(), static_cast<int>(0x80000000u));
          const unsigned int max_ext_leaf = static_cast<unsigned int>(regs[0]);

          // Leaf 0x80000001: extended feature flags.
          if (max_ext_leaf >= 0x80000001u) {
              cpuidex(regs.data(), static_cast<int>(0x80000001u), 0);
              f_81_ecx = static_cast<unsigned int>(regs[2]);
              f_81_edx = static_cast<unsigned int>(regs[3]);
          }

          // Leaves 0x80000002..0x80000004: 48-byte processor brand string,
          // 16 bytes per leaf. The buffer is zero-filled and larger than 48
          // bytes, so the result is always NUL-terminated.
          if (max_ext_leaf >= 0x80000004u) {
              char brand_buf[0x40] = {};
              for (unsigned int part = 0; part < 3; ++part) {
                  cpuidex(regs.data(), static_cast<int>(0x80000002u + part), 0);
                  std::memcpy(brand_buf + 16 * part, regs.data(), sizeof(regs));
              }
              brand = brand_buf;
          }
      }

      bool is_intel = false;      // vendor string is "GenuineIntel"
      bool is_amd = false;        // vendor string is "AuthenticAMD"
      std::string vendor;         // 12-character vendor id from leaf 0
      std::string brand;          // brand string; empty if the CPU does not report it
      std::bitset<32> f_1_ecx;    // CPUID.(1):ECX
      std::bitset<32> f_1_edx;    // CPUID.(1):EDX
      std::bitset<32> f_7_ebx;    // CPUID.(7,0):EBX
      std::bitset<32> f_7_ecx;    // CPUID.(7,0):ECX
      std::bitset<32> f_7_edx;    // CPUID.(7,0):EDX
      std::bitset<32> f_7_1_eax;  // CPUID.(7,1):EAX
      std::bitset<32> f_81_ecx;   // CPUID.(0x80000001):ECX
      std::bitset<32> f_81_edx;   // CPUID.(0x80000001):EDX
  };
  169. #if 0
  170. void test_x86_is() {
  171. cpuid_x86 is;
  172. printf("CPU Vendor: %s\n", is.vendor.c_str());
  173. printf("Brand: %s\n", is.brand.c_str());
  174. printf("is_intel: %d\n", is.is_intel);
  175. printf("is_amd: %d\n", is.is_amd);
  176. printf("sse3: %d\n", is.SSE3());
  177. printf("pclmulqdq: %d\n", is.PCLMULQDQ());
  178. printf("ssse3: %d\n", is.SSSE3());
  179. printf("fma: %d\n", is.FMA());
  180. printf("cmpxchg16b: %d\n", is.CMPXCHG16B());
  181. printf("sse41: %d\n", is.SSE41());
  182. printf("sse42: %d\n", is.SSE42());
  183. printf("movbe: %d\n", is.MOVBE());
  184. printf("popcnt: %d\n", is.POPCNT());
  185. printf("aes: %d\n", is.AES());
  186. printf("xsave: %d\n", is.XSAVE());
  187. printf("osxsave: %d\n", is.OSXSAVE());
  188. printf("avx: %d\n", is.AVX());
  189. printf("f16c: %d\n", is.F16C());
  190. printf("rdrand: %d\n", is.RDRAND());
  191. printf("msr: %d\n", is.MSR());
  192. printf("cx8: %d\n", is.CX8());
  193. printf("sep: %d\n", is.SEP());
  194. printf("cmov: %d\n", is.CMOV());
  195. printf("clflush: %d\n", is.CLFSH());
  196. printf("mmx: %d\n", is.MMX());
  197. printf("fxsr: %d\n", is.FXSR());
  198. printf("sse: %d\n", is.SSE());
  199. printf("sse2: %d\n", is.SSE2());
  200. printf("fsgsbase: %d\n", is.FSGSBASE());
  201. printf("bmi1: %d\n", is.BMI1());
  202. printf("hle: %d\n", is.HLE());
  203. printf("avx2: %d\n", is.AVX2());
  204. printf("bmi2: %d\n", is.BMI2());
  205. printf("erms: %d\n", is.ERMS());
  206. printf("invpcid: %d\n", is.INVPCID());
  207. printf("rtm: %d\n", is.RTM());
  208. printf("avx512f: %d\n", is.AVX512F());
  209. printf("rdseed: %d\n", is.RDSEED());
  210. printf("adx: %d\n", is.ADX());
  211. printf("avx512pf: %d\n", is.AVX512PF());
  212. printf("avx512er: %d\n", is.AVX512ER());
  213. printf("avx512cd: %d\n", is.AVX512CD());
  214. printf("sha: %d\n", is.SHA());
  215. printf("prefetchwt1: %d\n", is.PREFETCHWT1());
  216. printf("lahf: %d\n", is.LAHF());
  217. printf("lzcnt: %d\n", is.LZCNT());
  218. printf("abm: %d\n", is.ABM());
  219. printf("sse4a: %d\n", is.SSE4a());
  220. printf("xop: %d\n", is.XOP());
  221. printf("tbm: %d\n", is.TBM());
  222. printf("syscall: %d\n", is.SYSCALL());
  223. printf("mmxext: %d\n", is.MMXEXT());
  224. printf("rdtscp: %d\n", is.RDTSCP());
  225. printf("3dnowext: %d\n", is._3DNOWEXT());
  226. printf("3dnow: %d\n", is._3DNOW());
  227. printf("avx512_vbmi: %d\n", is.AVX512_VBMI());
  228. printf("avx512_vnni: %d\n", is.AVX512_VNNI());
  229. printf("avx512_fp16: %d\n", is.AVX512_FP16());
  230. printf("avx512_bf16: %d\n", is.AVX512_BF16());
  231. printf("amx_tile: %d\n", is.AMX_TILE());
  232. printf("amx_int8: %d\n", is.AMX_INT8());
  233. printf("amx_fp16: %d\n", is.AMX_FP16());
  234. printf("amx_bf16: %d\n", is.AMX_BF16());
  235. }
  236. #endif
  237. static int ggml_backend_cpu_x86_score() {
  238. // FIXME: this does not check for OS support
  239. int score = 0;
  240. cpuid_x86 is;
  241. #ifdef GGML_FMA
  242. if (!is.FMA()) { return 0; }
  243. score += 1;
  244. #endif
  245. #ifdef GGML_F16C
  246. if (!is.F16C()) { return 0; }
  247. score += 1<<1;
  248. #endif
  249. #ifdef GGML_SSE42
  250. if (!is.SSE42()) { return 0; }
  251. score += 1<<2;
  252. #endif
  253. #ifdef GGML_AVX
  254. if (!is.AVX()) { return 0; }
  255. score += 1<<4;
  256. #endif
  257. #ifdef GGML_AVX2
  258. if (!is.AVX2()) { return 0; }
  259. score += 1<<5;
  260. #endif
  261. #ifdef GGML_AVX_VNNI
  262. if (!is.AVX_VNNI()) { return 0; }
  263. score += 1<<6;
  264. #endif
  265. #ifdef GGML_AVX512
  266. if (!is.AVX512F()) { return 0; }
  267. if (!is.AVX512CD()) { return 0; }
  268. if (!is.AVX512VL()) { return 0; }
  269. if (!is.AVX512DQ()) { return 0; }
  270. if (!is.AVX512BW()) { return 0; }
  271. score += 1<<7;
  272. #endif
  273. #ifdef GGML_AVX512_VBMI
  274. if (!is.AVX512_VBMI()) { return 0; }
  275. score += 1<<8;
  276. #endif
  277. #ifdef GGML_AVX512_BF16
  278. if (!is.AVX512_BF16()) { return 0; }
  279. score += 1<<9;
  280. #endif
  281. #ifdef GGML_AVX512_VNNI
  282. if (!is.AVX512_VNNI()) { return 0; }
  283. score += 1<<10;
  284. #endif
  285. #ifdef GGML_AMX_INT8
  286. if (!is.AMX_INT8()) { return 0; }
  287. score += 1<<11;
  288. #endif
  289. return score;
  290. }
  291. GGML_BACKEND_DL_SCORE_IMPL(ggml_backend_cpu_x86_score)
  292. #endif // defined(__x86_64__) || (defined(_MSC_VER) && defined(_M_AMD64))