1
0

gen-unicode-data.py 2.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566
  1. import regex
  2. def cpt_to_utf8_str(cpt):
  3. if cpt <= 0xFF:
  4. return bytes([cpt, 0, 0, 0])
  5. elif cpt <= 0xFFFF:
  6. return bytes([cpt & 0xFF, cpt >> 8, 0, 0])
  7. elif cpt <= 0xFFFFFF:
  8. return bytes([cpt & 0xFF, (cpt >> 8) & 0xFF, (cpt >> 16) & 0xFF, 0])
  9. else:
  10. return bytes([cpt & 0xFF, (cpt >> 8) & 0xFF, (cpt >> 16) & 0xFF, cpt >> 24])
  11. def is_match(codepoint, regex_expr):
  12. try:
  13. res = regex.match(regex_expr, cpt_to_utf8_str(codepoint).decode('utf-32'))
  14. return res is not None
  15. except Exception:
  16. return False
  17. def get_matches(regex_expr):
  18. unicode_ranges = []
  19. current_range = None
  20. for codepoint in range(0x110000):
  21. if is_match(codepoint, regex_expr):
  22. if current_range is None:
  23. current_range = [codepoint, codepoint]
  24. else:
  25. current_range[1] = codepoint
  26. elif current_range is not None:
  27. unicode_ranges.append(tuple(current_range))
  28. current_range = None
  29. if current_range is not None:
  30. unicode_ranges.append(tuple(current_range))
  31. return unicode_ranges
  32. def print_cat(cat, ranges):
  33. print("const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_{} = {{".format(cat)) # noqa: NP100
  34. cnt = 0
  35. for start, end in ranges:
  36. if cnt % 4 != 0:
  37. print(" ", end="") # noqa: NP100
  38. print("{{0x{:08X}, 0x{:08X}}},".format(start, end), end="") # noqa: NP100
  39. if cnt % 4 == 3:
  40. print("") # noqa: NP100
  41. cnt += 1
  42. if cnt % 4 != 0:
  43. print("") # noqa: NP100
  44. print("};") # noqa: NP100
  45. print("") # noqa: NP100
  46. print_cat("number", get_matches(r'\p{N}'))
  47. print_cat("letter", get_matches(r'\p{L}'))
  48. print_cat("whitespace", get_matches(r'\p{Z}'))
  49. print_cat("accent_mark", get_matches(r'\p{M}'))
  50. print_cat("punctuation", get_matches(r'\p{P}'))
  51. print_cat("symbol", get_matches(r'\p{S}'))
  52. print_cat("control", get_matches(r'\p{C}'))