gen-unicode-data.py 4.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134
  1. import regex
  2. import ctypes
  3. import unicodedata
  4. class CoodepointFlags (ctypes.Structure):
  5. _fields_ = [ # see definition in unicode.h
  6. ("is_undefined", ctypes.c_uint16, 1),
  7. ("is_number", ctypes.c_uint16, 1), # regex: \p{N}
  8. ("is_letter", ctypes.c_uint16, 1), # regex: \p{L}
  9. ("is_separator", ctypes.c_uint16, 1), # regex: \p{Z}
  10. ("is_accent_mark", ctypes.c_uint16, 1), # regex: \p{M}
  11. ("is_punctuation", ctypes.c_uint16, 1), # regex: \p{P}
  12. ("is_symbol", ctypes.c_uint16, 1), # regex: \p{S}
  13. ("is_control", ctypes.c_uint16, 1), # regex: \p{C}
  14. ]
  15. assert (ctypes.sizeof(CoodepointFlags) == 2)
  16. MAX_CODEPOINTS = 0x110000
  17. regex_number = regex.compile(r'\p{N}')
  18. regex_letter = regex.compile(r'\p{L}')
  19. regex_separator = regex.compile(r'\p{Z}')
  20. regex_accent_mark = regex.compile(r'\p{M}')
  21. regex_punctuation = regex.compile(r'\p{P}')
  22. regex_symbol = regex.compile(r'\p{S}')
  23. regex_control = regex.compile(r'\p{C}')
  24. regex_whitespace = regex.compile(r'\s')
  25. codepoint_flags = (CoodepointFlags * MAX_CODEPOINTS)()
  26. table_whitespace = []
  27. table_lowercase = []
  28. table_uppercase = []
  29. table_nfd = []
  30. for codepoint in range(MAX_CODEPOINTS):
  31. # convert codepoint to unicode character
  32. char = chr(codepoint)
  33. # regex categories
  34. flags = codepoint_flags[codepoint]
  35. flags.is_number = bool(regex_number.match(char))
  36. flags.is_letter = bool(regex_letter.match(char))
  37. flags.is_separator = bool(regex_separator.match(char))
  38. flags.is_accent_mark = bool(regex_accent_mark.match(char))
  39. flags.is_punctuation = bool(regex_punctuation.match(char))
  40. flags.is_symbol = bool(regex_symbol.match(char))
  41. flags.is_control = bool(regex_control.match(char))
  42. flags.is_undefined = bytes(flags)[0] == 0
  43. assert (not flags.is_undefined)
  44. # whitespaces
  45. if bool(regex_whitespace.match(char)):
  46. table_whitespace.append(codepoint)
  47. # lowercase conversion
  48. lower = ord(char.lower()[0])
  49. if codepoint != lower:
  50. table_lowercase.append((codepoint, lower))
  51. # uppercase conversion
  52. upper = ord(char.upper()[0])
  53. if codepoint != upper:
  54. table_uppercase.append((codepoint, upper))
  55. # NFD normalization
  56. norm = ord(unicodedata.normalize('NFD', char)[0])
  57. if codepoint != norm:
  58. table_nfd.append((codepoint, norm))
  59. # group ranges with same flags
  60. ranges_flags = [(0, codepoint_flags[0])] # start, flags
  61. for codepoint, flags in enumerate(codepoint_flags):
  62. if bytes(flags) != bytes(ranges_flags[-1][1]):
  63. ranges_flags.append((codepoint, flags))
  64. ranges_flags.append((MAX_CODEPOINTS, CoodepointFlags()))
  65. # group ranges with same nfd
  66. ranges_nfd = [(0, 0, 0)] # start, last, nfd
  67. for codepoint, norm in table_nfd:
  68. start = ranges_nfd[-1][0]
  69. if ranges_nfd[-1] != (start, codepoint - 1, norm):
  70. ranges_nfd.append(None)
  71. start = codepoint
  72. ranges_nfd[-1] = (start, codepoint, norm)
  73. # Generate 'unicode-data.cpp'
  74. def out(line=""):
  75. print(line, end='\n') # noqa
  76. out("""\
  77. // generated with scripts/gen-unicode-data.py
  78. #include "unicode-data.h"
  79. #include <cstdint>
  80. #include <vector>
  81. #include <unordered_map>
  82. #include <unordered_set>
  83. """)
  84. out("const std::vector<std::pair<uint32_t, uint16_t>> unicode_ranges_flags = { // start, flags // last=next_start-1")
  85. for codepoint, flags in ranges_flags:
  86. flags = int.from_bytes(bytes(flags), "little")
  87. out("{0x%06X, 0x%04X}," % (codepoint, flags))
  88. out("};\n")
  89. out("const std::unordered_set<uint32_t> unicode_set_whitespace = {")
  90. out(", ".join("0x%06X" % cpt for cpt in table_whitespace))
  91. out("};\n")
  92. out("const std::unordered_map<uint32_t, uint32_t> unicode_map_lowercase = {")
  93. for tuple in table_lowercase:
  94. out("{0x%06X, 0x%06X}," % tuple)
  95. out("};\n")
  96. out("const std::unordered_map<uint32_t, uint32_t> unicode_map_uppercase = {")
  97. for tuple in table_uppercase:
  98. out("{0x%06X, 0x%06X}," % tuple)
  99. out("};\n")
  100. out("const std::vector<range_nfd> unicode_ranges_nfd = { // start, last, nfd")
  101. for triple in ranges_nfd:
  102. out("{0x%06X, 0x%06X, 0x%06X}," % triple)
  103. out("};\n")