gen-unicode-data.py 6.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194
  1. import array
  2. import unicodedata
  3. import requests
  MAX_CODEPOINTS = 0x110000

  UNICODE_DATA_URL = "https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt"


  # see https://www.unicode.org/L2/L1999/UnicodeData.html
  def unicode_data_iter(data=None):
      """Yield one record per codepoint described by UnicodeData.txt.

      Ranges that the file compresses into a "<..., First>" / "<..., Last>"
      pair of lines are expanded, so every codepoint of the range is yielded.

      Args:
          data: Optional text of a UnicodeData.txt file. When omitted, the file
              is downloaded from UNICODE_DATA_URL.

      Yields:
          Tuples ``(cpt, cpt_lower, cpt_upper, categ, bidir)`` where
          ``cpt_lower`` / ``cpt_upper`` are the simple case mappings (0 when
          the field is empty), ``categ`` is the two-letter general category
          and ``bidir`` is the bidirectional class.

      Raises:
          ValueError: If a record is malformed (wrong field count, codepoint
              out of range, bad category or bidi class, unmatched range end).
          Exception: Whatever ``requests`` raises when the download fails.
      """
      if data is None:
          # A timeout keeps the script from hanging forever on a stalled connection.
          res = requests.get(UNICODE_DATA_URL, timeout=60)
          res.raise_for_status()
          data = res.content.decode()

      def parse_codepoint(text, what, record):
          # Explicit raise instead of assert: asserts vanish under `python -O`.
          value = int(text, base=16)
          if not 0 <= value < MAX_CODEPOINTS:
              raise ValueError("%s out of range in UnicodeData record: %r" % (what, record))
          return value

      range_first = None  # pending "<..., First>" record of a compressed range

      for record in data.splitlines():
          if not record.strip():
              continue

          # e.g.: 0000;<control>;Cc;0;BN;;;;;N;NULL;;;;
          fields = record.split(";")
          if len(fields) != 15:
              raise ValueError("expected 15 fields in UnicodeData record: %r" % record)

          cpt = parse_codepoint(fields[0], "codepoint", record)
          # Simple case mappings; an empty field means "no mapping" and becomes 0.
          cpt_lower = parse_codepoint(fields[-2] or "0", "lowercase mapping", record)
          cpt_upper = parse_codepoint(fields[-3] or "0", "uppercase mapping", record)

          categ = fields[2].strip()
          if len(categ) != 2:
              raise ValueError("bad general category in UnicodeData record: %r" % record)

          # Bidi classes are 1 to 3 letters long ("L", "BN", "NSM", ...).
          bidir = fields[4].strip()
          if not 1 <= len(bidir) <= 3:
              raise ValueError("bad bidi class in UnicodeData record: %r" % record)

          name = fields[1]
          if name.endswith(", First>"):
              range_first = (cpt, cpt_lower, cpt_upper, categ, bidir)
              continue
          if name.endswith(", Last>"):
              # The range start must exist, carry no case mapping and agree
              # with the range end on category and bidi class.
              if range_first is None or range_first[1:] != (0, 0, categ, bidir):
                  raise ValueError("unmatched range end in UnicodeData record: %r" % record)
              for c in range(range_first[0], cpt):
                  yield (c, cpt_lower, cpt_upper, categ, bidir)
              range_first = None

          yield (cpt, cpt_lower, cpt_upper, categ, bidir)
  # Bit flags, one per major Unicode category; see definition in unicode.h
  CODEPOINT_FLAG_UNDEFINED   = 1 << 0  # 0x0001
  CODEPOINT_FLAG_NUMBER      = 1 << 1  # 0x0002  \p{N}
  CODEPOINT_FLAG_LETTER      = 1 << 2  # 0x0004  \p{L}
  CODEPOINT_FLAG_SEPARATOR   = 1 << 3  # 0x0008  \p{Z}
  CODEPOINT_FLAG_MARK        = 1 << 4  # 0x0010  \p{M}
  CODEPOINT_FLAG_PUNCTUATION = 1 << 5  # 0x0020  \p{P}
  CODEPOINT_FLAG_SYMBOL      = 1 << 6  # 0x0040  \p{S}
  CODEPOINT_FLAG_CONTROL     = 1 << 7  # 0x0080  \p{C}

  # Two-letter general category -> flag, built from one group per flag.
  UNICODE_CATEGORY_TO_FLAG = {
      category: flag
      for flag, categories in (
          # Cn: Undefined
          (CODEPOINT_FLAG_UNDEFINED, ("Cn",)),
          # Cc: Control, Cf: Format, Co: Private Use, Cs: Surrogate
          (CODEPOINT_FLAG_CONTROL, ("Cc", "Cf", "Co", "Cs")),
          # Ll: Lowercase, Lm: Modifier, Lo: Other, Lt: Titlecase,
          # Lu: Uppercase, L&: Cased Letter
          (CODEPOINT_FLAG_LETTER, ("Ll", "Lm", "Lo", "Lt", "Lu", "L&")),
          # Mc: Spacing Mark, Me: Enclosing Mark, Mn: Nonspacing Mark
          (CODEPOINT_FLAG_MARK, ("Mc", "Me", "Mn")),
          # Nd: Decimal Number, Nl: Letter Number, No: Other Number
          (CODEPOINT_FLAG_NUMBER, ("Nd", "Nl", "No")),
          # Pc: Connector, Pd: Dash, Pe: Close, Pf: Final,
          # Pi: Initial, Po: Other, Ps: Open Punctuation
          (CODEPOINT_FLAG_PUNCTUATION, ("Pc", "Pd", "Pe", "Pf", "Pi", "Po", "Ps")),
          # Sc: Currency, Sk: Modifier, Sm: Math, So: Other Symbol
          (CODEPOINT_FLAG_SYMBOL, ("Sc", "Sk", "Sm", "So")),
          # Zl: Line, Zp: Paragraph, Zs: Space Separator
          (CODEPOINT_FLAG_SEPARATOR, ("Zl", "Zp", "Zs")),
      )
      for category in categories
  }
  # One 16-bit flag word per codepoint; anything UnicodeData.txt does not
  # list keeps the "undefined" flag.
  codepoint_flags = array.array('H', [CODEPOINT_FLAG_UNDEFINED]) * MAX_CODEPOINTS

  table_whitespace = []
  table_lowercase = []
  table_uppercase = []
  table_nfd = []

  for record in unicode_data_iter():
      cpt, lower, upper, category, _bidir = record

      # category flags for this codepoint
      codepoint_flags[cpt] = UNICODE_CATEGORY_TO_FLAG[category]

      # simple case conversions (0 means "no mapping")
      if lower:
          table_lowercase.append((cpt, lower))
      if upper:
          table_uppercase.append((cpt, upper))

      # NFD normalization: keep only the first codepoint of the decomposition
      decomposed = unicodedata.normalize('NFD', chr(cpt))
      base = ord(decomposed[0])
      if base != cpt:
          table_nfd.append((cpt, base))

  # whitespaces, see "<White_Space>" https://www.unicode.org/Public/UCD/latest/ucd/PropList.txt
  table_whitespace += range(0x0009, 0x000D + 1)
  table_whitespace += range(0x2000, 0x200A + 1)
  table_whitespace += [0x0020, 0x0085, 0x00A0, 0x1680, 0x2028, 0x2029, 0x202F, 0x205F, 0x3000]

  # sort every table by codepoint
  for table in (table_whitespace, table_lowercase, table_uppercase, table_nfd):
      table.sort()
  # Collapse runs of consecutive codepoints that share the same flags.
  # Each entry is (start, flags); a range ends where the next one starts.
  ranges_flags = [(0, codepoint_flags[0])]
  current_flags = codepoint_flags[0]
  for position, value in enumerate(codepoint_flags):
      if value == current_flags:
          continue
      ranges_flags.append((position, value))
      current_flags = value
  # Sentinel entry that closes the final range.
  ranges_flags.append((MAX_CODEPOINTS, 0x0000))

  # Collapse runs of consecutive codepoints that share the same NFD value.
  # Each entry is (start, last, nfd); the list is seeded with (0, 0, 0).
  ranges_nfd = [(0, 0, 0)]
  for position, target in table_nfd:
      first, last, previous_target = ranges_nfd[-1]
      if last == position - 1 and previous_target == target:
          # adjacent codepoint with the same value: grow the current range
          ranges_nfd[-1] = (first, position, target)
      else:
          ranges_nfd.append((position, position, target))
  # Generate 'unicode-data.cpp':
  #   python ./scripts/gen-unicode-data.py > unicode-data.cpp
  def out(line=""):
      """Write ``line`` followed by a newline to standard output."""
      print(line)  # noqa
  def _out_table(declaration, row_format, rows):
      """Emit one C++ initializer list: declaration, one line per row, closing brace."""
      out(declaration)
      for row in rows:
          # `row` is an int or a tuple; both work as the right operand of `%`.
          out(row_format % row)
      out("};\n")


  # File header of the generated C++ source (out() appends the trailing blank line).
  out(
      "// generated with scripts/gen-unicode-data.py\n"
      "#include \"unicode-data.h\"\n"
      "#include <cstdint>\n"
      "#include <vector>\n"
      "#include <unordered_map>\n"
      "#include <unordered_set>\n"
  )

  _out_table(
      "const std::vector<std::pair<uint32_t, uint16_t>> unicode_ranges_flags = { // start, flags // last=next_start-1",
      "{0x%06X, 0x%04X},",
      ranges_flags,
  )

  _out_table(
      "const std::unordered_set<uint32_t> unicode_set_whitespace = {",
      "0x%06X,",
      table_whitespace,
  )

  _out_table(
      "const std::unordered_map<uint32_t, uint32_t> unicode_map_lowercase = {",
      "{0x%06X, 0x%06X},",
      table_lowercase,
  )

  _out_table(
      "const std::unordered_map<uint32_t, uint32_t> unicode_map_uppercase = {",
      "{0x%06X, 0x%06X},",
      table_uppercase,
  )

  _out_table(
      "const std::vector<range_nfd> unicode_ranges_nfd = { // start, last, nfd",
      "{0x%06X, 0x%06X, 0x%06X},",
      ranges_nfd,
  )