regex-partial.cpp 8.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204
  1. #include "regex-partial.h"
  2. #include "common.h"
  3. #include <functional>
  4. #include <optional>
  5. common_regex::common_regex(const std::string & pattern) :
  6. pattern(pattern),
  7. rx(pattern),
  8. rx_reversed_partial(regex_to_reversed_partial_regex(pattern)) {}
  9. common_regex_match common_regex::search(const std::string & input, size_t pos, bool as_match) const {
  10. std::smatch match;
  11. if (pos > input.size()) {
  12. throw std::runtime_error("Position out of bounds");
  13. }
  14. auto start = input.begin() + pos;
  15. auto found = as_match
  16. ? std::regex_match(start, input.end(), match, rx)
  17. : std::regex_search(start, input.end(), match, rx);
  18. if (found) {
  19. common_regex_match res;
  20. res.type = COMMON_REGEX_MATCH_TYPE_FULL;
  21. for (size_t i = 0; i < match.size(); ++i) {
  22. auto begin = pos + match.position(i);
  23. res.groups.emplace_back(begin, begin + match.length(i));
  24. }
  25. return res;
  26. }
  27. std::match_results<std::string::const_reverse_iterator> srmatch;
  28. if (std::regex_search(input.rbegin(), input.rend() - pos, srmatch, rx_reversed_partial, std::regex_constants::match_continuous)) {
  29. auto group = srmatch[1].str();
  30. if (group.length() != 0) {
  31. auto it = srmatch[1].second.base();
  32. // auto position = static_cast<size_t>(std::distance(input.begin(), it));
  33. if ((!as_match) || it == input.begin()) {
  34. common_regex_match res;
  35. res.type = COMMON_REGEX_MATCH_TYPE_PARTIAL;
  36. const size_t begin = std::distance(input.begin(), it);
  37. const size_t end = input.size();
  38. if (begin == std::string::npos || end == std::string::npos || begin > end) {
  39. throw std::runtime_error("Invalid range");
  40. }
  41. res.groups.push_back({begin, end});
  42. return res;
  43. }
  44. }
  45. }
  46. return {};
  47. }
  48. /*
  49. Transforms a regex pattern to a partial match pattern that operates on a reversed input string to find partial final matches of the original pattern.
  50. Ideally we'd like to use boost::match_partial (https://beta.boost.org/doc/libs/1_59_0/libs/regex/doc/html/boost_regex/partial_matches.html)
  51. to see if a string ends with a partial regex match, but but it's not in std::regex yet.
  52. Instead, we'll the regex into a partial match regex operating as a full match on the reverse iterators of the input.
  53. - /abcd/ -> ^(dcba|cba|ba|a) -> ^((?:(?:(?:(?:d)?c)?b)?a)
  54. - /a|b/ -> ^(a|b)
  55. - /a*?/ -> error, could match ""
  56. - /a*b/ -> ^((?:b)?a*+) (final repetitions become eager)
  57. - /.*?ab/ -> ^((?:b)?a) (omit .*)
  58. - /a.*?b/ -> ^((?:b)?.*?a) (keep reluctant matches)
  59. - /a(bc)d/ -> ^((?:(?:d)?(?:(?:c)?b))?a)
  60. - /a(bc|de)/ -> ^((?:(?:(?:e)?d)?|(?:(?:c)?b)?)?a)
  61. - /ab{2,4}c/ -> ^cbbb?b?a -> ^((?:(?:(?:(?:(?:c)?b)?b)?b?)?b?)?a)
  62. The regex will match a reversed string fully, and the end of the first (And only) capturing group will indicate the reversed start of the original partial pattern.
  63. All other groups are turned into non-capturing groups, and reluctant quantifiers are ignored.
  64. */
  65. std::string regex_to_reversed_partial_regex(const std::string & pattern) {
  66. auto it = pattern.begin();
  67. const auto end = pattern.end();
  68. std::function<std::string()> process = [&]() {
  69. std::vector<std::vector<std::string>> alternatives(1);
  70. std::vector<std::string> * sequence = &alternatives.back();
  71. while (it != end) {
  72. if (*it == '[') {
  73. auto start = it;
  74. ++it;
  75. while (it != end) {
  76. if ((*it == '\\') && (++it != end)) {
  77. ++it;
  78. } else if ((it != end) && (*it == ']')) {
  79. break;
  80. } else {
  81. ++it;
  82. }
  83. }
  84. if (it == end) {
  85. throw std::runtime_error("Unmatched '[' in pattern");
  86. }
  87. ++it;
  88. sequence->push_back(std::string(start, it));
  89. } else if (*it == '*' || *it == '?' || *it == '+') {
  90. if (sequence->empty()) {
  91. throw std::runtime_error("Quantifier without preceding element");
  92. }
  93. sequence->back() += *it;
  94. auto is_star = *it == '*';
  95. ++it;
  96. if (is_star) {
  97. if (*it == '?') {
  98. ++it;
  99. }
  100. }
  101. } else if (*it == '{') {
  102. if (sequence->empty()) {
  103. throw std::runtime_error("Repetition without preceding element");
  104. }
  105. ++it;
  106. auto start = it;
  107. while (it != end && *it != '}') {
  108. ++it;
  109. }
  110. if (it == end) {
  111. throw std::runtime_error("Unmatched '{' in pattern");
  112. }
  113. auto parts = string_split(std::string(start, it), ",");
  114. ++it;
  115. if (parts.size() > 2) {
  116. throw std::runtime_error("Invalid repetition range in pattern");
  117. }
  118. auto parseOptInt = [&](const std::string & s, const std::optional<int> & def = std::nullopt) -> std::optional<int> {
  119. if (s.empty()) {
  120. return def;
  121. }
  122. return std::stoi(s);
  123. };
  124. auto min = parseOptInt(parts[0], 0);
  125. auto max = parts.size() == 1 ? min : parseOptInt(parts[1]);
  126. if (min && max && *max < *min) {
  127. throw std::runtime_error("Invalid repetition range in pattern");
  128. }
  129. // Brutal but... let's repeat at least min times, then ? for the delta between min & max (or * for unbounded)
  130. auto part = sequence->back();
  131. sequence->pop_back();
  132. for (int i = 0; i < *min; i++) {
  133. sequence->push_back(part);
  134. }
  135. if (max) {
  136. for (int i = *min; i < *max; i++) {
  137. sequence->push_back(part + "?");
  138. }
  139. } else {
  140. sequence->push_back(part + "*");
  141. }
  142. } else if (*it == '(') {
  143. ++it;
  144. if (it != end && *it == '?' && (it + 1 != end) && *(it + 1) == ':') {
  145. it += 2;
  146. }
  147. auto sub = process();
  148. if (*it != ')') {
  149. throw std::runtime_error("Unmatched '(' in pattern");
  150. }
  151. ++it;
  152. auto & part = sequence->emplace_back("(?:");
  153. part += sub;
  154. part += ")";
  155. } else if (*it == ')') {
  156. break;
  157. } else if (*it == '|') {
  158. ++it;
  159. alternatives.emplace_back();
  160. sequence = &alternatives.back();
  161. } else if (*it == '\\' && (++it != end)) {
  162. auto str = std::string("\\") + *it;
  163. sequence->push_back(str);
  164. ++it;
  165. } else if (it != end) {
  166. sequence->push_back(std::string(1, *it));
  167. ++it;
  168. }
  169. }
  170. // /abcd/ -> ^(dcba|cba|ba|a) -> ^((?:(?:(?:d)?c)?b)?a)
  171. // if n(=4) parts, opening n-1(=3) non-capturing groups after the 1 capturing group
  172. // We'll do the outermost capturing group and final .* in the enclosing function.
  173. std::vector<std::string> res_alts;
  174. for (const auto & parts : alternatives) {
  175. auto & res = res_alts.emplace_back();
  176. for (size_t i = 0; i < parts.size() - 1; i++) {
  177. res += "(?:";
  178. }
  179. for (auto it = parts.rbegin(); it != parts.rend(); ++it) {
  180. res += *it;
  181. if (it != parts.rend() - 1) {
  182. res += ")?";
  183. }
  184. }
  185. }
  186. return string_join(res_alts, "|");
  187. };
  188. auto res = process();
  189. if (it != end) {
  190. throw std::runtime_error("Unmatched '(' in pattern");
  191. }
  192. return "^(" + res + ")";
  193. }