1
0

test-regex-partial.cpp 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288
  1. // Tests common_regex (esp. its partial final matches support).
  2. #include "common.h"
  3. #include "regex-partial.h"
  4. #include <sstream>
  5. #include <iostream>
  6. #include <optional>
  7. template <class T> static void assert_equals(const T & expected, const T & actual) {
  8. if (expected != actual) {
  9. std::cerr << "Expected: " << expected << std::endl;
  10. std::cerr << " Actual: " << actual << std::endl;
  11. std::cerr << std::flush;
  12. throw std::runtime_error("Test failed");
  13. }
  14. }
  15. struct test_case {
  16. std::string pattern;
  17. struct input_output {
  18. std::string input;
  19. common_regex_match output;
  20. };
  21. std::vector<input_output> inputs_outputs;
  22. };
  23. static std::string common_regex_match_type_name(common_regex_match_type type) {
  24. switch (type) {
  25. case COMMON_REGEX_MATCH_TYPE_NONE:
  26. return "COMMON_REGEX_MATCH_TYPE_NONE";
  27. case COMMON_REGEX_MATCH_TYPE_PARTIAL:
  28. return "COMMON_REGEX_MATCH_TYPE_PARTIAL";
  29. case COMMON_REGEX_MATCH_TYPE_FULL:
  30. return "COMMON_REGEX_MATCH_TYPE_FULL";
  31. }
  32. return "?";
  33. }
  34. static void test_regex() {
  35. printf("[%s]\n", __func__);
  36. auto test = [](const test_case & test_case) {
  37. common_regex cr(test_case.pattern);
  38. std::cout << "Testing pattern: /" << test_case.pattern << "/\n";
  39. // std::cout << " partial rev: " << cr.reversed_partial_pattern.str() << '\n';
  40. for (const auto & input_output : test_case.inputs_outputs) {
  41. std::cout << " Input: " << input_output.input << '\n';
  42. auto m = cr.search(input_output.input, 0);
  43. if (m != input_output.output) {
  44. auto match_to_str = [&](const std::optional<common_regex_match> & m) {
  45. std::ostringstream ss;
  46. if (m->type == COMMON_REGEX_MATCH_TYPE_NONE) {
  47. ss << "<no match>";
  48. } else {
  49. GGML_ASSERT(!input_output.output.groups.empty());
  50. std::vector<std::string> parts;
  51. for (const auto & g : m->groups) {
  52. parts.push_back("{" + std::to_string(g.begin) + ", " + std::to_string(g.end) + "}");
  53. }
  54. ss << "{" << common_regex_match_type_name(m->type) << ", {" << string_join(parts, ", ") << "}}";
  55. }
  56. return ss.str();
  57. };
  58. std::cout << " Expected: " << match_to_str(input_output.output) << '\n';
  59. std::cout << " Got: " << match_to_str(m) << '\n';
  60. std::cout << " Inverted pattern: /" << regex_to_reversed_partial_regex(test_case.pattern) << "/\n";
  61. throw std::runtime_error("Test failed");
  62. }
  63. }
  64. };
  65. test({
  66. "a",
  67. {
  68. {"a", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 1}}}},
  69. {"b", {COMMON_REGEX_MATCH_TYPE_NONE, {}}},
  70. {"ab", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 1}}}},
  71. {"ba", {COMMON_REGEX_MATCH_TYPE_FULL, {{1, 2}}}},
  72. }
  73. });
  74. test({
  75. "abcd",
  76. {
  77. {"abcd", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 4}}}},
  78. {"abcde", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 4}}}},
  79. {"abc", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 3}}}},
  80. {"ab", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 2}}}},
  81. {"a", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 1}}}},
  82. {"d", {}},
  83. {"bcd", {}},
  84. {"cde", {}},
  85. {"cd", {}},
  86. {"yeah ab", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{5, 7}}}},
  87. {"abbie", {}},
  88. {"", {}},
  89. }
  90. });
  91. test({
  92. ".*?ab",
  93. {
  94. {"ab", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 2}}}},
  95. {"abc", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 2}}}},
  96. {"dab", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 3}}}},
  97. {"dabc", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 3}}}},
  98. {"da", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 2}}}},
  99. {"d", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 1}}}},
  100. }
  101. });
  102. test({
  103. "a.*?b",
  104. {
  105. {"ab", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 2}}}},
  106. {"abc", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 2}}}},
  107. {"a b", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 3}}}},
  108. {"a", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 1}}}},
  109. {"argh", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 4}}}},
  110. {"d", {}},
  111. {"b", {}},
  112. }
  113. });
  114. test({
  115. "ab(?:cd){2,4}ef",
  116. {
  117. // {"ab", {COMMON_REGEX_MATCH_TYPE_PARTIAL, 0, {}}},
  118. {"ab", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 2}}}},
  119. {"abcd", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 4}}}},
  120. {"abcde", {}},
  121. {"abcdef", {}},
  122. {"abcdcd", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 6}}}},
  123. {"abcdcde", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 7}}}},
  124. {"abcdcdef", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 8}}}},
  125. {"abcdcdcdcdef", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 12}}}},
  126. {"abcdcdcdcdcdef", {}},
  127. {"abcde", {}},
  128. {"yea", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{2, 3}}}},
  129. }
  130. });
  131. test({
  132. "a(?:rte| pure )fact",
  133. {
  134. {"a", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 1}}}},
  135. {"art", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 3}}}},
  136. {"artefa", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 6}}}},
  137. {"fact", {}},
  138. {"an arte", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{3, 7}}}},
  139. {"artefact", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 8}}}},
  140. {"an artefact", {COMMON_REGEX_MATCH_TYPE_FULL, {{3, 11}}}},
  141. {"a pure", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 6}}}},
  142. {"a pure fact", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 11}}}},
  143. {"it's a pure fact", {COMMON_REGEX_MATCH_TYPE_FULL, {{5, 16}}}},
  144. {"" , {}},
  145. {"pure", {}},
  146. {"pure fact", {}},
  147. }
  148. });
  149. test({
  150. "abc",
  151. {
  152. {" abcc", {COMMON_REGEX_MATCH_TYPE_FULL, {{1, 4}}}},
  153. {"ab", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 2}}}},
  154. {"abc", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 3}}}},
  155. {" ab", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{1, 3}}}},
  156. {"a", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 1}}}},
  157. {"b", {}},
  158. {"c", {}},
  159. {"", {}},
  160. }
  161. });
  162. test({
  163. "(?:abc)?\\s*def",
  164. {
  165. {"ab", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 2}}}},
  166. {"abc", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 3}}}},
  167. {"abc ", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 4}}}},
  168. {"abc d", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 5}}}},
  169. {"abc de", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 6}}}},
  170. {"abc def", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 7}}}},
  171. {"abc defg", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 7}}}},
  172. {"abc defgh", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 7}}}},
  173. {"abcde", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 5}}}},
  174. {"abcdefgh", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 6}}}},
  175. {" d", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 2}}}},
  176. {"def", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 3}}}},
  177. }
  178. });
  179. test({
  180. "a+b",
  181. {
  182. {"aaab", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 4}}}},
  183. {"aaa", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 3}}}},
  184. {"ab", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 2}}}},
  185. }
  186. });
  187. test({
  188. "(?:"
  189. "(```(?:xml|json)?\\n\\s*)?" // match 1 (block_start)
  190. "(" // match 2 (open_tag)
  191. "<tool_call>"
  192. "|<function_call>"
  193. "|<tool>"
  194. "|<tools>"
  195. "|<response>"
  196. "|<json>"
  197. "|<xml>"
  198. "|<JSON>"
  199. ")?"
  200. "(\\s*\\{\\s*\"name\"\\s*:)" // match 3 (named tool call)
  201. ")"
  202. "|<function=([^>]+)>" // match 4 (function name)
  203. "|<function name=\"([^\"]+)\">", // match 5 (function name again)
  204. {
  205. {"{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 8}, {54, 54}, {54, 54}, {0, 8}, {54, 54}, {54, 54}}}},
  206. {"<tool_call> {\"name", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 18}}}},
  207. {"<tool_call>{\"name", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 17}}}},
  208. {"Let's call something\n<tool_call>{\"name", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{21, 38}}}},
  209. {"Ok then<tool_call>{\"name", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{7, 24}}}},
  210. {"{\"name", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 6}}}},
  211. {"Ok then{\"name", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{7, 13}}}},
  212. {"<tool_call> {\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 20}, {66, 66}, {0, 11}, {11, 20}, {66, 66}, {66, 66}}}},
  213. {"<function_call> {\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 24}, {70, 70}, {0, 15}, {15, 24}, {70, 70}, {70, 70}}}},
  214. {"<function name=\"special_function\"> {\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 34}, {89, 89}, {89, 89}, {89, 89}, {89, 89}, {16, 32}}}},
  215. {"<function=all>", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 14}, {14, 14}, {14, 14}, {14, 14}, {10, 13}, {14, 14}}}},
  216. }
  217. });
  218. }
  219. static void test_regex_to_reversed_partial_regex() {
  220. printf("[%s]\n", __func__);
  221. assert_equals<std::string>(
  222. "((?:(?:c)?b)?a)[\\s\\S]*",
  223. regex_to_reversed_partial_regex("abc"));
  224. assert_equals<std::string>(
  225. "(a+)[\\s\\S]*",
  226. regex_to_reversed_partial_regex("a+"));
  227. assert_equals<std::string>(
  228. "(a*)[\\s\\S]*",
  229. regex_to_reversed_partial_regex("a*"));
  230. assert_equals<std::string>(
  231. "(a?)[\\s\\S]*",
  232. regex_to_reversed_partial_regex("a?"));
  233. assert_equals<std::string>(
  234. "([a-z])[\\s\\S]*",
  235. regex_to_reversed_partial_regex("[a-z]"));
  236. assert_equals<std::string>(
  237. "((?:\\w+)?[a-z])[\\s\\S]*",
  238. regex_to_reversed_partial_regex("[a-z]\\w+"));
  239. assert_equals<std::string>(
  240. "((?:a|b))[\\s\\S]*",
  241. regex_to_reversed_partial_regex("(?:a|b)"));
  242. assert_equals<std::string>(
  243. "((?:(?:(?:d)?c)?b)?a)[\\s\\S]*",
  244. regex_to_reversed_partial_regex("abcd"));
  245. assert_equals<std::string>(
  246. "((?:b)?a*)[\\s\\S]*", // TODO: ((?:b)?a*+).* ??
  247. regex_to_reversed_partial_regex("a*b"));
  248. assert_equals<std::string>(
  249. "((?:(?:b)?a)?.*)[\\s\\S]*",
  250. regex_to_reversed_partial_regex(".*?ab"));
  251. assert_equals<std::string>(
  252. "((?:(?:b)?.*)?a)[\\s\\S]*",
  253. regex_to_reversed_partial_regex("a.*?b"));
  254. assert_equals<std::string>(
  255. "((?:(?:d)?(?:(?:c)?b))?a)[\\s\\S]*",
  256. regex_to_reversed_partial_regex("a(bc)d"));
  257. assert_equals<std::string>(
  258. "((?:(?:(?:c)?b|(?:e)?d))?a)[\\s\\S]*",
  259. regex_to_reversed_partial_regex("a(bc|de)"));
  260. assert_equals<std::string>(
  261. "((?:(?:(?:(?:(?:c)?b?)?b?)?b)?b)?a)[\\s\\S]*",
  262. regex_to_reversed_partial_regex("ab{2,4}c"));
  263. }
  264. int main() {
  265. test_regex_to_reversed_partial_regex();
  266. test_regex();
  267. std::cout << "All tests passed.\n";
  268. }