| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288 |
// Tests common_regex (especially its support for partial final matches).
- #include "common.h"
- #include "regex-partial.h"
- #include <sstream>
- #include <iostream>
- #include <optional>
/// Checks that `actual` equals `expected` (compared with T's operator!=).
/// On a mismatch, both values are written to std::cerr and a
/// std::runtime_error("Test failed") is thrown; otherwise nothing happens.
/// T must be streamable to std::ostream.
template <class T> static void assert_equals(const T & expected, const T & actual) {
    const bool matches = !(expected != actual);
    if (matches) {
        return;
    }
    std::cerr << "Expected: " << expected << std::endl
              << " Actual: " << actual << std::endl
              << std::flush;
    throw std::runtime_error("Test failed");
}
- struct test_case {
- std::string pattern;
- struct input_output {
- std::string input;
- common_regex_match output;
- };
- std::vector<input_output> inputs_outputs;
- };
- static std::string common_regex_match_type_name(common_regex_match_type type) {
- switch (type) {
- case COMMON_REGEX_MATCH_TYPE_NONE:
- return "COMMON_REGEX_MATCH_TYPE_NONE";
- case COMMON_REGEX_MATCH_TYPE_PARTIAL:
- return "COMMON_REGEX_MATCH_TYPE_PARTIAL";
- case COMMON_REGEX_MATCH_TYPE_FULL:
- return "COMMON_REGEX_MATCH_TYPE_FULL";
- }
- return "?";
- }
- static void test_regex() {
- printf("[%s]\n", __func__);
- auto test = [](const test_case & test_case) {
- common_regex cr(test_case.pattern);
- std::cout << "Testing pattern: /" << test_case.pattern << "/\n";
- // std::cout << " partial rev: " << cr.reversed_partial_pattern.str() << '\n';
- for (const auto & input_output : test_case.inputs_outputs) {
- std::cout << " Input: " << input_output.input << '\n';
- auto m = cr.search(input_output.input, 0);
- if (m != input_output.output) {
- auto match_to_str = [&](const std::optional<common_regex_match> & m) {
- std::ostringstream ss;
- if (m->type == COMMON_REGEX_MATCH_TYPE_NONE) {
- ss << "<no match>";
- } else {
- GGML_ASSERT(!input_output.output.groups.empty());
- std::vector<std::string> parts;
- for (const auto & g : m->groups) {
- parts.push_back("{" + std::to_string(g.begin) + ", " + std::to_string(g.end) + "}");
- }
- ss << "{" << common_regex_match_type_name(m->type) << ", {" << string_join(parts, ", ") << "}}";
- }
- return ss.str();
- };
- std::cout << " Expected: " << match_to_str(input_output.output) << '\n';
- std::cout << " Got: " << match_to_str(m) << '\n';
- std::cout << " Inverted pattern: /" << regex_to_reversed_partial_regex(test_case.pattern) << "/\n";
- throw std::runtime_error("Test failed");
- }
- }
- };
- test({
- "a",
- {
- {"a", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 1}}}},
- {"b", {COMMON_REGEX_MATCH_TYPE_NONE, {}}},
- {"ab", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 1}}}},
- {"ba", {COMMON_REGEX_MATCH_TYPE_FULL, {{1, 2}}}},
- }
- });
- test({
- "abcd",
- {
- {"abcd", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 4}}}},
- {"abcde", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 4}}}},
- {"abc", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 3}}}},
- {"ab", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 2}}}},
- {"a", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 1}}}},
- {"d", {}},
- {"bcd", {}},
- {"cde", {}},
- {"cd", {}},
- {"yeah ab", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{5, 7}}}},
- {"abbie", {}},
- {"", {}},
- }
- });
- test({
- ".*?ab",
- {
- {"ab", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 2}}}},
- {"abc", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 2}}}},
- {"dab", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 3}}}},
- {"dabc", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 3}}}},
- {"da", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 2}}}},
- {"d", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 1}}}},
- }
- });
- test({
- "a.*?b",
- {
- {"ab", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 2}}}},
- {"abc", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 2}}}},
- {"a b", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 3}}}},
- {"a", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 1}}}},
- {"argh", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 4}}}},
- {"d", {}},
- {"b", {}},
- }
- });
- test({
- "ab(?:cd){2,4}ef",
- {
- // {"ab", {COMMON_REGEX_MATCH_TYPE_PARTIAL, 0, {}}},
- {"ab", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 2}}}},
- {"abcd", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 4}}}},
- {"abcde", {}},
- {"abcdef", {}},
- {"abcdcd", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 6}}}},
- {"abcdcde", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 7}}}},
- {"abcdcdef", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 8}}}},
- {"abcdcdcdcdef", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 12}}}},
- {"abcdcdcdcdcdef", {}},
- {"abcde", {}},
- {"yea", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{2, 3}}}},
- }
- });
- test({
- "a(?:rte| pure )fact",
- {
- {"a", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 1}}}},
- {"art", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 3}}}},
- {"artefa", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 6}}}},
- {"fact", {}},
- {"an arte", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{3, 7}}}},
- {"artefact", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 8}}}},
- {"an artefact", {COMMON_REGEX_MATCH_TYPE_FULL, {{3, 11}}}},
- {"a pure", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 6}}}},
- {"a pure fact", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 11}}}},
- {"it's a pure fact", {COMMON_REGEX_MATCH_TYPE_FULL, {{5, 16}}}},
- {"" , {}},
- {"pure", {}},
- {"pure fact", {}},
- }
- });
- test({
- "abc",
- {
- {" abcc", {COMMON_REGEX_MATCH_TYPE_FULL, {{1, 4}}}},
- {"ab", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 2}}}},
- {"abc", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 3}}}},
- {" ab", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{1, 3}}}},
- {"a", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 1}}}},
- {"b", {}},
- {"c", {}},
- {"", {}},
- }
- });
- test({
- "(?:abc)?\\s*def",
- {
- {"ab", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 2}}}},
- {"abc", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 3}}}},
- {"abc ", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 4}}}},
- {"abc d", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 5}}}},
- {"abc de", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 6}}}},
- {"abc def", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 7}}}},
- {"abc defg", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 7}}}},
- {"abc defgh", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 7}}}},
- {"abcde", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 5}}}},
- {"abcdefgh", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 6}}}},
- {" d", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 2}}}},
- {"def", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 3}}}},
- }
- });
- test({
- "a+b",
- {
- {"aaab", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 4}}}},
- {"aaa", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 3}}}},
- {"ab", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 2}}}},
- }
- });
- test({
- "(?:"
- "(```(?:xml|json)?\\n\\s*)?" // match 1 (block_start)
- "(" // match 2 (open_tag)
- "<tool_call>"
- "|<function_call>"
- "|<tool>"
- "|<tools>"
- "|<response>"
- "|<json>"
- "|<xml>"
- "|<JSON>"
- ")?"
- "(\\s*\\{\\s*\"name\"\\s*:)" // match 3 (named tool call)
- ")"
- "|<function=([^>]+)>" // match 4 (function name)
- "|<function name=\"([^\"]+)\">", // match 5 (function name again)
- {
- {"{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 8}, {54, 54}, {54, 54}, {0, 8}, {54, 54}, {54, 54}}}},
- {"<tool_call> {\"name", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 18}}}},
- {"<tool_call>{\"name", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 17}}}},
- {"Let's call something\n<tool_call>{\"name", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{21, 38}}}},
- {"Ok then<tool_call>{\"name", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{7, 24}}}},
- {"{\"name", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 6}}}},
- {"Ok then{\"name", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{7, 13}}}},
- {"<tool_call> {\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 20}, {66, 66}, {0, 11}, {11, 20}, {66, 66}, {66, 66}}}},
- {"<function_call> {\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 24}, {70, 70}, {0, 15}, {15, 24}, {70, 70}, {70, 70}}}},
- {"<function name=\"special_function\"> {\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 34}, {89, 89}, {89, 89}, {89, 89}, {89, 89}, {16, 32}}}},
- {"<function=all>", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 14}, {14, 14}, {14, 14}, {14, 14}, {10, 13}, {14, 14}}}},
- }
- });
- }
- static void test_regex_to_reversed_partial_regex() {
- printf("[%s]\n", __func__);
- assert_equals<std::string>(
- "((?:(?:c)?b)?a)[\\s\\S]*",
- regex_to_reversed_partial_regex("abc"));
- assert_equals<std::string>(
- "(a+)[\\s\\S]*",
- regex_to_reversed_partial_regex("a+"));
- assert_equals<std::string>(
- "(a*)[\\s\\S]*",
- regex_to_reversed_partial_regex("a*"));
- assert_equals<std::string>(
- "(a?)[\\s\\S]*",
- regex_to_reversed_partial_regex("a?"));
- assert_equals<std::string>(
- "([a-z])[\\s\\S]*",
- regex_to_reversed_partial_regex("[a-z]"));
- assert_equals<std::string>(
- "((?:\\w+)?[a-z])[\\s\\S]*",
- regex_to_reversed_partial_regex("[a-z]\\w+"));
- assert_equals<std::string>(
- "((?:a|b))[\\s\\S]*",
- regex_to_reversed_partial_regex("(?:a|b)"));
- assert_equals<std::string>(
- "((?:(?:(?:d)?c)?b)?a)[\\s\\S]*",
- regex_to_reversed_partial_regex("abcd"));
- assert_equals<std::string>(
- "((?:b)?a*)[\\s\\S]*", // TODO: ((?:b)?a*+).* ??
- regex_to_reversed_partial_regex("a*b"));
- assert_equals<std::string>(
- "((?:(?:b)?a)?.*)[\\s\\S]*",
- regex_to_reversed_partial_regex(".*?ab"));
- assert_equals<std::string>(
- "((?:(?:b)?.*)?a)[\\s\\S]*",
- regex_to_reversed_partial_regex("a.*?b"));
- assert_equals<std::string>(
- "((?:(?:d)?(?:(?:c)?b))?a)[\\s\\S]*",
- regex_to_reversed_partial_regex("a(bc)d"));
- assert_equals<std::string>(
- "((?:(?:(?:c)?b|(?:e)?d))?a)[\\s\\S]*",
- regex_to_reversed_partial_regex("a(bc|de)"));
- assert_equals<std::string>(
- "((?:(?:(?:(?:(?:c)?b?)?b?)?b)?b)?a)[\\s\\S]*",
- regex_to_reversed_partial_regex("ab{2,4}c"));
- }
- int main() {
- test_regex_to_reversed_partial_regex();
- test_regex();
- std::cout << "All tests passed.\n";
- }
|