| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449 |
- #include "tests.h"
- #include "peg-parser.h"
- #include <string>
- #include <sstream>
- #include <iomanip>
- #include <cctype>
- static void assert_result_equal(testing & t, common_peg_parse_result_type expected, common_peg_parse_result_type actual) {
- t.assert_equal(common_peg_parse_result_type_name(expected), common_peg_parse_result_type_name(actual));
- }
// Renders `str` so it can be embedded safely in a test name.
//
// Printable ASCII bytes (0x20..0x7E) are copied verbatim; every other byte is
// written as a "\xNN" escape with exactly two lowercase hex digits.
//
// The printable check is an explicit ASCII range instead of std::isprint():
// std::isprint() consults the current C locale, so under a single-byte locale
// bytes >= 0x80 could be classified as printable and emitted raw, which would
// hide exactly the malformed/partial UTF-8 bytes these tests are about.
// In the default "C" locale the output is unchanged.
static std::string hex_dump(const std::string& str) {
    static constexpr char hex_digits[] = "0123456789abcdef";

    std::string out;
    out.reserve(str.size());  // lower bound; escapes grow the string further

    for (const unsigned char c : str) {
        if (c >= 0x20 && c <= 0x7E) {
            out += static_cast<char>(c);
        } else {
            out += "\\x";
            out += hex_digits[c >> 4];
            out += hex_digits[c & 0x0F];
        }
    }
    return out;
}
- void test_unicode(testing &t) {
- struct test_case {
- std::string input;
- std::string expected_text;
- common_peg_parse_result_type expected_result;
- };
- t.test("any", [](testing &t) {
- std::vector<test_case> test_cases {
- // Valid UTF-8 sequences
- {"Hello", "Hello", COMMON_PEG_PARSE_RESULT_SUCCESS},
- {std::string("Caf\xC3\xA9"), std::string("Caf\xC3\xA9"), COMMON_PEG_PARSE_RESULT_SUCCESS},
- {std::string("\xE4\xBD\xA0\xE5\xA5\xBD"), std::string("\xE4\xBD\xA0\xE5\xA5\xBD"), COMMON_PEG_PARSE_RESULT_SUCCESS},
- {std::string("\xF0\x9F\x9A\x80"), std::string("\xF0\x9F\x9A\x80"), COMMON_PEG_PARSE_RESULT_SUCCESS},
- // Incomplete UTF-8 sequences (partial bytes at end)
- {std::string("Caf\xC3"), "Caf", COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT},
- {std::string("\xE4\xBD"), "", COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT},
- {std::string("\xF0\x9F\x9A"), "", COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT},
- // Invalid/malformed UTF-8 sequences
- {std::string("\xFF\xFE"), "", COMMON_PEG_PARSE_RESULT_FAIL},
- {std::string("Hello\x80World"), "Hello", COMMON_PEG_PARSE_RESULT_FAIL},
- {std::string("\xC3\x28"), "", COMMON_PEG_PARSE_RESULT_FAIL},
- };
- auto parser = build_peg_parser([](common_peg_parser_builder& p) {
- return p.sequence({p.one_or_more(p.any()), p.end()});
- });
- for (size_t i = 0; i < test_cases.size(); i++) {
- const auto & tc = test_cases[i];
- std::string test_name = "case " + std::to_string(i) + ": " + hex_dump(tc.input);
- t.test(test_name, [&](testing &t) {
- common_peg_parse_context ctx(tc.input, true);
- auto result = parser.parse(ctx);
- // Assert result type matches
- assert_result_equal(t, tc.expected_result, result.type);
- // Assert matched text if success or need_more_input
- if (result.success() || result.need_more_input()) {
- std::string matched = tc.input.substr(result.start, result.end - result.start);
- t.assert_equal(tc.expected_text, matched);
- }
- });
- }
- });
- t.test("char classes", [](testing &t) {
- t.test("unicode range U+4E00-U+9FFF (CJK)", [](testing &t) {
- std::vector<test_case> test_cases {
- // Within range - CJK Unified Ideographs
- {std::string("\xE4\xB8\x80"), std::string("\xE4\xB8\x80"), COMMON_PEG_PARSE_RESULT_SUCCESS}, // U+4E00
- {std::string("\xE4\xBD\xA0"), std::string("\xE4\xBD\xA0"), COMMON_PEG_PARSE_RESULT_SUCCESS}, // U+4F60
- {std::string("\xE5\xA5\xBD"), std::string("\xE5\xA5\xBD"), COMMON_PEG_PARSE_RESULT_SUCCESS}, // U+597D
- {std::string("\xE9\xBF\xBF"), std::string("\xE9\xBF\xBF"), COMMON_PEG_PARSE_RESULT_SUCCESS}, // U+9FFF
- // Outside range - should fail
- {"a", "", COMMON_PEG_PARSE_RESULT_FAIL}, // ASCII
- {std::string("\xE4\xB7\xBF"), "", COMMON_PEG_PARSE_RESULT_FAIL}, // U+4DFF (before range)
- {std::string("\xEA\x80\x80"), "", COMMON_PEG_PARSE_RESULT_FAIL}, // U+A000 (after range)
- // Incomplete sequences in range
- {std::string("\xE4\xB8"), "", COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT}, // Incomplete U+4E00
- {std::string("\xE5\xA5"), "", COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT}, // Incomplete U+597D
- };
- auto parser = build_peg_parser([](common_peg_parser_builder& p) {
- return p.sequence({p.chars(R"([\u4E00-\u9FFF])"), p.end()});
- });
- for (size_t i = 0; i < test_cases.size(); i++) {
- const auto & tc = test_cases[i];
- std::string test_name = "case " + std::to_string(i) + ": " + hex_dump(tc.input);
- t.test(test_name, [&](testing &t) {
- common_peg_parse_context ctx(tc.input, true);
- auto result = parser.parse(ctx);
- // Assert result type matches
- assert_result_equal(t, tc.expected_result, result.type);
- // Assert matched text if success or need_more_input
- if (result.success() || result.need_more_input()) {
- std::string matched = tc.input.substr(result.start, result.end - result.start);
- t.assert_equal(tc.expected_text, matched);
- }
- });
- }
- });
- t.test("unicode range U+1F600-U+1F64F (emoticons)", [](testing &t) {
- std::vector<test_case> test_cases {
- // Within range - Emoticons (all 4-byte UTF-8)
- {std::string("\xF0\x9F\x98\x80"), std::string("\xF0\x9F\x98\x80"), COMMON_PEG_PARSE_RESULT_SUCCESS}, // U+1F600
- {std::string("\xF0\x9F\x98\x81"), std::string("\xF0\x9F\x98\x81"), COMMON_PEG_PARSE_RESULT_SUCCESS}, // U+1F601
- {std::string("\xF0\x9F\x99\x8F"), std::string("\xF0\x9F\x99\x8F"), COMMON_PEG_PARSE_RESULT_SUCCESS}, // U+1F64F
- // Outside range
- {std::string("\xF0\x9F\x97\xBF"), "", COMMON_PEG_PARSE_RESULT_FAIL}, // U+1F5FF (before range)
- {std::string("\xF0\x9F\x99\x90"), "", COMMON_PEG_PARSE_RESULT_FAIL}, // U+1F650 (after range)
- {std::string("\xF0\x9F\x9A\x80"), "", COMMON_PEG_PARSE_RESULT_FAIL}, // U+1F680 (outside range)
- // Incomplete sequences
- {std::string("\xF0\x9F\x98"), "", COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT}, // Incomplete emoji
- {std::string("\xF0\x9F"), "", COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT}, // Very incomplete
- };
- auto parser = build_peg_parser([](common_peg_parser_builder& p) {
- return p.sequence({p.chars(R"([\U0001F600-\U0001F64F])"), p.end()});
- });
- for (size_t i = 0; i < test_cases.size(); i++) {
- const auto & tc = test_cases[i];
- std::string test_name = "case " + std::to_string(i) + ": " + hex_dump(tc.input);
- t.test(test_name, [&](testing &t) {
- common_peg_parse_context ctx(tc.input, true);
- auto result = parser.parse(ctx);
- // Assert result type matches
- assert_result_equal(t, tc.expected_result, result.type);
- // Assert matched text if success or need_more_input
- if (result.success() || result.need_more_input()) {
- std::string matched = tc.input.substr(result.start, result.end - result.start);
- t.assert_equal(tc.expected_text, matched);
- }
- });
- }
- });
- t.test("mixed unicode ranges", [](testing &t) {
- std::vector<test_case> test_cases {
- // Match CJK
- {std::string("\xE4\xB8\x80"), std::string("\xE4\xB8\x80"), COMMON_PEG_PARSE_RESULT_SUCCESS}, // U+4E00
- {std::string("\xE4\xBD\xA0"), std::string("\xE4\xBD\xA0"), COMMON_PEG_PARSE_RESULT_SUCCESS}, // U+4F60
- // Match emoticons
- {std::string("\xF0\x9F\x98\x80"), std::string("\xF0\x9F\x98\x80"), COMMON_PEG_PARSE_RESULT_SUCCESS}, // U+1F600
- // Match ASCII digits
- {"5", "5", COMMON_PEG_PARSE_RESULT_SUCCESS},
- // Don't match outside any range
- {"a", "", COMMON_PEG_PARSE_RESULT_FAIL},
- {std::string("\xF0\x9F\x9A\x80"), "", COMMON_PEG_PARSE_RESULT_FAIL}, // U+1F680
- // Incomplete
- {std::string("\xE4\xB8"), "", COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT},
- {std::string("\xF0\x9F\x98"), "", COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT},
- };
- auto parser = build_peg_parser([](common_peg_parser_builder& p) {
- return p.sequence({p.chars(R"([\u4E00-\u9FFF\U0001F600-\U0001F64F0-9])"), p.end()});
- });
- for (size_t i = 0; i < test_cases.size(); i++) {
- const auto & tc = test_cases[i];
- std::string test_name = "case " + std::to_string(i) + ": " + hex_dump(tc.input);
- t.test(test_name, [&](testing &t) {
- common_peg_parse_context ctx(tc.input, true);
- auto result = parser.parse(ctx);
- // Assert result type matches
- assert_result_equal(t, tc.expected_result, result.type);
- // Assert matched text if success or need_more_input
- if (result.success() || result.need_more_input()) {
- std::string matched = tc.input.substr(result.start, result.end - result.start);
- t.assert_equal(tc.expected_text, matched);
- }
- });
- }
- });
- });
- t.test("until parser", [](testing &t) {
- t.test("ASCII delimiter with Unicode content", [](testing &t) {
- std::vector<test_case> test_cases {
- // CJK characters before delimiter
- {std::string("\xE4\xBD\xA0\xE5\xA5\xBD</tag>"), std::string("\xE4\xBD\xA0\xE5\xA5\xBD"), COMMON_PEG_PARSE_RESULT_SUCCESS},
- // Emoji before delimiter
- {std::string("\xF0\x9F\x98\x80</tag>"), std::string("\xF0\x9F\x98\x80"), COMMON_PEG_PARSE_RESULT_SUCCESS},
- // Mixed content
- {std::string("Hello \xE4\xB8\x96\xE7\x95\x8C!</tag>"), std::string("Hello \xE4\xB8\x96\xE7\x95\x8C!"), COMMON_PEG_PARSE_RESULT_SUCCESS},
- };
- auto parser = build_peg_parser([](common_peg_parser_builder& p) {
- return p.until("</tag>");
- });
- for (size_t i = 0; i < test_cases.size(); i++) {
- const auto & tc = test_cases[i];
- std::string test_name = "case " + std::to_string(i) + ": " + hex_dump(tc.input);
- t.test(test_name, [&](testing &t) {
- common_peg_parse_context ctx(tc.input, false);
- auto result = parser.parse(ctx);
- assert_result_equal(t, tc.expected_result, result.type);
- if (result.success()) {
- std::string matched = tc.input.substr(result.start, result.end - result.start);
- t.assert_equal(tc.expected_text, matched);
- }
- });
- }
- });
- t.test("incomplete UTF-8 at end", [](testing &t) {
- std::vector<test_case> test_cases {
- // Incomplete emoji at end, no delimiter
- {std::string("content\xF0\x9F\x98"), std::string("content"), COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT},
- // Incomplete CJK at end, no delimiter
- {std::string("hello\xE4\xB8"), std::string("hello"), COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT},
- // Complete content, no delimiter (should consume all valid UTF-8)
- {std::string("\xE4\xBD\xA0\xE5\xA5\xBD"), std::string("\xE4\xBD\xA0\xE5\xA5\xBD"), COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT},
- };
- auto parser = build_peg_parser([](common_peg_parser_builder& p) {
- return p.until("</tag>");
- });
- for (size_t i = 0; i < test_cases.size(); i++) {
- const auto & tc = test_cases[i];
- std::string test_name = "case " + std::to_string(i) + ": " + hex_dump(tc.input);
- t.test(test_name, [&](testing &t) {
- common_peg_parse_context ctx(tc.input, true);
- auto result = parser.parse(ctx);
- assert_result_equal(t, tc.expected_result, result.type);
- if (result.success() || result.need_more_input()) {
- std::string matched = tc.input.substr(result.start, result.end - result.start);
- t.assert_equal(tc.expected_text, matched);
- }
- });
- }
- });
- t.test("malformed UTF-8", [](testing &t) {
- std::vector<test_case> test_cases {
- // Invalid UTF-8 bytes
- {std::string("Hello\xFF\xFE"), "", COMMON_PEG_PARSE_RESULT_FAIL},
- // Continuation byte without lead byte
- {std::string("Hello\x80World"), "", COMMON_PEG_PARSE_RESULT_FAIL},
- // Invalid continuation byte
- {std::string("\xC3\x28"), "", COMMON_PEG_PARSE_RESULT_FAIL},
- };
- auto parser = build_peg_parser([](common_peg_parser_builder& p) {
- return p.until("</tag>");
- });
- for (size_t i = 0; i < test_cases.size(); i++) {
- const auto & tc = test_cases[i];
- std::string test_name = "case " + std::to_string(i) + ": " + hex_dump(tc.input);
- t.test(test_name, [&](testing &t) {
- common_peg_parse_context ctx(tc.input, false);
- auto result = parser.parse(ctx);
- assert_result_equal(t, tc.expected_result, result.type);
- });
- }
- });
- });
- t.test("json_string parser", [](testing &t) {
- t.test("valid UTF-8 characters", [](testing &t) {
- std::vector<test_case> test_cases {
- // ASCII only
- {"Hello World\"", "Hello World", COMMON_PEG_PARSE_RESULT_SUCCESS},
- // 2-byte UTF-8 (accented characters)
- {std::string("Caf\xC3\xA9\""), std::string("Caf\xC3\xA9"), COMMON_PEG_PARSE_RESULT_SUCCESS},
- // 3-byte UTF-8 (CJK)
- {std::string("\xE4\xBD\xA0\xE5\xA5\xBD\""), std::string("\xE4\xBD\xA0\xE5\xA5\xBD"), COMMON_PEG_PARSE_RESULT_SUCCESS},
- // 4-byte UTF-8 (emoji)
- {std::string("\xF0\x9F\x98\x80\""), std::string("\xF0\x9F\x98\x80"), COMMON_PEG_PARSE_RESULT_SUCCESS},
- // Mixed content
- {std::string("Hello \xE4\xB8\x96\xE7\x95\x8C!\""), std::string("Hello \xE4\xB8\x96\xE7\x95\x8C!"), COMMON_PEG_PARSE_RESULT_SUCCESS},
- };
- for (size_t i = 0; i < test_cases.size(); i++) {
- const auto & tc = test_cases[i];
- std::string test_name = "case " + std::to_string(i) + ": " + hex_dump(tc.input);
- t.test(test_name, [&](testing &t) {
- auto parser = build_peg_parser([](common_peg_parser_builder& p) {
- return p.sequence({p.json_string_content(), p.literal("\"")});
- });
- common_peg_parse_context ctx(tc.input, false);
- auto result = parser.parse(ctx);
- assert_result_equal(t, tc.expected_result, result.type);
- if (result.success()) {
- std::string matched = tc.input.substr(result.start, result.end - result.start - 1); // -1 to exclude closing quote
- t.assert_equal(tc.expected_text, matched);
- }
- });
- }
- });
- t.test("incomplete UTF-8", [](testing &t) {
- std::vector<test_case> test_cases {
- // Incomplete 2-byte sequence
- {std::string("Caf\xC3"), std::string("Caf"), COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT},
- // Incomplete 3-byte sequence
- {std::string("Hello\xE4\xB8"), std::string("Hello"), COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT},
- // Incomplete 4-byte sequence
- {std::string("Text\xF0\x9F\x98"), std::string("Text"), COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT},
- // Incomplete at very start
- {std::string("\xE4\xBD"), std::string(""), COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT},
- };
- for (size_t i = 0; i < test_cases.size(); i++) {
- const auto & tc = test_cases[i];
- std::string test_name = "case " + std::to_string(i) + ": " + hex_dump(tc.input);
- t.test(test_name, [&](testing &t) {
- auto parser = build_peg_parser([](common_peg_parser_builder& p) {
- return p.json_string_content();
- });
- common_peg_parse_context ctx(tc.input, true);
- auto result = parser.parse(ctx);
- assert_result_equal(t, tc.expected_result, result.type);
- if (result.need_more_input()) {
- std::string matched = tc.input.substr(result.start, result.end - result.start);
- t.assert_equal(tc.expected_text, matched);
- }
- });
- }
- });
- t.test("malformed UTF-8", [](testing &t) {
- std::vector<test_case> test_cases {
- // Invalid UTF-8 bytes
- {std::string("Hello\xFF\xFE"), "", COMMON_PEG_PARSE_RESULT_FAIL},
- // Continuation byte without lead byte
- {std::string("Hello\x80World"), "", COMMON_PEG_PARSE_RESULT_FAIL},
- // Invalid continuation byte
- {std::string("\xC3\x28"), "", COMMON_PEG_PARSE_RESULT_FAIL},
- // Overlong encoding (security issue)
- {std::string("\xC0\x80"), "", COMMON_PEG_PARSE_RESULT_FAIL},
- };
- for (size_t i = 0; i < test_cases.size(); i++) {
- const auto & tc = test_cases[i];
- std::string test_name = "case " + std::to_string(i) + ": " + hex_dump(tc.input);
- t.test(test_name, [&](testing &t) {
- auto parser = build_peg_parser([](common_peg_parser_builder& p) {
- return p.json_string_content();
- });
- common_peg_parse_context ctx(tc.input, false);
- auto result = parser.parse(ctx);
- assert_result_equal(t, tc.expected_result, result.type);
- });
- }
- });
- t.test("escape sequences with UTF-8", [](testing &t) {
- std::vector<test_case> test_cases {
- // Unicode escape sequence
- {"Hello\\u0041\"", "Hello\\u0041", COMMON_PEG_PARSE_RESULT_SUCCESS},
- // Mix of UTF-8 and escape sequences
- {std::string("\xE4\xBD\xA0\\n\xE5\xA5\xBD\""), std::string("\xE4\xBD\xA0\\n\xE5\xA5\xBD"), COMMON_PEG_PARSE_RESULT_SUCCESS},
- // Escaped quote in UTF-8 string
- {std::string("\xE4\xBD\xA0\\\"\xE5\xA5\xBD\""), std::string("\xE4\xBD\xA0\\\"\xE5\xA5\xBD"), COMMON_PEG_PARSE_RESULT_SUCCESS},
- };
- for (size_t i = 0; i < test_cases.size(); i++) {
- const auto & tc = test_cases[i];
- std::string test_name = "case " + std::to_string(i) + ": " + hex_dump(tc.input);
- t.test(test_name, [&](testing &t) {
- auto parser = build_peg_parser([](common_peg_parser_builder& p) {
- return p.sequence({p.json_string_content(), p.literal("\"")});
- });
- common_peg_parse_context ctx(tc.input, false);
- auto result = parser.parse(ctx);
- assert_result_equal(t, tc.expected_result, result.type);
- if (result.success()) {
- std::string matched = tc.input.substr(result.start, result.end - result.start - 1); // -1 to exclude closing quote
- t.assert_equal(tc.expected_text, matched);
- }
- });
- }
- });
- });
- }
|