// test-unicode.cpp
  1. #include "tests.h"
  2. #include "peg-parser.h"
  3. #include <string>
  4. #include <sstream>
  5. #include <iomanip>
  6. #include <cctype>
  7. static void assert_result_equal(testing & t, common_peg_parse_result_type expected, common_peg_parse_result_type actual) {
  8. t.assert_equal(common_peg_parse_result_type_name(expected), common_peg_parse_result_type_name(actual));
  9. }
  // Renders a byte string for use in test names: printable bytes (as judged by
  // std::isprint) are copied verbatim, every other byte becomes "\x" followed
  // by exactly two lowercase hexadecimal digits.
  //
  //   str - raw bytes to render (may contain NUL and non-ASCII bytes)
  //
  // Returns the rendered string; an empty input yields an empty string.
  static std::string hex_dump(const std::string& str) {
      static const char hex_digits[] = "0123456789abcdef";

      std::string rendered;
      rendered.reserve(str.size());

      for (const unsigned char byte : str) {
          if (std::isprint(byte) != 0) {
              rendered.push_back(static_cast<char>(byte));
              continue;
          }
          // Non-printable: emit the escape with high nibble first, then low nibble.
          rendered += "\\x";
          rendered.push_back(hex_digits[byte >> 4]);
          rendered.push_back(hex_digits[byte & 0x0F]);
      }

      return rendered;
  }
  21. void test_unicode(testing &t) {
  22. struct test_case {
  23. std::string input;
  24. std::string expected_text;
  25. common_peg_parse_result_type expected_result;
  26. };
  27. t.test("any", [](testing &t) {
  28. std::vector<test_case> test_cases {
  29. // Valid UTF-8 sequences
  30. {"Hello", "Hello", COMMON_PEG_PARSE_RESULT_SUCCESS},
  31. {std::string("Caf\xC3\xA9"), std::string("Caf\xC3\xA9"), COMMON_PEG_PARSE_RESULT_SUCCESS},
  32. {std::string("\xE4\xBD\xA0\xE5\xA5\xBD"), std::string("\xE4\xBD\xA0\xE5\xA5\xBD"), COMMON_PEG_PARSE_RESULT_SUCCESS},
  33. {std::string("\xF0\x9F\x9A\x80"), std::string("\xF0\x9F\x9A\x80"), COMMON_PEG_PARSE_RESULT_SUCCESS},
  34. // Incomplete UTF-8 sequences (partial bytes at end)
  35. {std::string("Caf\xC3"), "Caf", COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT},
  36. {std::string("\xE4\xBD"), "", COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT},
  37. {std::string("\xF0\x9F\x9A"), "", COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT},
  38. // Invalid/malformed UTF-8 sequences
  39. {std::string("\xFF\xFE"), "", COMMON_PEG_PARSE_RESULT_FAIL},
  40. {std::string("Hello\x80World"), "Hello", COMMON_PEG_PARSE_RESULT_FAIL},
  41. {std::string("\xC3\x28"), "", COMMON_PEG_PARSE_RESULT_FAIL},
  42. };
  43. auto parser = build_peg_parser([](common_peg_parser_builder& p) {
  44. return p.sequence({p.one_or_more(p.any()), p.end()});
  45. });
  46. for (size_t i = 0; i < test_cases.size(); i++) {
  47. const auto & tc = test_cases[i];
  48. std::string test_name = "case " + std::to_string(i) + ": " + hex_dump(tc.input);
  49. t.test(test_name, [&](testing &t) {
  50. common_peg_parse_context ctx(tc.input, true);
  51. auto result = parser.parse(ctx);
  52. // Assert result type matches
  53. assert_result_equal(t, tc.expected_result, result.type);
  54. // Assert matched text if success or need_more_input
  55. if (result.success() || result.need_more_input()) {
  56. std::string matched = tc.input.substr(result.start, result.end - result.start);
  57. t.assert_equal(tc.expected_text, matched);
  58. }
  59. });
  60. }
  61. });
  62. t.test("char classes", [](testing &t) {
  63. t.test("unicode range U+4E00-U+9FFF (CJK)", [](testing &t) {
  64. std::vector<test_case> test_cases {
  65. // Within range - CJK Unified Ideographs
  66. {std::string("\xE4\xB8\x80"), std::string("\xE4\xB8\x80"), COMMON_PEG_PARSE_RESULT_SUCCESS}, // U+4E00
  67. {std::string("\xE4\xBD\xA0"), std::string("\xE4\xBD\xA0"), COMMON_PEG_PARSE_RESULT_SUCCESS}, // U+4F60
  68. {std::string("\xE5\xA5\xBD"), std::string("\xE5\xA5\xBD"), COMMON_PEG_PARSE_RESULT_SUCCESS}, // U+597D
  69. {std::string("\xE9\xBF\xBF"), std::string("\xE9\xBF\xBF"), COMMON_PEG_PARSE_RESULT_SUCCESS}, // U+9FFF
  70. // Outside range - should fail
  71. {"a", "", COMMON_PEG_PARSE_RESULT_FAIL}, // ASCII
  72. {std::string("\xE4\xB7\xBF"), "", COMMON_PEG_PARSE_RESULT_FAIL}, // U+4DFF (before range)
  73. {std::string("\xEA\x80\x80"), "", COMMON_PEG_PARSE_RESULT_FAIL}, // U+A000 (after range)
  74. // Incomplete sequences in range
  75. {std::string("\xE4\xB8"), "", COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT}, // Incomplete U+4E00
  76. {std::string("\xE5\xA5"), "", COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT}, // Incomplete U+597D
  77. };
  78. auto parser = build_peg_parser([](common_peg_parser_builder& p) {
  79. return p.sequence({p.chars(R"([\u4E00-\u9FFF])"), p.end()});
  80. });
  81. for (size_t i = 0; i < test_cases.size(); i++) {
  82. const auto & tc = test_cases[i];
  83. std::string test_name = "case " + std::to_string(i) + ": " + hex_dump(tc.input);
  84. t.test(test_name, [&](testing &t) {
  85. common_peg_parse_context ctx(tc.input, true);
  86. auto result = parser.parse(ctx);
  87. // Assert result type matches
  88. assert_result_equal(t, tc.expected_result, result.type);
  89. // Assert matched text if success or need_more_input
  90. if (result.success() || result.need_more_input()) {
  91. std::string matched = tc.input.substr(result.start, result.end - result.start);
  92. t.assert_equal(tc.expected_text, matched);
  93. }
  94. });
  95. }
  96. });
  97. t.test("unicode range U+1F600-U+1F64F (emoticons)", [](testing &t) {
  98. std::vector<test_case> test_cases {
  99. // Within range - Emoticons (all 4-byte UTF-8)
  100. {std::string("\xF0\x9F\x98\x80"), std::string("\xF0\x9F\x98\x80"), COMMON_PEG_PARSE_RESULT_SUCCESS}, // U+1F600
  101. {std::string("\xF0\x9F\x98\x81"), std::string("\xF0\x9F\x98\x81"), COMMON_PEG_PARSE_RESULT_SUCCESS}, // U+1F601
  102. {std::string("\xF0\x9F\x99\x8F"), std::string("\xF0\x9F\x99\x8F"), COMMON_PEG_PARSE_RESULT_SUCCESS}, // U+1F64F
  103. // Outside range
  104. {std::string("\xF0\x9F\x97\xBF"), "", COMMON_PEG_PARSE_RESULT_FAIL}, // U+1F5FF (before range)
  105. {std::string("\xF0\x9F\x99\x90"), "", COMMON_PEG_PARSE_RESULT_FAIL}, // U+1F650 (after range)
  106. {std::string("\xF0\x9F\x9A\x80"), "", COMMON_PEG_PARSE_RESULT_FAIL}, // U+1F680 (outside range)
  107. // Incomplete sequences
  108. {std::string("\xF0\x9F\x98"), "", COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT}, // Incomplete emoji
  109. {std::string("\xF0\x9F"), "", COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT}, // Very incomplete
  110. };
  111. auto parser = build_peg_parser([](common_peg_parser_builder& p) {
  112. return p.sequence({p.chars(R"([\U0001F600-\U0001F64F])"), p.end()});
  113. });
  114. for (size_t i = 0; i < test_cases.size(); i++) {
  115. const auto & tc = test_cases[i];
  116. std::string test_name = "case " + std::to_string(i) + ": " + hex_dump(tc.input);
  117. t.test(test_name, [&](testing &t) {
  118. common_peg_parse_context ctx(tc.input, true);
  119. auto result = parser.parse(ctx);
  120. // Assert result type matches
  121. assert_result_equal(t, tc.expected_result, result.type);
  122. // Assert matched text if success or need_more_input
  123. if (result.success() || result.need_more_input()) {
  124. std::string matched = tc.input.substr(result.start, result.end - result.start);
  125. t.assert_equal(tc.expected_text, matched);
  126. }
  127. });
  128. }
  129. });
  130. t.test("mixed unicode ranges", [](testing &t) {
  131. std::vector<test_case> test_cases {
  132. // Match CJK
  133. {std::string("\xE4\xB8\x80"), std::string("\xE4\xB8\x80"), COMMON_PEG_PARSE_RESULT_SUCCESS}, // U+4E00
  134. {std::string("\xE4\xBD\xA0"), std::string("\xE4\xBD\xA0"), COMMON_PEG_PARSE_RESULT_SUCCESS}, // U+4F60
  135. // Match emoticons
  136. {std::string("\xF0\x9F\x98\x80"), std::string("\xF0\x9F\x98\x80"), COMMON_PEG_PARSE_RESULT_SUCCESS}, // U+1F600
  137. // Match ASCII digits
  138. {"5", "5", COMMON_PEG_PARSE_RESULT_SUCCESS},
  139. // Don't match outside any range
  140. {"a", "", COMMON_PEG_PARSE_RESULT_FAIL},
  141. {std::string("\xF0\x9F\x9A\x80"), "", COMMON_PEG_PARSE_RESULT_FAIL}, // U+1F680
  142. // Incomplete
  143. {std::string("\xE4\xB8"), "", COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT},
  144. {std::string("\xF0\x9F\x98"), "", COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT},
  145. };
  146. auto parser = build_peg_parser([](common_peg_parser_builder& p) {
  147. return p.sequence({p.chars(R"([\u4E00-\u9FFF\U0001F600-\U0001F64F0-9])"), p.end()});
  148. });
  149. for (size_t i = 0; i < test_cases.size(); i++) {
  150. const auto & tc = test_cases[i];
  151. std::string test_name = "case " + std::to_string(i) + ": " + hex_dump(tc.input);
  152. t.test(test_name, [&](testing &t) {
  153. common_peg_parse_context ctx(tc.input, true);
  154. auto result = parser.parse(ctx);
  155. // Assert result type matches
  156. assert_result_equal(t, tc.expected_result, result.type);
  157. // Assert matched text if success or need_more_input
  158. if (result.success() || result.need_more_input()) {
  159. std::string matched = tc.input.substr(result.start, result.end - result.start);
  160. t.assert_equal(tc.expected_text, matched);
  161. }
  162. });
  163. }
  164. });
  165. });
  166. t.test("until parser", [](testing &t) {
  167. t.test("ASCII delimiter with Unicode content", [](testing &t) {
  168. std::vector<test_case> test_cases {
  169. // CJK characters before delimiter
  170. {std::string("\xE4\xBD\xA0\xE5\xA5\xBD</tag>"), std::string("\xE4\xBD\xA0\xE5\xA5\xBD"), COMMON_PEG_PARSE_RESULT_SUCCESS},
  171. // Emoji before delimiter
  172. {std::string("\xF0\x9F\x98\x80</tag>"), std::string("\xF0\x9F\x98\x80"), COMMON_PEG_PARSE_RESULT_SUCCESS},
  173. // Mixed content
  174. {std::string("Hello \xE4\xB8\x96\xE7\x95\x8C!</tag>"), std::string("Hello \xE4\xB8\x96\xE7\x95\x8C!"), COMMON_PEG_PARSE_RESULT_SUCCESS},
  175. };
  176. auto parser = build_peg_parser([](common_peg_parser_builder& p) {
  177. return p.until("</tag>");
  178. });
  179. for (size_t i = 0; i < test_cases.size(); i++) {
  180. const auto & tc = test_cases[i];
  181. std::string test_name = "case " + std::to_string(i) + ": " + hex_dump(tc.input);
  182. t.test(test_name, [&](testing &t) {
  183. common_peg_parse_context ctx(tc.input, false);
  184. auto result = parser.parse(ctx);
  185. assert_result_equal(t, tc.expected_result, result.type);
  186. if (result.success()) {
  187. std::string matched = tc.input.substr(result.start, result.end - result.start);
  188. t.assert_equal(tc.expected_text, matched);
  189. }
  190. });
  191. }
  192. });
  193. t.test("incomplete UTF-8 at end", [](testing &t) {
  194. std::vector<test_case> test_cases {
  195. // Incomplete emoji at end, no delimiter
  196. {std::string("content\xF0\x9F\x98"), std::string("content"), COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT},
  197. // Incomplete CJK at end, no delimiter
  198. {std::string("hello\xE4\xB8"), std::string("hello"), COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT},
  199. // Complete content, no delimiter (should consume all valid UTF-8)
  200. {std::string("\xE4\xBD\xA0\xE5\xA5\xBD"), std::string("\xE4\xBD\xA0\xE5\xA5\xBD"), COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT},
  201. };
  202. auto parser = build_peg_parser([](common_peg_parser_builder& p) {
  203. return p.until("</tag>");
  204. });
  205. for (size_t i = 0; i < test_cases.size(); i++) {
  206. const auto & tc = test_cases[i];
  207. std::string test_name = "case " + std::to_string(i) + ": " + hex_dump(tc.input);
  208. t.test(test_name, [&](testing &t) {
  209. common_peg_parse_context ctx(tc.input, true);
  210. auto result = parser.parse(ctx);
  211. assert_result_equal(t, tc.expected_result, result.type);
  212. if (result.success() || result.need_more_input()) {
  213. std::string matched = tc.input.substr(result.start, result.end - result.start);
  214. t.assert_equal(tc.expected_text, matched);
  215. }
  216. });
  217. }
  218. });
  219. t.test("malformed UTF-8", [](testing &t) {
  220. std::vector<test_case> test_cases {
  221. // Invalid UTF-8 bytes
  222. {std::string("Hello\xFF\xFE"), "", COMMON_PEG_PARSE_RESULT_FAIL},
  223. // Continuation byte without lead byte
  224. {std::string("Hello\x80World"), "", COMMON_PEG_PARSE_RESULT_FAIL},
  225. // Invalid continuation byte
  226. {std::string("\xC3\x28"), "", COMMON_PEG_PARSE_RESULT_FAIL},
  227. };
  228. auto parser = build_peg_parser([](common_peg_parser_builder& p) {
  229. return p.until("</tag>");
  230. });
  231. for (size_t i = 0; i < test_cases.size(); i++) {
  232. const auto & tc = test_cases[i];
  233. std::string test_name = "case " + std::to_string(i) + ": " + hex_dump(tc.input);
  234. t.test(test_name, [&](testing &t) {
  235. common_peg_parse_context ctx(tc.input, false);
  236. auto result = parser.parse(ctx);
  237. assert_result_equal(t, tc.expected_result, result.type);
  238. });
  239. }
  240. });
  241. });
  242. t.test("json_string parser", [](testing &t) {
  243. t.test("valid UTF-8 characters", [](testing &t) {
  244. std::vector<test_case> test_cases {
  245. // ASCII only
  246. {"Hello World\"", "Hello World", COMMON_PEG_PARSE_RESULT_SUCCESS},
  247. // 2-byte UTF-8 (accented characters)
  248. {std::string("Caf\xC3\xA9\""), std::string("Caf\xC3\xA9"), COMMON_PEG_PARSE_RESULT_SUCCESS},
  249. // 3-byte UTF-8 (CJK)
  250. {std::string("\xE4\xBD\xA0\xE5\xA5\xBD\""), std::string("\xE4\xBD\xA0\xE5\xA5\xBD"), COMMON_PEG_PARSE_RESULT_SUCCESS},
  251. // 4-byte UTF-8 (emoji)
  252. {std::string("\xF0\x9F\x98\x80\""), std::string("\xF0\x9F\x98\x80"), COMMON_PEG_PARSE_RESULT_SUCCESS},
  253. // Mixed content
  254. {std::string("Hello \xE4\xB8\x96\xE7\x95\x8C!\""), std::string("Hello \xE4\xB8\x96\xE7\x95\x8C!"), COMMON_PEG_PARSE_RESULT_SUCCESS},
  255. };
  256. for (size_t i = 0; i < test_cases.size(); i++) {
  257. const auto & tc = test_cases[i];
  258. std::string test_name = "case " + std::to_string(i) + ": " + hex_dump(tc.input);
  259. t.test(test_name, [&](testing &t) {
  260. auto parser = build_peg_parser([](common_peg_parser_builder& p) {
  261. return p.sequence({p.json_string_content(), p.literal("\"")});
  262. });
  263. common_peg_parse_context ctx(tc.input, false);
  264. auto result = parser.parse(ctx);
  265. assert_result_equal(t, tc.expected_result, result.type);
  266. if (result.success()) {
  267. std::string matched = tc.input.substr(result.start, result.end - result.start - 1); // -1 to exclude closing quote
  268. t.assert_equal(tc.expected_text, matched);
  269. }
  270. });
  271. }
  272. });
  273. t.test("incomplete UTF-8", [](testing &t) {
  274. std::vector<test_case> test_cases {
  275. // Incomplete 2-byte sequence
  276. {std::string("Caf\xC3"), std::string("Caf"), COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT},
  277. // Incomplete 3-byte sequence
  278. {std::string("Hello\xE4\xB8"), std::string("Hello"), COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT},
  279. // Incomplete 4-byte sequence
  280. {std::string("Text\xF0\x9F\x98"), std::string("Text"), COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT},
  281. // Incomplete at very start
  282. {std::string("\xE4\xBD"), std::string(""), COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT},
  283. };
  284. for (size_t i = 0; i < test_cases.size(); i++) {
  285. const auto & tc = test_cases[i];
  286. std::string test_name = "case " + std::to_string(i) + ": " + hex_dump(tc.input);
  287. t.test(test_name, [&](testing &t) {
  288. auto parser = build_peg_parser([](common_peg_parser_builder& p) {
  289. return p.json_string_content();
  290. });
  291. common_peg_parse_context ctx(tc.input, true);
  292. auto result = parser.parse(ctx);
  293. assert_result_equal(t, tc.expected_result, result.type);
  294. if (result.need_more_input()) {
  295. std::string matched = tc.input.substr(result.start, result.end - result.start);
  296. t.assert_equal(tc.expected_text, matched);
  297. }
  298. });
  299. }
  300. });
  301. t.test("malformed UTF-8", [](testing &t) {
  302. std::vector<test_case> test_cases {
  303. // Invalid UTF-8 bytes
  304. {std::string("Hello\xFF\xFE"), "", COMMON_PEG_PARSE_RESULT_FAIL},
  305. // Continuation byte without lead byte
  306. {std::string("Hello\x80World"), "", COMMON_PEG_PARSE_RESULT_FAIL},
  307. // Invalid continuation byte
  308. {std::string("\xC3\x28"), "", COMMON_PEG_PARSE_RESULT_FAIL},
  309. // Overlong encoding (security issue)
  310. {std::string("\xC0\x80"), "", COMMON_PEG_PARSE_RESULT_FAIL},
  311. };
  312. for (size_t i = 0; i < test_cases.size(); i++) {
  313. const auto & tc = test_cases[i];
  314. std::string test_name = "case " + std::to_string(i) + ": " + hex_dump(tc.input);
  315. t.test(test_name, [&](testing &t) {
  316. auto parser = build_peg_parser([](common_peg_parser_builder& p) {
  317. return p.json_string_content();
  318. });
  319. common_peg_parse_context ctx(tc.input, false);
  320. auto result = parser.parse(ctx);
  321. assert_result_equal(t, tc.expected_result, result.type);
  322. });
  323. }
  324. });
  325. t.test("escape sequences with UTF-8", [](testing &t) {
  326. std::vector<test_case> test_cases {
  327. // Unicode escape sequence
  328. {"Hello\\u0041\"", "Hello\\u0041", COMMON_PEG_PARSE_RESULT_SUCCESS},
  329. // Mix of UTF-8 and escape sequences
  330. {std::string("\xE4\xBD\xA0\\n\xE5\xA5\xBD\""), std::string("\xE4\xBD\xA0\\n\xE5\xA5\xBD"), COMMON_PEG_PARSE_RESULT_SUCCESS},
  331. // Escaped quote in UTF-8 string
  332. {std::string("\xE4\xBD\xA0\\\"\xE5\xA5\xBD\""), std::string("\xE4\xBD\xA0\\\"\xE5\xA5\xBD"), COMMON_PEG_PARSE_RESULT_SUCCESS},
  333. };
  334. for (size_t i = 0; i < test_cases.size(); i++) {
  335. const auto & tc = test_cases[i];
  336. std::string test_name = "case " + std::to_string(i) + ": " + hex_dump(tc.input);
  337. t.test(test_name, [&](testing &t) {
  338. auto parser = build_peg_parser([](common_peg_parser_builder& p) {
  339. return p.sequence({p.json_string_content(), p.literal("\"")});
  340. });
  341. common_peg_parse_context ctx(tc.input, false);
  342. auto result = parser.parse(ctx);
  343. assert_result_equal(t, tc.expected_result, result.type);
  344. if (result.success()) {
  345. std::string matched = tc.input.substr(result.start, result.end - result.start - 1); // -1 to exclude closing quote
  346. t.assert_equal(tc.expected_text, matched);
  347. }
  348. });
  349. }
  350. });
  351. });
  352. }